/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2019
 *	Netflix Inc.
 *	All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
#include <sys/sockbuf_tls.h>
#endif
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#define TCPSTATES		/* for logging */
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
#ifndef ETHERNET_SEGMENT_SIZE
#define ETHERNET_SEGMENT_SIZE 1500
#endif
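
/*
 * Note on units (editorial): rates throughout this file are in bytes
 * per second, so 125000 corresponds to 1Mbps on the wire. The per-rate
 * "time_between" computed in rt_setup_new_rs() is the gap, in usec, the
 * hardware should insert between ETHERNET_SEGMENT_SIZE-byte sends; for
 * example at 1Mbps:
 *
 *	time_between = (1500 * 1000000) / 125000 = 12000 usec
 */
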
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

#define COMMON_RATE 180500
static const uint64_t desired_rates[] = {
	62500,		/* 500Kbps */
	180500,		/* 1.44Mbps */
	375000,		/* 3Mbps */
	500000,		/* 4Mbps */
	625000,		/* 5Mbps */
	750000,		/* 6Mbps */
	1000000,	/* 8Mbps */
	1250000,	/* 10Mbps */
	2500000,	/* 20Mbps */
	3750000,	/* 30Mbps */
	5000000,	/* 40Mbps */
	6250000,	/* 50Mbps */
	12500000,	/* 100Mbps */
	25000000,	/* 200Mbps */
	50000000,	/* 400Mbps */
	100000000,	/* 800Mbps */
	12500,		/* 100kbps */
	25000,		/* 200kbps */
	875000,		/* 7Mbps */
	1125000,	/* 9Mbps */
	1875000,	/* 15Mbps */
	3125000,	/* 25Mbps */
	8125000,	/* 65Mbps */
	10000000,	/* 80Mbps */
	18750000,	/* 150Mbps */
	20000000,	/* 160Mbps */
	37500000,	/* 300Mbps */
	62500000,	/* 500Mbps */
	78125000,	/* 625Mbps */
	125000000,	/* 1Gbps */
};
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number of entries that are in order
				 * at the beginning of the table; above
				 * this count a sort is required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The point in our table where the
				 * second ordered group begins (as an
				 * index, i.e. one less than the count).
				 */
#define ALL_HARDWARE_RATES 1004 /*
				 * 1Meg - 1Gig in 1 Meg steps
				 * plus 100k, 200k and 500k and
				 * 10Gig
				 */

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000

static struct head_tcp_rate_set int_rs;
static struct mtx rs_mtx;
uint32_t rs_number_alive;
uint32_t rs_number_dead;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departed from ratelimiting");
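
/*
 * Sketch of the resulting sysctl tree (illustrative; "mce0" is just an
 * example interface name, not a requirement):
 *
 *	net.inet.tcp.rl.alive
 *	net.inet.tcp.rl.dead
 *	net.inet.tcp.rl.mce0.{disable,minseg,flow_limit,highest,lowest,
 *	    flags,numrates,flows_using}
 *	net.inet.tcp.rl.mce0.rate.<N>.{flags,pacetime,rate}
 *	    (the last only with DETAILED_RATELIMIT_SYSCTL)
 */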
static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
	/*
	 * Add sysctl entries for this interface.
	 */
	if (rs->rs_flags & RS_INTF_NO_SUP) {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RD,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	} else {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RW,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	}
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "minseg", CTLFLAG_RW,
	    &rs->rs_min_seg, 0,
	    "What is the minimum we need to send on this interface?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flow_limit", CTLFLAG_RW,
	    &rs->rs_flow_limit, 0,
	    "What is the limit for number of flows (0=unlimited)?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "highest", CTLFLAG_RD,
	    &rs->rs_highest_valid, 0,
	    "Highest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "lowest", CTLFLAG_RD,
	    &rs->rs_lowest_valid, 0,
	    "Lowest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flags", CTLFLAG_RD,
	    &rs->rs_flags, 0,
	    "What flags are on the entry?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "numrates", CTLFLAG_RD,
	    &rs->rs_rate_cnt, 0,
	    "How many rates are there?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flows_using", CTLFLAG_RD,
	    &rs->rs_flows_using, 0,
	    "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
		/* Lets display the rates */
		int i;
		struct sysctl_oid *rl_rates;
		struct sysctl_oid *rl_rate_num;
		char rate_num[16];

		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO,
		    "rate",
		    CTLFLAG_RW, 0,
		    "Ratelist");
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			sprintf(rate_num, "%d", i);
			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rates),
			    OID_AUTO,
			    rate_num,
			    CTLFLAG_RW, 0,
			    "Individual Rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "flags", CTLFLAG_RD,
			    &rs->rs_rlt[i].flags, 0,
			    "Flags on this rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "pacetime", CTLFLAG_RD,
			    &rs->rs_rlt[i].time_between, 0,
			    "Time hardware inserts between 1500 byte sends");
			SYSCTL_ADD_U64(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "rate", CTLFLAG_RD,
			    &rs->rs_rlt[i].rate, 0,
			    "Rate in bytes per second");
		}
	}
#endif
}
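
/*
 * Lifecycle note (editorial): a rate set is reference-counted through
 * rs_flows_using. Interface departure marks the set RS_IS_DEAD and
 * schedules rs_destroy() through epoch_call(); because lookups only use
 * atomics on rs_flows_using, rs_destroy() re-checks the count under
 * rs_mtx and skips the free if a flow slipped in, leaving the last
 * tcp_rel_pacing_rate() call to schedule the funeral again.
 */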
static void
rs_destroy(epoch_context_t ctx)
{
	struct tcp_rate_set *rs;

	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
	mtx_lock(&rs_mtx);
	rs->rs_flags &= ~RS_FUNERAL_SCHD;
	if (rs->rs_flows_using == 0) {
		/*
		 * In theory it's possible (but unlikely)
		 * that while the delete was occurring
		 * and we were applying the DEAD flag
		 * someone slipped in and found the
		 * interface in a lookup. While we
		 * decided rs_flows_using was 0 and were
		 * scheduling the epoch_call, the other
		 * thread incremented rs_flows_using. This
		 * is because users have a pointer and
		 * we only use rs_flows_using in an
		 * atomic fashion, i.e. the other entities
		 * are not protected. To assure this did
		 * not occur, we check rs_flows_using here
		 * before deleting.
		 */
		sysctl_ctx_free(&rs->sysctl_ctx);
		free(rs->rs_rlt, M_TCPPACE);
		free(rs, M_TCPPACE);
		rs_number_dead--;
	}
	mtx_unlock(&rs_mtx);
}

#ifdef INET
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
	int error;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = cfg_rate,
		.rate_limit.flags = M_NOWAIT,
	};

	if (ifp->if_snd_tag_alloc == NULL) {
		error = EOPNOTSUPP;
	} else {
		error = ifp->if_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
		if (error == 0) {
			if_ref((*tag)->ifp);
			counter_u64_add(rate_limit_set_ok, 1);
			counter_u64_add(rate_limit_active, 1);
		} else
			counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	}
	return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
	/*
	 * The internal table is "special", it
	 * is two separate ordered tables that
	 * must be merged. We get here when the
	 * adapter specifies a number of rates that
	 * covers both ranges in the table in some
	 * form.
	 */
	int i, at_low, at_high;
	uint8_t low_disabled = 0, high_disabled = 0;

	for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
		rs->rs_rlt[i].flags = 0;
		rs->rs_rlt[i].time_between = 0;
		if ((low_disabled == 0) &&
		    (high_disabled ||
		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
			rs->rs_rlt[i].rate = rate_table_act[at_low];
			at_low++;
			if (at_low == RS_NEXT_ORDER_GROUP)
				low_disabled = 1;
		} else if (high_disabled == 0) {
			rs->rs_rlt[i].rate = rate_table_act[at_high];
			at_high++;
			if (at_high == MAX_HDWR_RATES)
				high_disabled = 1;
		}
	}
}
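
/*
 * Worked example of the merge (illustrative): desired_rates[] holds two
 * ordered groups, the first starting at 62500 (500Kbps) and the second,
 * at index RS_NEXT_ORDER_GROUP, starting at 12500 (100kbps). The merge
 * therefore emits 12500 and 25000 from the second group before 62500
 * from the first, producing a single fully sorted table.
 */
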
static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
	struct tcp_rate_set *rs;
	const uint64_t *rate_table_act;
	uint64_t lentim, res;
	size_t sz;
	uint32_t hash_type;
	int i;
	struct if_ratelimit_query_results rl;
	struct sysctl_oid *rl_sysctl_root;
	/*
	 * We expect to enter with the
	 * mutex locked.
	 */

	if (ifp->if_ratelimit_query == NULL) {
		/*
		 * We can do nothing if we cannot
		 * get a query back from the driver.
		 */
		return (NULL);
	}
	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
	if (rs == NULL) {
		if (error)
			*error = ENOMEM;
		return (NULL);
	}
	rl.flags = RT_NOSUPPORT;
	ifp->if_ratelimit_query(ifp, &rl);
	if (rl.flags & RT_IS_UNUSABLE) {
		/*
		 * The interface does not really support
		 * rate limiting.
		 */
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_INTF_NO_SUP;
		rs->rs_disable = 1;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW, 0,
		    "");
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		/* Unlock to allow the sysctl stuff to allocate */
		mtx_unlock(&rs_mtx);
		rl_add_syctl_entries(rl_sysctl_root, rs);
		/* re-lock for our caller */
		mtx_lock(&rs_mtx);
		return (rs);
	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_IS_DEFF;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW, 0,
		    "");
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		/* Unlock to allow the sysctl stuff to allocate */
		mtx_unlock(&rs_mtx);
		rl_add_syctl_entries(rl_sysctl_root, rs);
		/* re-lock for our caller */
		mtx_lock(&rs_mtx);
		return (rs);
	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
		/* Mellanox most likely */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_highest_valid = 0;
		rs->rs_flow_limit = rl.max_flows;
		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
		rs->rs_disable = 0;
		rate_table_act = rl.rate_table;
	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
		/* Chelsio */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_disable = 0;
		rs->rs_flow_limit = rl.max_flows;
		rate_table_act = desired_rates;
		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
			/*
			 * Our desired table is not big
			 * enough, do what we can.
			 */
			rs->rs_rate_cnt = MAX_HDWR_RATES;
		}
		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
			rs->rs_flags = RS_IS_INTF;
		else
			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
	} else {
		printf("Interface:%s unit:%d not one known to have rate-limits\n",
		    ifp->if_dname,
		    ifp->if_dunit);
		free(rs, M_TCPPACE);
		return (NULL);
	}
	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
	if (rs->rs_rlt == NULL) {
		if (error)
			*error = ENOMEM;
bail:
		free(rs, M_TCPPACE);
		return (NULL);
	}
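	/*
	 * Editorial layout note: when the adapter can do ALL_HARDWARE_RATES,
	 * indices 0-2 below hold 100k, 200k and 500k, indices 3-1002 hold
	 * 1Mbps through 1000Mbps in 1Mbps (125000 bytes/sec) steps, and the
	 * last index holds the 10Gbps rate.
	 */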
	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
		/*
		 * The interface supports all
		 * the rates we could possibly want.
		 */
		uint64_t rat;

		rs->rs_rlt[0].rate = 12500;	/* 100k */
		rs->rs_rlt[1].rate = 25000;	/* 200k */
		rs->rs_rlt[2].rate = 62500;	/* 500k */
		/*
		 * Note 125000 bytes/sec == 1Mbps;
		 * populate 1Meg - 1000Meg.
		 */
		for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
			rs->rs_rlt[i].rate = rat;
			rat += 125000;
		}
		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
	} else if (rs->rs_flags & RS_INT_TBL) {
		/* We populate this in a special way */
		populate_canned_table(rs, rate_table_act);
	} else {
		/*
		 * Just copy in the rates from
		 * the table, it is in order.
		 */
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			rs->rs_rlt[i].rate = rate_table_act[i];
			rs->rs_rlt[i].time_between = 0;
			rs->rs_rlt[i].flags = 0;
		}
	}
	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
		/*
		 * We go backwards through the list so that if we can't get
		 * a rate and fail to init one, we have at least a chance of
		 * getting the highest one.
		 */
		rs->rs_rlt[i].ptbl = rs;
		rs->rs_rlt[i].tag = NULL;
		/*
		 * Calculate the time between.
		 */
		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
		res = lentim / rs->rs_rlt[i].rate;
		if (res > 0)
			rs->rs_rlt[i].time_between = res;
		else
			rs->rs_rlt[i].time_between = 1;
		if (rs->rs_flags & RS_NO_PRE) {
			rs->rs_rlt[i].flags = HDWRPACE_INITED;
			rs->rs_lowest_valid = i;
		} else {
			int err;
#ifdef RSS
			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
			hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
			err = rl_attach_txrtlmt(ifp,
			    hash_type,
			    (i + 1),
			    rs->rs_rlt[i].rate,
			    &rs->rs_rlt[i].tag);
			if (err) {
				if (i == (rs->rs_rate_cnt - 1)) {
					/*
					 * Huh - first rate and we can't get
					 * it?
					 */
					free(rs->rs_rlt, M_TCPPACE);
					if (error)
						*error = err;
					goto bail;
				} else {
					if (error)
						*error = err;
				}
				break;
			} else {
				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
				rs->rs_lowest_valid = i;
			}
		}
	}
	/* Did we get at least 1 rate? */
	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
	else {
		free(rs->rs_rlt, M_TCPPACE);
		goto bail;
	}
	rs_number_alive++;
	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
	sysctl_ctx_init(&rs->sysctl_ctx);
	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
	    OID_AUTO,
	    rs->rs_ifp->if_xname,
	    CTLFLAG_RW, 0,
	    "");
	/* Unlock to allow the sysctl stuff to allocate */
	mtx_unlock(&rs_mtx);
	rl_add_syctl_entries(rl_sysctl_root, rs);
	/* re-lock for our caller */
	mtx_lock(&rs_mtx);
	return (rs);
}
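
/*
 * Index arithmetic example for the fast lookup below (illustrative):
 * indices 0-2 hold the sub-1Meg rates, so a whole-megabit request maps
 * to index (Mbps + 2). A request of 5Mbps (625000 bytes/sec) gives
 * ind_calc = 5 and lands on rs_rlt[7], whose rate is 5 * 125000 =
 * 625000 bytes/sec. Note that mbits_per_sec below, despite its name,
 * holds plain bits per second.
 */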
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags)
{
	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
	uint64_t mbits_per_sec, ind_calc;
	int i;

	mbits_per_sec = (bytes_per_sec * 8);
	if (flags & RS_PACING_LT) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			/*
			 * Smaller than 1Meg, only
			 * 3 entries can match it.
			 */
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
					arte = &rs->rs_rlt[i];
				}
			}
			goto done;
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/*
			 * Larger than 1G (the majority of
			 * our table).
			 */
			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			else
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			goto done;
		}
		/*
		 * If we reach here it's in our table (between 1Meg - 1000Meg);
		 * just take the rounded-up mbits per second, and from this we
		 * can calculate the index in the table.
		 */
		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
			ind_calc++;
		/* our table is offset by 3, we add 2 */
		ind_calc += 2;
		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
			/* This should not happen */
			ind_calc = ALL_HARDWARE_RATES-1;
		}
		if ((ind_calc >= rs->rs_lowest_valid) &&
		    (ind_calc <= rs->rs_highest_valid))
			rte = &rs->rs_rlt[ind_calc];
	} else if (flags & RS_PACING_EXACT_MATCH) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec == rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				}
			}
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/* > 1Gbps only one rate */
			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
				/* It's the top (10G) rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* OK, it must be an exact meg (it's between 1Meg and 1Gig) */
			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
				/* it's an exact Mbps */
				ind_calc += 2;
				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
					/* This should not happen */
					ind_calc = ALL_HARDWARE_RATES-1;
				}
				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
					rte = &rs->rs_rlt[ind_calc];
			}
		}
	} else {
		/* we want greater than the requested rate */
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
			for (i = 2; i >= rs->rs_lowest_valid; i--) {
				if (bytes_per_sec < rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if ((flags & RS_PACING_GEQ) &&
				    (bytes_per_sec == rs->rs_rlt[i].rate)) {
					rte = &rs->rs_rlt[i];
					break;
				} else {
					arte = &rs->rs_rlt[i]; /* new alternate */
				}
			}
		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* Our top rate is larger than the request */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* It matches our top rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
				/* The top rate is an alternative */
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* It's in our range 1Meg - 1Gig */
			if (flags & RS_PACING_GEQ) {
				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
					/* an exact Mbps; table offset by 3, we add 2 */
					ind_calc += 2;
					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
						/* This should not happen */
						ind_calc = (ALL_HARDWARE_RATES-1);
					}
					rte = &rs->rs_rlt[ind_calc];
					goto done;
				}
			}
			ind_calc = (mbits_per_sec +
			    (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
			ind_calc += 2;
			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
				/* This should not happen */
				ind_calc = ALL_HARDWARE_RATES-1;
			}
			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
				rte = &rs->rs_rlt[ind_calc];
		}
	}
done:
	if ((rte == NULL) &&
	    (arte != NULL) &&
	    (flags & RS_PACING_SUB_OK)) {
		/* We can use the substitute */
		rte = arte;
	}
	return (rte);
}
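
/*
 * Illustrative example of the flag handling below: with valid rates of
 * {1, 3, 5}Mbps and a 4Mbps request, RS_PACING_EXACT_MATCH returns NULL,
 * while the greater-than path (RS_PACING_GT/RS_PACING_GEQ) walks
 * backwards through the table and returns the 5Mbps entry.
 */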
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
{
	/**
	 * Hunt the rate table with the restrictions in flags and find a
	 * suitable rate if possible.
	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
	 * RS_PACING_GT - must be greater than.
	 * RS_PACING_GEQ - must be greater than or equal.
	 * RS_PACING_LT - must be less than.
	 * RS_PACING_SUB_OK - If we don't meet criteria a
	 *                    substitute is ok.
	 */
	int i, matched;
	struct tcp_hwrate_limit_table *rte = NULL;

	if ((rs->rs_flags & RS_INT_TBL) &&
	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
		/*
		 * Here we don't want to paw through
		 * a big table, we have everything
		 * from 1Meg - 1000Meg in 1Meg increments.
		 * Use an alternate method to "lookup".
		 */
		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
	}
	if ((flags & RS_PACING_LT) ||
	    (flags & RS_PACING_EXACT_MATCH)) {
		/*
		 * For exact and less than we go forward through the table.
		 * This way when we find one larger we stop (exact was a
		 * toss up).
		 */
		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
			if ((flags & RS_PACING_EXACT_MATCH) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				break;
			} else if ((flags & RS_PACING_LT) &&
			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				break;
			}
			if (bytes_per_sec > rs->rs_rlt[i].rate)
				break;
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_LT) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the lowest) */
			rte = &rs->rs_rlt[rs->rs_lowest_valid];
		}
	} else {
		/*
		 * Here we go backward through the table so that we can find
		 * the one greater in theory faster (but it's probably a
		 * wash).
		 */
		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
			if (rs->rs_rlt[i].rate > bytes_per_sec) {
				/* A possible candidate */
				rte = &rs->rs_rlt[i];
			}
			if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				/* An exact match and we want equal */
				matched = 1;
				rte = &rs->rs_rlt[i];
				break;
			} else if (rte) {
				/*
				 * Found one that is larger, but don't
				 * stop; there may be a closer match.
				 */
				matched = 1;
			}
			if (rs->rs_rlt[i].rate < bytes_per_sec) {
				/*
				 * We found a table entry that is smaller;
				 * stop, there will be none greater or equal.
				 */
				break;
			}
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the highest) */
			rte = &rs->rs_rlt[rs->rs_highest_valid];
		}
	}
	return (rte);
}

static struct ifnet *
rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
{
	struct ifnet *tifp;
	struct m_snd_tag *tag;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = 1,
		.rate_limit.max_rate = COMMON_RATE,
		.rate_limit.flags = M_NOWAIT,
	};
	int err;
#ifdef RSS
	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
#else
	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
#endif
	tag = NULL;
	if (ifp->if_snd_tag_alloc == NULL) {
		if (error)
			*error = ENODEV;
		return (NULL);
	}
	err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
	if (err) {
		/* Failed to setup a tag? */
		if (error)
			*error = err;
		return (NULL);
	}
	tifp = tag->ifp;
	tifp->if_snd_tag_free(tag);
	return (tifp);
}
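
/*
 * Editorial note: rt_find_real_interface() above exists for "indirect"
 * rate sets (RS_IS_DEFF), where the queried ifp defers transmission to
 * some underlying device. It briefly allocates a send tag at COMMON_RATE
 * solely to learn which real egress interface the flow maps to, and then
 * frees that tag again.
 */
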
static const struct tcp_hwrate_limit_table *
rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
    uint32_t flags, int *error)
{
	/* First let's find the interface if it exists */
	const struct tcp_hwrate_limit_table *rte;
	struct tcp_rate_set *rs;
	struct epoch_tracker et;
	int err;

	epoch_enter_preempt(net_epoch_preempt, &et);
use_real_interface:
	CK_LIST_FOREACH(rs, &int_rs, next) {
		/*
		 * Note we don't look with the lock since we either see a
		 * new entry or will get one when we try to add it.
		 */
		if (rs->rs_flags & RS_IS_DEAD) {
			/* The dead are not looked at */
			continue;
		}
		if ((rs->rs_ifp == ifp) &&
		    (rs->rs_if_dunit == ifp->if_dunit)) {
			/* Ok we found it */
			break;
		}
	}
	if ((rs == NULL) ||
	    (rs->rs_flags & RS_INTF_NO_SUP) ||
	    (rs->rs_flags & RS_IS_DEAD)) {
		/*
		 * This means we got a packet *before*
		 * the IF-UP was processed below, <or>
		 * while or after we already received an interface
		 * departed event. In either case we really don't
		 * want to do anything with pacing, in
		 * the departing case the packet is not
		 * going to go very far. The new case
		 * might be arguable, but it's impossible
		 * to tell from the departing case.
		 */
		if (error)
			*error = ENODEV;
		epoch_exit_preempt(net_epoch_preempt, &et);
		return (NULL);
	}

	if (rs->rs_disable != 0) {
		if (error)
			*error = ENOSPC;
		epoch_exit_preempt(net_epoch_preempt, &et);
		return (NULL);
	}
	if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, error);
		if (tifp == NULL) {
			if (rs->rs_disable && error)
				*error = ENOTSUP;
			epoch_exit_preempt(net_epoch_preempt, &et);
			return (NULL);
		}
		ifp = tifp;
		goto use_real_interface;
	}
	if (rs->rs_flow_limit &&
	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
		if (error)
			*error = ENOSPC;
		epoch_exit_preempt(net_epoch_preempt, &et);
		return (NULL);
	}
	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
	if (rte) {
		err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
		    inp->inp_flowtype,
		    inp->inp_flowid,
		    rte->rate,
		    &inp->inp_snd_tag);
		if (err) {
			/* Failed to attach */
			if (error)
				*error = err;
			rte = NULL;
		}
	}
	if (rte) {
		/*
		 * We use an atomic here for accounting so we don't have to
		 * use locks when freeing.
		 */
		atomic_add_64(&rs->rs_flows_using, 1);
	}
	epoch_exit_preempt(net_epoch_preempt, &et);
	return (rte);
}
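
/*
 * The eventhandler hooks below (registered in tcp_rs_init()) drive the
 * table life cycle: a link-up event on an IFCAP_TXRTLMT-capable
 * interface builds its rate set, an interface departure marks the set
 * dead and detaches all tags, and shutdown tears everything down.
 */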
static void
tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
{
	int error;
	struct tcp_rate_set *rs;

	if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
	    (link_state != LINK_STATE_UP)) {
		/*
		 * We only care about an interface going up that is
		 * rate-limit capable.
		 */
		return;
	}
	mtx_lock(&rs_mtx);
	CK_LIST_FOREACH(rs, &int_rs, next) {
		if ((rs->rs_ifp == ifp) &&
		    (rs->rs_if_dunit == ifp->if_dunit)) {
			/* We already have initialized this guy */
			mtx_unlock(&rs_mtx);
			return;
		}
	}
	rt_setup_new_rs(ifp, &error);
	mtx_unlock(&rs_mtx);
}

static void
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
	struct tcp_rate_set *rs, *nrs;
	struct ifnet *tifp;
	int i;

	mtx_lock(&rs_mtx);
	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
		if ((rs->rs_ifp == ifp) &&
		    (rs->rs_if_dunit == ifp->if_dunit)) {
			CK_LIST_REMOVE(rs, next);
			rs_number_alive--;
			rs_number_dead++;
			rs->rs_flags |= RS_IS_DEAD;
			for (i = 0; i < rs->rs_rate_cnt; i++) {
				if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
					tifp = rs->rs_rlt[i].tag->ifp;
					in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
					rs->rs_rlt[i].tag = NULL;
				}
				rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
			}
			if (rs->rs_flows_using == 0) {
				/*
				 * No references left, so we can schedule the
				 * destruction after the epoch (with the caveat
				 * that rs_destroy() re-checks for late
				 * references).
				 */
				rs->rs_flags |= RS_FUNERAL_SCHD;
				epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
			}
			break;
		}
	}
	mtx_unlock(&rs_mtx);
}

static void
tcp_rl_shutdown(void *arg __unused, int howto __unused)
{
	struct tcp_rate_set *rs, *nrs;
	struct ifnet *tifp;
	int i;

	mtx_lock(&rs_mtx);
	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs_number_dead++;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				tifp = rs->rs_rlt[i].tag->ifp;
				in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0) {
			/*
			 * No references left, so we can destroy it after
			 * the epoch. If flows still hold a reference we
			 * have nothing to do here; the last release will
			 * schedule the funeral instead.
			 */
			rs->rs_flags |= RS_FUNERAL_SCHD;
			epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
		}
	}
	mtx_unlock(&rs_mtx);
}

const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error)
{
	const struct tcp_hwrate_limit_table *rte;

	if (tp->t_inpcb->inp_snd_tag == NULL) {
		/*
		 * We are setting up a rate for the first time.
		 */
		if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
			/* Not supported by the egress */
			if (error)
				*error = ENODEV;
			return (NULL);
		}
#ifdef KERN_TLS
		if (tp->t_inpcb->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) {
			/*
			 * We currently can't do both TLS and hardware
			 * pacing.
			 */
			if (error)
				*error = EINVAL;
			return (NULL);
		}
#endif
		rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
	} else {
		/*
		 * A rate is already attached; modifications must go
		 * through tcp_chg_pacing_rate() instead.
		 */
		if (error)
			*error = EINVAL;
		rte = NULL;
	}
	return (rte);
}
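
/*
 * Usage sketch (illustrative only, not a complete caller): a pacing-aware
 * TCP stack might drive this API roughly as follows.
 *
 *	const struct tcp_hwrate_limit_table *rte;
 *	int err = 0;
 *
 *	rte = tcp_set_pacing_rate(tp, ifp, 625000,
 *	    RS_PACING_GEQ | RS_PACING_SUB_OK, &err);
 *	if (rte != NULL) {
 *		(pace sends using rte->rate and rte->time_between)
 *		rte = tcp_chg_pacing_rate(rte, tp, ifp, 1250000,
 *		    RS_PACING_GEQ, &err);
 *	}
 *	if (rte != NULL)
 *		tcp_rel_pacing_rate(rte, tp);
 */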
const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error)
{
	const struct tcp_hwrate_limit_table *nrte;
	const struct tcp_rate_set *rs;
	int is_indirect = 0;
	int err;

	if ((tp->t_inpcb->inp_snd_tag == NULL) ||
	    (crte == NULL)) {
		/* Wrong interface */
		if (error)
			*error = EINVAL;
		return (NULL);
	}
	rs = crte->ptbl;
	if ((rs->rs_flags & RS_IS_DEAD) ||
	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
		/* Release the rate, and try anew */
re_rate:
		tcp_rel_pacing_rate(crte, tp);
		nrte = tcp_set_pacing_rate(tp, ifp,
		    bytes_per_sec, flags, error);
		return (nrte);
	}
	if ((rs->rs_flags & RS_IS_DEFF) == RS_IS_DEFF)
		is_indirect = 1;
	else
		is_indirect = 0;
	if ((is_indirect == 0) &&
	    ((ifp != rs->rs_ifp) ||
	     (ifp->if_dunit != rs->rs_if_dunit))) {
		/*
		 * Something changed, the user is not pointing to the same
		 * ifp? Maybe a route updated on this guy?
		 */
		goto re_rate;
	} else if (is_indirect) {
		/*
		 * For indirect we have to dig in and find the real
		 * interface.
		 */
		struct ifnet *rifp;

		rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
		if (rifp == NULL) {
			/* Can't find it? */
			goto re_rate;
		}
		if ((rifp != rs->rs_ifp) ||
		    (rifp->if_dunit != rs->rs_if_dunit)) {
			goto re_rate;
		}
	}
	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
	if (nrte == crte) {
		/* No change */
		if (error)
			*error = 0;
		return (crte);
	}
	if (nrte == NULL) {
		/* Release the old rate */
		tcp_rel_pacing_rate(crte, tp);
		return (NULL);
	}
	/* Change rates to our new entry */
	err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
	if (err) {
		if (error)
			*error = err;
		return (NULL);
	}
	if (error)
		*error = 0;
	return (nrte);
}

void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
{
	const struct tcp_rate_set *crs;
	struct tcp_rate_set *rs;
	uint64_t pre;

	crs = crte->ptbl;
	/*
	 * Now we must break the const
	 * in order to release our refcount.
	 */
	rs = __DECONST(struct tcp_rate_set *, crs);
	pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
	if (pre == 1) {
		mtx_lock(&rs_mtx);
		/*
		 * Is it dead?
		 */
		if ((rs->rs_flags & RS_IS_DEAD) &&
		    ((rs->rs_flags & RS_FUNERAL_SCHD) == 0)) {
			/*
			 * We were the last,
			 * and a funeral is not pending, so
			 * we must schedule it.
			 */
			rs->rs_flags |= RS_FUNERAL_SCHD;
			epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
		}
		mtx_unlock(&rs_mtx);
	}
	in_pcbdetach_txrtlmt(tp->t_inpcb);
}

static eventhandler_tag rl_ifnet_departs;
static eventhandler_tag rl_ifnet_arrives;
static eventhandler_tag rl_shutdown_start;

static void
tcp_rs_init(void *st __unused)
{
	CK_LIST_INIT(&int_rs);
	rs_number_alive = 0;
	rs_number_dead = 0;
	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    tcp_rl_ifnet_departure,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
	    tcp_rl_ifnet_link,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    tcp_rl_shutdown, NULL,
	    SHUTDOWN_PRI_FIRST);
	printf("TCP_ratelimit: Is now initialized\n");
}

SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
#endif