/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2020
 * Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#define TCPSTATES		/* for logging */
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

/*
 * The following preferred table will seem weird to
 * the casual viewer. Why do we not have any rates below
 * 1Mbps? Why do we have a rate at 1.44Mbps called common?
 * Why do the rates cluster in the 1-100Mbps range more
 * than others? Why does the table jump around at the beginning
 * and then rise more consistently?
 *
 * Let me try to answer those questions. A lot of
 * this is dependent on the hardware. We have three basic
 * supporters of rate limiting:
 *
 * Chelsio - Supporting 16 configurable rates.
 * Mlx - c4 supporting 13 fixed rates.
 * Mlx - c5 & c6 supporting 127 configurable rates.
 *
 * The c4 is why we have a common rate that is available
 * in all rate tables. This is a selected rate from the
 * c4 table and we assure it is available in all ratelimit
 * tables. This way the tcp_ratelimit code has an assured
 * rate it should always be able to get. This answers a
 * couple of the questions above.
 *
 * So what about the rest? Well, the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system. The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons:
 * a) So we can discover more b/w
 * and
 * b) So we can send a block of MSS's down and then
 *    have the software timer go off after the previous
 *    send is completely out of the hardware.
 *
 * But when we do <b> we don't want the delay after
 * the last packet sent by the hardware to be
 * excessively long (to reach our desired rate).
 *
 * So let me give an example for clarity.
 *
 * Let's assume that the tcp stack sees that 29,110,000 bps is
 * what the b/w of the path is. The stack would select the
 * rate 31Mbps. 31Mbps means that each send that is done
 * by the hardware will cause a 390 micro-second gap between
 * the packets sent at that rate. For 29,110,000 bps we
 * would need a 416 micro-second gap between each send.
 *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
 * My testing has shown that both cards are also using this
 * as their basis i.e. the full payload size of the ethernet frame.
 * The TCP stack caller needs to be aware of this and make the
 * appropriate overhead calculations be included in its choices.
 *
 * Now, continuing our example, we pick a MSS size based on the
 * delta between the two rates (416 - 390) divided into the rate
 * we really wish to send at, rounded up. That results in a
 * send of 17 mss's at once. The hardware then will
 * run out of data in a single 17MSS send in 6,630 micro-seconds.
 *
 * On the other hand the software pacer will send more data
 * in 7,072 micro-seconds. This means that we will refill
 * the hardware 52 microseconds after it would have sent
 * next if it had not run out of data. This is a win since we are
 * only sending every 7ms or so and yet all the packets are spaced on
 * the wire with 94% of what they should be, and only
 * the last packet is delayed extra to make up for the
 * difference.
 *
 * Note that the above formula has two important caveats.
 * If we are above (b/w wise) 100Mbps we double the result
 * of the MSS calculation. The second caveat is if we are 500Mbps
 * or more we just send the maximum MSS at once i.e. 43MSS. At
 * the higher b/w's even the cards have limits to what times (timer granularity)
 * they can insert between packets and start to send more than one
 * packet at a time on the wire.
 */
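
/*
 * A worked version of the 29,110,000 bps example above, as a minimal
 * standalone userland sketch (not compiled into the kernel; the helper
 * name is illustrative). It uses nanoseconds rather than the kernel's
 * microseconds only so that the integer math reproduces the comment's
 * 17-MSS result without truncation error.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

/* Nanoseconds between 1514-byte frames at "rate" bytes per second. */
static uint64_t
gap_nsecs(uint64_t rate)
{
    return (((uint64_t)1514 * 1000000000ULL) / rate);
}

int
main(void)
{
    uint64_t hw_gap = gap_nsecs(31000000 / 8);  /* 31Mbps -> ~390us */
    uint64_t sw_gap = gap_nsecs(29110000 / 8);  /* path b/w -> ~416us */
    uint64_t delta = sw_gap - hw_gap;
    /* Burst: the delta divided into the software gap, rounded up. */
    uint64_t segs = (sw_gap + delta - 1) / delta;       /* 17 MSS */

    printf("hw gap %juns sw gap %juns burst %ju MSS\n",
        (uintmax_t)hw_gap, (uintmax_t)sw_gap, (uintmax_t)segs);
    return (0);
}
#endif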

#define COMMON_RATE 180500
const uint64_t desired_rates[] = {
    122500,		/* 1Mbps - rate 1 */
    180500,		/* 1.44Mbps - rate 2 common rate */
    375000,		/* 3Mbps - rate 3 */
    625000,		/* 5Mbps - rate 4 */
    1250000,		/* 10Mbps - rate 5 */
    1875000,		/* 15Mbps - rate 6 */
    2500000,		/* 20Mbps - rate 7 */
    3125000,		/* 25Mbps - rate 8 */
    3750000,		/* 30Mbps - rate 9 */
    4375000,		/* 35Mbps - rate 10 */
    5000000,		/* 40Mbps - rate 11 */
    6250000,		/* 50Mbps - rate 12 */
    12500000,		/* 100Mbps - rate 13 */
    25000000,		/* 200Mbps - rate 14 */
    50000000,		/* 400Mbps - rate 15 */
    100000000,		/* 800Mbps - rate 16 */
    5625000,		/* 45Mbps - rate 17 */
    6875000,		/* 55Mbps - rate 18 */
    7500000,		/* 60Mbps - rate 19 */
    8125000,		/* 65Mbps - rate 20 */
    8750000,		/* 70Mbps - rate 21 */
    9375000,		/* 75Mbps - rate 22 */
    10000000,		/* 80Mbps - rate 23 */
    10625000,		/* 85Mbps - rate 24 */
    11250000,		/* 90Mbps - rate 25 */
    11875000,		/* 95Mbps - rate 26 */
    12500000,		/* 100Mbps - rate 27 */
    13750000,		/* 110Mbps - rate 28 */
    15000000,		/* 120Mbps - rate 29 */
    16250000,		/* 130Mbps - rate 30 */
    17500000,		/* 140Mbps - rate 31 */
    18750000,		/* 150Mbps - rate 32 */
    20000000,		/* 160Mbps - rate 33 */
    21250000,		/* 170Mbps - rate 34 */
    22500000,		/* 180Mbps - rate 35 */
    23750000,		/* 190Mbps - rate 36 */
    26250000,		/* 210Mbps - rate 37 */
    27500000,		/* 220Mbps - rate 38 */
    28750000,		/* 230Mbps - rate 39 */
    30000000,		/* 240Mbps - rate 40 */
    31250000,		/* 250Mbps - rate 41 */
    34375000,		/* 275Mbps - rate 42 */
    37500000,		/* 300Mbps - rate 43 */
    40625000,		/* 325Mbps - rate 44 */
    43750000,		/* 350Mbps - rate 45 */
    46875000,		/* 375Mbps - rate 46 */
    53125000,		/* 425Mbps - rate 47 */
    56250000,		/* 450Mbps - rate 48 */
    59375000,		/* 475Mbps - rate 49 */
    62500000,		/* 500Mbps - rate 50 */
    68750000,		/* 550Mbps - rate 51 */
    75000000,		/* 600Mbps - rate 52 */
    81250000,		/* 650Mbps - rate 53 */
    87500000,		/* 700Mbps - rate 54 */
    93750000,		/* 750Mbps - rate 55 */
    106250000,		/* 850Mbps - rate 56 */
    112500000,		/* 900Mbps - rate 57 */
    125000000,		/* 1Gbps - rate 58 */
    156250000,		/* 1.25Gbps - rate 59 */
    187500000,		/* 1.5Gbps - rate 60 */
    218750000,		/* 1.75Gbps - rate 61 */
    250000000,		/* 2Gbps - rate 62 */
    281250000,		/* 2.25Gbps - rate 63 */
    312500000,		/* 2.5Gbps - rate 64 */
    343750000,		/* 2.75Gbps - rate 65 */
    375000000,		/* 3Gbps - rate 66 */
    500000000,		/* 4Gbps - rate 67 */
    625000000,		/* 5Gbps - rate 68 */
    750000000,		/* 6Gbps - rate 69 */
    875000000,		/* 7Gbps - rate 70 */
    1000000000,		/* 8Gbps - rate 71 */
    1125000000,		/* 9Gbps - rate 72 */
    1250000000,		/* 10Gbps - rate 73 */
    1875000000,		/* 15Gbps - rate 74 */
    2500000000		/* 20Gbps - rate 75 */
};

#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number that are in order
				 * at the beginning of the table;
				 * over this a sort is required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The point in our table where
				 * we come fill in a second ordered
				 * group (index wise means -1).
				 */
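
/*
 * The entries above are in bytes per second (the units the snd-tag
 * API uses), not bits. A one-line conversion sketch (illustrative
 * userland code, not part of the kernel build):
 */
#if 0
#include <stdint.h>

/* 5Mbps -> 625000 bytes/sec, matching the "rate 4" entry above. */
static inline uint64_t
mbps_to_bytes_per_sec(uint64_t mbps)
{
    return ((mbps * 1000000) / 8);
}
#endif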

#define ALL_HARDWARE_RATES 1004	/*
				 * 1Meg - 1Gig in 1 Meg steps
				 * plus 100k, 200k and 500k and
				 * 10Gig
				 */

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000

static struct head_tcp_rate_set int_rs;
static struct mtx rs_mtx;
uint32_t rs_number_alive;
uint32_t rs_number_dead;
static uint32_t rs_floor_mss = 0;
static uint32_t wait_time_floor = 8000;	/* 8 ms */
static uint32_t rs_hw_floor_mss = 16;
static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
    &rs_floor_mss, 0,
    "Number of MSS that will override the normal minimums (0 means don't enforce)");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
    &wait_time_floor, 2000,
    "As b/w increases, what is the wait floor we are willing to wait at the end?");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
    &num_of_waits_allowed, 1,
    "How many time blocks on the end should software pacing be willing to wait?");

SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
    &rs_hw_floor_mss, 16,
    "Number of mss that are a minimum for hardware pacing?");
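
/*
 * The counters above land under net.inet.tcp.rl. A small userland
 * sketch of reading one of them with sysctlbyname(3) (illustrative
 * only, not part of this file):
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    unsigned int alive;
    size_t len = sizeof(alive);

    if (sysctlbyname("net.inet.tcp.rl.alive", &alive, &len,
        NULL, 0) == -1) {
        perror("sysctlbyname");
        return (1);
    }
    printf("ratelimit-capable interfaces: %u\n", alive);
    return (0);
}
#endif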

static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
    /*
     * Add sysctl entries for this interface.
     */
    if (rs->rs_flags & RS_INTF_NO_SUP) {
        SYSCTL_ADD_S32(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "disable", CTLFLAG_RD,
            &rs->rs_disable, 0,
            "Disable this interface from new hdwr limiting?");
    } else {
        SYSCTL_ADD_S32(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "disable", CTLFLAG_RW,
            &rs->rs_disable, 0,
            "Disable this interface from new hdwr limiting?");
    }
    SYSCTL_ADD_S32(&rs->sysctl_ctx,
        SYSCTL_CHILDREN(rl_sysctl_root),
        OID_AUTO, "minseg", CTLFLAG_RW,
        &rs->rs_min_seg, 0,
        "What is the minimum we need to send on this interface?");
    SYSCTL_ADD_U64(&rs->sysctl_ctx,
        SYSCTL_CHILDREN(rl_sysctl_root),
        OID_AUTO, "flow_limit", CTLFLAG_RW,
        &rs->rs_flow_limit, 0,
        "What is the limit for number of flows (0=unlimited)?");
    SYSCTL_ADD_S32(&rs->sysctl_ctx,
        SYSCTL_CHILDREN(rl_sysctl_root),
        OID_AUTO, "highest", CTLFLAG_RD,
        &rs->rs_highest_valid, 0,
        "Highest valid rate");
    SYSCTL_ADD_S32(&rs->sysctl_ctx,
        SYSCTL_CHILDREN(rl_sysctl_root),
        OID_AUTO, "lowest", CTLFLAG_RD,
        &rs->rs_lowest_valid, 0,
        "Lowest valid rate");
    SYSCTL_ADD_S32(&rs->sysctl_ctx,
        SYSCTL_CHILDREN(rl_sysctl_root),
        OID_AUTO, "flags", CTLFLAG_RD,
        &rs->rs_flags, 0,
        "What flags are on the entry?");
    SYSCTL_ADD_S32(&rs->sysctl_ctx,
        SYSCTL_CHILDREN(rl_sysctl_root),
        OID_AUTO, "numrates", CTLFLAG_RD,
        &rs->rs_rate_cnt, 0,
        "How many rates are there?");
    SYSCTL_ADD_U64(&rs->sysctl_ctx,
        SYSCTL_CHILDREN(rl_sysctl_root),
        OID_AUTO, "flows_using", CTLFLAG_RD,
        &rs->rs_flows_using, 0,
        "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
    if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
        /* Lets display the rates */
        int i;
        struct sysctl_oid *rl_rates;
        struct sysctl_oid *rl_rate_num;
        char rate_num[16];

        rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO,
            "rate",
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "Ratelist");
        for (i = 0; i < rs->rs_rate_cnt; i++) {
            sprintf(rate_num, "%d", i);
            rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
                SYSCTL_CHILDREN(rl_rates),
                OID_AUTO,
                rate_num,
                CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
                "Individual Rate");
            SYSCTL_ADD_U32(&rs->sysctl_ctx,
                SYSCTL_CHILDREN(rl_rate_num),
                OID_AUTO, "flags", CTLFLAG_RD,
                &rs->rs_rlt[i].flags, 0,
                "Flags on this rate");
            SYSCTL_ADD_U32(&rs->sysctl_ctx,
                SYSCTL_CHILDREN(rl_rate_num),
                OID_AUTO, "pacetime", CTLFLAG_RD,
                &rs->rs_rlt[i].time_between, 0,
                "Time hardware inserts between 1500 byte sends");
            SYSCTL_ADD_LONG(&rs->sysctl_ctx,
                SYSCTL_CHILDREN(rl_rate_num),
                OID_AUTO, "rate", CTLFLAG_RD,
                &rs->rs_rlt[i].rate,
                "Rate in bytes per second");
            SYSCTL_ADD_LONG(&rs->sysctl_ctx,
                SYSCTL_CHILDREN(rl_rate_num),
                OID_AUTO, "using", CTLFLAG_RD,
                &rs->rs_rlt[i].using,
                "Number of flows using");
            SYSCTL_ADD_LONG(&rs->sysctl_ctx,
                SYSCTL_CHILDREN(rl_rate_num),
                OID_AUTO, "enobufs", CTLFLAG_RD,
                &rs->rs_rlt[i].rs_num_enobufs,
                "Number of enobufs logged on this rate");
        }
    }
#endif
}

static void
rs_destroy(epoch_context_t ctx)
{
    struct tcp_rate_set *rs;
    bool do_free_rs;

    rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);

    mtx_lock(&rs_mtx);
    rs->rs_flags &= ~RS_FUNERAL_SCHD;
    /*
     * In theory it's possible (but unlikely)
     * that while the delete was occurring
     * and we were applying the DEAD flag,
     * someone slipped in and found the
     * interface in a lookup. While we
     * decided rs_flows_using was 0 and were
     * scheduling the epoch_call, the other
     * thread incremented rs_flows_using. This
     * is because users have a pointer and
     * we only use the rs_flows_using in an
     * atomic fashion, i.e. the other entities
     * are not protected. To assure this did
     * not occur, we check rs_flows_using here
     * before deleting.
     */
    do_free_rs = (rs->rs_flows_using == 0);
    rs_number_dead--;
    mtx_unlock(&rs_mtx);

    if (do_free_rs) {
        sysctl_ctx_free(&rs->sysctl_ctx);
        free(rs->rs_rlt, M_TCPPACE);
        free(rs, M_TCPPACE);
    }
}

static void
rs_defer_destroy(struct tcp_rate_set *rs)
{

    mtx_assert(&rs_mtx, MA_OWNED);

    /* Check if already pending. */
    if (rs->rs_flags & RS_FUNERAL_SCHD)
        return;

    rs_number_dead++;

    /* Set flag to only defer once. */
    rs->rs_flags |= RS_FUNERAL_SCHD;
    NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
}

#ifdef INET
extern counter_u64_t rate_limit_new;
extern counter_u64_t rate_limit_chg;
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
    int error;
    union if_snd_tag_alloc_params params = {
        .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
        .rate_limit.hdr.flowid = flowid,
        .rate_limit.hdr.flowtype = flowtype,
        .rate_limit.max_rate = cfg_rate,
        .rate_limit.flags = M_NOWAIT,
    };

    error = m_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
    if (error == 0) {
        counter_u64_add(rate_limit_set_ok, 1);
        counter_u64_add(rate_limit_active, 1);
    } else if (error != EOPNOTSUPP)
        counter_u64_add(rate_limit_alloc_fail, 1);
#endif
    return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
    /*
     * The internal table is "special"; it
     * is two separate ordered tables that
     * must be merged. We get here when the
     * adapter specifies a number of rates that
     * covers both ranges in the table in some
     * form.
     */
    int i, at_low, at_high;
    uint8_t low_disabled = 0, high_disabled = 0;

    for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
        rs->rs_rlt[i].flags = 0;
        rs->rs_rlt[i].time_between = 0;
        if ((low_disabled == 0) &&
            (high_disabled ||
             (rate_table_act[at_low] < rate_table_act[at_high]))) {
            rs->rs_rlt[i].rate = rate_table_act[at_low];
            at_low++;
            if (at_low == RS_NEXT_ORDER_GROUP)
                low_disabled = 1;
        } else if (high_disabled == 0) {
            rs->rs_rlt[i].rate = rate_table_act[at_high];
            at_high++;
            if (at_high == MAX_HDWR_RATES)
                high_disabled = 1;
        }
    }
}
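
/*
 * A standalone sketch of the same two-way merge over a toy table
 * (illustrative userland code with made-up values; only the merge
 * logic mirrors populate_canned_table() above):
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* Two ordered groups back to back, like desired_rates. */
    const uint64_t tbl[] = { 10, 40, 70, 100, 20, 30, 50, 60 };
    const int group2 = 4;	/* stand-in for RS_NEXT_ORDER_GROUP */
    const int cnt = 8;		/* stand-in for MAX_HDWR_RATES */
    int at_low = 0, at_high = group2;

    for (int i = 0; i < cnt; i++) {
        if (at_low < group2 &&
            (at_high == cnt || tbl[at_low] < tbl[at_high]))
            printf("%ju ", (uintmax_t)tbl[at_low++]);
        else
            printf("%ju ", (uintmax_t)tbl[at_high++]);
    }
    printf("\n");	/* 10 20 30 40 50 60 70 100 */
    return (0);
}
#endif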

static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
    struct tcp_rate_set *rs;
    const uint64_t *rate_table_act;
    uint64_t lentim, res;
    size_t sz;
    uint32_t hash_type;
    int i;
    struct if_ratelimit_query_results rl;
    struct sysctl_oid *rl_sysctl_root;
    struct epoch_tracker et;
    /*
     * We expect to enter with the
     * mutex locked.
     */

    if (ifp->if_ratelimit_query == NULL) {
        /*
         * We can do nothing if we cannot
         * get a query back from the driver.
         */
        printf("Warning:No query functions for %s:%d-- failed\n",
            ifp->if_dname, ifp->if_dunit);
        return (NULL);
    }
    rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
    if (rs == NULL) {
        if (error)
            *error = ENOMEM;
        printf("Warning:No memory for malloc of tcp_rate_set\n");
        return (NULL);
    }
    memset(&rl, 0, sizeof(rl));
    rl.flags = RT_NOSUPPORT;
    ifp->if_ratelimit_query(ifp, &rl);
    if (rl.flags & RT_IS_UNUSABLE) {
        /*
         * The interface does not really support
         * the rate-limiting.
         */
        memset(rs, 0, sizeof(struct tcp_rate_set));
        rs->rs_ifp = ifp;
        rs->rs_if_dunit = ifp->if_dunit;
        rs->rs_flags = RS_INTF_NO_SUP;
        rs->rs_disable = 1;
        rs_number_alive++;
        sysctl_ctx_init(&rs->sysctl_ctx);
        rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
            SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
            OID_AUTO,
            rs->rs_ifp->if_xname,
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "");
        rl_add_syctl_entries(rl_sysctl_root, rs);
        NET_EPOCH_ENTER(et);
        mtx_lock(&rs_mtx);
        CK_LIST_INSERT_HEAD(&int_rs, rs, next);
        mtx_unlock(&rs_mtx);
        NET_EPOCH_EXIT(et);
        return (rs);
    } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
        memset(rs, 0, sizeof(struct tcp_rate_set));
        rs->rs_ifp = ifp;
        rs->rs_if_dunit = ifp->if_dunit;
        rs->rs_flags = RS_IS_DEFF;
        rs_number_alive++;
        sysctl_ctx_init(&rs->sysctl_ctx);
        rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
            SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
            OID_AUTO,
            rs->rs_ifp->if_xname,
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "");
        rl_add_syctl_entries(rl_sysctl_root, rs);
        NET_EPOCH_ENTER(et);
        mtx_lock(&rs_mtx);
        CK_LIST_INSERT_HEAD(&int_rs, rs, next);
        mtx_unlock(&rs_mtx);
        NET_EPOCH_EXIT(et);
        return (rs);
    } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
        /* Mellanox C4 likely */
        rs->rs_ifp = ifp;
        rs->rs_if_dunit = ifp->if_dunit;
        rs->rs_rate_cnt = rl.number_of_rates;
        rs->rs_min_seg = rl.min_segment_burst;
        rs->rs_highest_valid = 0;
        rs->rs_flow_limit = rl.max_flows;
        rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
        rs->rs_disable = 0;
        rate_table_act = rl.rate_table;
    } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
        /* Chelsio, C5 and C6 of Mellanox? */
        rs->rs_ifp = ifp;
        rs->rs_if_dunit = ifp->if_dunit;
        rs->rs_rate_cnt = rl.number_of_rates;
        rs->rs_min_seg = rl.min_segment_burst;
        rs->rs_disable = 0;
        rs->rs_flow_limit = rl.max_flows;
        rate_table_act = desired_rates;
        if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
            (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
            /*
             * Our desired table is not big
             * enough, do what we can.
             */
            rs->rs_rate_cnt = MAX_HDWR_RATES;
        }
        if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
            rs->rs_flags = RS_IS_INTF;
        else
            rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
        if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
            rs->rs_rate_cnt = ALL_HARDWARE_RATES;
    } else {
        free(rs, M_TCPPACE);
        return (NULL);
    }
    sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
    rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
    if (rs->rs_rlt == NULL) {
        if (error)
            *error = ENOMEM;
bail:
        free(rs, M_TCPPACE);
        return (NULL);
    }
    if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
        /*
         * The interface supports all
         * the rates we could possibly want.
         */
        uint64_t rat;

        rs->rs_rlt[0].rate = 12500;	/* 100k */
        rs->rs_rlt[1].rate = 25000;	/* 200k */
        rs->rs_rlt[2].rate = 62500;	/* 500k */
        /*
         * Note 125000 == 1Megabit;
         * populate 1Meg - 1000meg.
         */
        for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
            rs->rs_rlt[i].rate = rat;
            rat += 125000;
        }
        rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
    } else if (rs->rs_flags & RS_INT_TBL) {
        /* We populate this in a special way */
        populate_canned_table(rs, rate_table_act);
    } else {
        /*
         * Just copy in the rates from
         * the table, it is in order.
         */
        for (i = 0; i < rs->rs_rate_cnt; i++) {
            rs->rs_rlt[i].rate = rate_table_act[i];
            rs->rs_rlt[i].time_between = 0;
            rs->rs_rlt[i].flags = 0;
        }
    }
    for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
        /*
         * We go backwards through the list so that if we can't get
         * a rate and fail to init one, we have at least a chance of
         * getting the highest one.
         */
        rs->rs_rlt[i].ptbl = rs;
        rs->rs_rlt[i].tag = NULL;
        rs->rs_rlt[i].using = 0;
        rs->rs_rlt[i].rs_num_enobufs = 0;
        /*
         * Calculate the time between.
         */
        lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
        res = lentim / rs->rs_rlt[i].rate;
        if (res > 0)
            rs->rs_rlt[i].time_between = res;
        else
            rs->rs_rlt[i].time_between = 1;
        if (rs->rs_flags & RS_NO_PRE) {
            rs->rs_rlt[i].flags = HDWRPACE_INITED;
            rs->rs_lowest_valid = i;
        } else {
            int err;

            if ((rl.flags & RT_IS_SETUP_REQ) &&
                (ifp->if_ratelimit_setup)) {
                err = ifp->if_ratelimit_setup(ifp,
                    rs->rs_rlt[i].rate, i);
                if (err)
                    goto handle_err;
            }
#ifdef RSS
            hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
            hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
            err = rl_attach_txrtlmt(ifp,
                hash_type,
                (i + 1),
                rs->rs_rlt[i].rate,
                &rs->rs_rlt[i].tag);
            if (err) {
handle_err:
                if (i == (rs->rs_rate_cnt - 1)) {
                    /*
                     * Huh - first rate and we can't get
                     * it?
                     */
                    free(rs->rs_rlt, M_TCPPACE);
                    if (error)
                        *error = err;
                    goto bail;
                } else {
                    if (error)
                        *error = err;
                }
                break;
            } else {
                rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
                rs->rs_lowest_valid = i;
            }
        }
    }
    /* Did we get at least 1 rate? */
    if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
        rs->rs_highest_valid = rs->rs_rate_cnt - 1;
    else {
        free(rs->rs_rlt, M_TCPPACE);
        goto bail;
    }
    rs_number_alive++;
    sysctl_ctx_init(&rs->sysctl_ctx);
    rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
        SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
        OID_AUTO,
        rs->rs_ifp->if_xname,
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "");
    rl_add_syctl_entries(rl_sysctl_root, rs);
    NET_EPOCH_ENTER(et);
    mtx_lock(&rs_mtx);
    CK_LIST_INSERT_HEAD(&int_rs, rs, next);
    mtx_unlock(&rs_mtx);
    NET_EPOCH_EXIT(et);
    return (rs);
}
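
/*
 * The index <-> rate mapping that the ALL_HARDWARE_RATES population
 * above implies, and that tcp_int_find_suitable_rate() below inverts
 * with its "+2" offset. An illustrative userland sketch (the helper
 * name is a stand-in):
 */
#if 0
#include <stdint.h>

/* Index 0..2 hold 100k/200k/500k; index i >= 3 holds (i - 2) Mbps,
 * expressed in bytes per second; the last slot (1003) holds 10Gbps. */
static uint64_t
table_rate(int i)
{
    if (i == 0)
        return (12500);		/* 100kbps */
    if (i == 1)
        return (25000);		/* 200kbps */
    if (i == 2)
        return (62500);		/* 500kbps */
    if (i == 1003)
        return (1250000000);	/* 10Gbps */
    return ((uint64_t)(i - 2) * 125000);	/* (i - 2) Mbps */
}
#endif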

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
    struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
    uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
    int i;

    mbits_per_sec = (bytes_per_sec * 8);
    if (flags & RS_PACING_LT) {
        if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
            (rs->rs_lowest_valid <= 2)) {
            /*
             * Smaller than 1Meg, only
             * 3 entries can match it.
             */
            previous_rate = 0;
            for (i = rs->rs_lowest_valid; i < 3; i++) {
                if (bytes_per_sec <= rs->rs_rlt[i].rate) {
                    rte = &rs->rs_rlt[i];
                    break;
                } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
                    arte = &rs->rs_rlt[i];
                }
                previous_rate = rs->rs_rlt[i].rate;
            }
            goto done;
        } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
            /*
             * Larger than 1G (the majority of
             * our table).
             */
            if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
                rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
            else
                arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
            previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
            goto done;
        }
        /*
         * If we reach here it's in our table (between 1Meg - 1000Meg):
         * take the rounded-down mbits per second (adding 1Megabit if
         * there was a remainder); from this we can calculate
         * the index in the table.
         */
        ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
        if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
            ind_calc++;
        /* our table is offset by 3, we add 2 */
        ind_calc += 2;
        if (ind_calc > (ALL_HARDWARE_RATES-1)) {
            /* This should not happen */
            ind_calc = ALL_HARDWARE_RATES-1;
        }
        if ((ind_calc >= rs->rs_lowest_valid) &&
            (ind_calc <= rs->rs_highest_valid)) {
            rte = &rs->rs_rlt[ind_calc];
            if (ind_calc >= 1)
                previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
        }
    } else if (flags & RS_PACING_EXACT_MATCH) {
        if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
            (rs->rs_lowest_valid <= 2)) {
            for (i = rs->rs_lowest_valid; i < 3; i++) {
                if (bytes_per_sec == rs->rs_rlt[i].rate) {
                    rte = &rs->rs_rlt[i];
                    break;
                }
            }
        } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
            /* > 1Gbps only one rate */
            if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
                /* It's the top (10G) rate */
                rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
            }
        } else {
            /* Ok it must be an exact meg (it's between 1Meg and 1G) */
            ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
            if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
                /* it's an exact Mbps */
                ind_calc += 2;
                if (ind_calc > (ALL_HARDWARE_RATES-1)) {
                    /* This should not happen */
                    ind_calc = ALL_HARDWARE_RATES-1;
                }
                if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
                    rte = &rs->rs_rlt[ind_calc];
            }
        }
    } else {
        /* we want greater than the requested rate */
        if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
            (rs->rs_lowest_valid <= 2)) {
            arte = &rs->rs_rlt[3];	/* set alternate to 1Meg */
            for (i = 2; i >= rs->rs_lowest_valid; i--) {
                if (bytes_per_sec < rs->rs_rlt[i].rate) {
                    rte = &rs->rs_rlt[i];
                    if (i >= 1) {
                        previous_rate = rs->rs_rlt[(i-1)].rate;
                    }
                    break;
                } else if ((flags & RS_PACING_GEQ) &&
                    (bytes_per_sec == rs->rs_rlt[i].rate)) {
                    rte = &rs->rs_rlt[i];
                    if (i >= 1) {
                        previous_rate = rs->rs_rlt[(i-1)].rate;
                    }
                    break;
                } else {
                    arte = &rs->rs_rlt[i];	/* new alternate */
                }
            }
        } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
            if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
                (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
                /* Our top rate is larger than the request */
                rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
            } else if ((flags & RS_PACING_GEQ) &&
                (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
                (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
                /* It matches our top rate */
                rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
            } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
                /* The top rate is an alternative */
                arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
            }
            previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
        } else {
            /* It's in our range 1Meg - 1Gig */
            if (flags & RS_PACING_GEQ) {
                ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
                if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
                    if (ind_calc > (ALL_HARDWARE_RATES-1)) {
                        /* This should not happen */
                        ind_calc = (ALL_HARDWARE_RATES-1);
                    }
                    rte = &rs->rs_rlt[ind_calc];
                    if (ind_calc >= 1)
                        previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
                }
                goto done;
            }
            ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
            ind_calc += 2;
            if (ind_calc > (ALL_HARDWARE_RATES-1)) {
                /* This should not happen */
                ind_calc = ALL_HARDWARE_RATES-1;
            }
            if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
                rte = &rs->rs_rlt[ind_calc];
                if (ind_calc >= 1)
                    previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
            }
        }
    }
done:
    if ((rte == NULL) &&
        (arte != NULL) &&
        (flags & RS_PACING_SUB_OK)) {
        /* We can use the substitute */
        rte = arte;
    }
    if (lower_rate)
        *lower_rate = previous_rate;
    return (rte);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
    /**
     * Hunt the rate table with the restrictions in flags and find a
     * suitable rate if possible.
     * RS_PACING_EXACT_MATCH - look for an exact match to rate.
     * RS_PACING_GT - must be greater than.
     * RS_PACING_GEQ - must be greater than or equal.
     * RS_PACING_LT - must be less than.
     * RS_PACING_SUB_OK - If we don't meet criteria a
     *                    substitute is ok.
     */
    int i, matched;
    struct tcp_hwrate_limit_table *rte = NULL;
    uint64_t previous_rate = 0;

    if ((rs->rs_flags & RS_INT_TBL) &&
        (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
        /*
         * Here we don't want to paw through
         * a big table; we have everything
         * from 1Meg - 1000Meg in 1Meg increments.
         * Use an alternate method to "lookup".
         */
        return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
    }
    if ((flags & RS_PACING_LT) ||
        (flags & RS_PACING_EXACT_MATCH)) {
        /*
         * For exact and less-than we go forward through the table.
         * This way when we find one larger we stop (exact was a
         * toss up).
         */
        for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
            if ((flags & RS_PACING_EXACT_MATCH) &&
                (bytes_per_sec == rs->rs_rlt[i].rate)) {
                rte = &rs->rs_rlt[i];
                matched = 1;
                if (lower_rate != NULL)
                    *lower_rate = previous_rate;
                break;
            } else if ((flags & RS_PACING_LT) &&
                (bytes_per_sec <= rs->rs_rlt[i].rate)) {
                rte = &rs->rs_rlt[i];
                matched = 1;
                if (lower_rate != NULL)
                    *lower_rate = previous_rate;
                break;
            }
            previous_rate = rs->rs_rlt[i].rate;
            if (bytes_per_sec > rs->rs_rlt[i].rate)
                break;
        }
        if ((matched == 0) &&
            (flags & RS_PACING_LT) &&
            (flags & RS_PACING_SUB_OK)) {
            /* Kick in a substitute (the lowest) */
            rte = &rs->rs_rlt[rs->rs_lowest_valid];
        }
    } else {
        /*
         * Here we go backward through the table so that we can find
         * the one greater in theory faster (but it's probably a
         * wash).
         */
        for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
            if (rs->rs_rlt[i].rate > bytes_per_sec) {
                /* A possible candidate */
                rte = &rs->rs_rlt[i];
            }
            if ((flags & RS_PACING_GEQ) &&
                (bytes_per_sec == rs->rs_rlt[i].rate)) {
                /* An exact match and we want equal */
                matched = 1;
                rte = &rs->rs_rlt[i];
                break;
            } else if (rte) {
                /*
                 * Found one that is larger, but don't
                 * stop; there may be a closer match.
                 */
                matched = 1;
            }
            if (rs->rs_rlt[i].rate < bytes_per_sec) {
                /*
                 * We found a table entry that is smaller;
                 * stop, there will be none greater or equal.
                 */
                if (lower_rate != NULL)
                    *lower_rate = rs->rs_rlt[i].rate;
                break;
            }
        }
        if ((matched == 0) &&
            (flags & RS_PACING_SUB_OK)) {
            /* Kick in a substitute (the highest) */
            rte = &rs->rs_rlt[rs->rs_highest_valid];
        }
    }
    return (rte);
}
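
/*
 * A toy illustration of the greater-than/GEQ selection semantics over
 * a plain sorted array (illustrative userland code; the flag values
 * and helper below are stand-ins, not the kernel definitions):
 */
#if 0
#include <stdint.h>
#include <stddef.h>

#define EX_GEQ    0x01	/* want the first rate >= the request */
#define EX_SUB_OK 0x02	/* the highest rate is an ok substitute */

static const uint64_t *
pick_rate(const uint64_t *tbl, size_t cnt, uint64_t req, int flags)
{
    for (size_t i = 0; i < cnt; i++) {
        if (tbl[i] > req || ((flags & EX_GEQ) && tbl[i] == req))
            return (&tbl[i]);
    }
    /* Nothing met the criteria; substitute the highest if allowed. */
    if ((flags & EX_SUB_OK) && cnt > 0)
        return (&tbl[cnt - 1]);
    return (NULL);
}
#endif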

static struct ifnet *
rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
{
    struct ifnet *tifp;
    struct m_snd_tag *tag, *ntag;
    union if_snd_tag_alloc_params params = {
        .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
        .rate_limit.hdr.flowid = inp->inp_flowid,
        .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
        .rate_limit.max_rate = COMMON_RATE,
        .rate_limit.flags = M_NOWAIT,
    };
    int err;
#ifdef RSS
    params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
        M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
#else
    params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
#endif
    err = m_snd_tag_alloc(ifp, &params, &tag);
    if (err) {
        /* Failed to setup a tag? */
        if (error)
            *error = err;
        return (NULL);
    }
    ntag = tag;
    while (ntag->sw->next_snd_tag != NULL) {
        ntag = ntag->sw->next_snd_tag(ntag);
    }
    tifp = ntag->ifp;
    m_snd_tag_rele(tag);
    return (tifp);
}

static void
rl_increment_using(const struct tcp_hwrate_limit_table *rte)
{
    struct tcp_hwrate_limit_table *decon_rte;

    decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
    atomic_add_long(&decon_rte->using, 1);
}

static void
rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
{
    struct tcp_hwrate_limit_table *decon_rte;

    decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
    atomic_subtract_long(&decon_rte->using, 1);
}

void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
    struct tcp_hwrate_limit_table *decon_rte;

    decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
    atomic_add_long(&decon_rte->rs_num_enobufs, 1);
}

/*
 * Do NOT take the __noinline out of the
 * find_rs_for_ifp() function. If you do, inlining
 * it into rt_setup_rate() will show you a
 * compiler bug. For some reason the compiler thinks
 * the list can never be empty. The consequence of
 * this will be a crash when we dereference NULL
 * if an ifp is removed just as a hw rate limit
 * is attempted. If you are working on the compiler
 * and want to "test" this go ahead and take the noinline
 * out; otherwise let sleeping dogs lie until such time
 * as we get a compiler fix 10/2/20 -- RRS
 */
static __noinline struct tcp_rate_set *
find_rs_for_ifp(struct ifnet *ifp)
{
    struct tcp_rate_set *rs;

    CK_LIST_FOREACH(rs, &int_rs, next) {
        if ((rs->rs_ifp == ifp) &&
            (rs->rs_if_dunit == ifp->if_dunit)) {
            /* Ok we found it */
            return (rs);
        }
    }
    return (NULL);
}

static const struct tcp_hwrate_limit_table *
rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
    uint32_t flags, int *error, uint64_t *lower_rate)
{
    /* First lets find the interface if it exists */
    const struct tcp_hwrate_limit_table *rte;
    /*
     * So why is rs volatile? This is to defeat a
     * compiler bug wherein the compiler is convinced
     * that rs can never be NULL (which is not true). Because
     * of its conviction it nicely optimizes out the
     * if (rs == NULL) check below, which means if you get a
     * NULL back you dereference it.
     */
    volatile struct tcp_rate_set *rs;
    struct epoch_tracker et;
    struct ifnet *oifp = ifp;
    int err;

    NET_EPOCH_ENTER(et);
use_real_interface:
    rs = find_rs_for_ifp(ifp);
    if ((rs == NULL) ||
        (rs->rs_flags & RS_INTF_NO_SUP) ||
        (rs->rs_flags & RS_IS_DEAD)) {
        /*
         * This means we got a packet *before*
         * the IF-UP was processed below, <or>
         * while or after we already received an interface
         * departed event. In either case we really don't
         * want to do anything with pacing; in
         * the departing case the packet is not
         * going to go very far. The new case
         * might be arguable, but it's impossible
         * to tell apart from the departing case.
         */
        if (error)
            *error = ENODEV;
        NET_EPOCH_EXIT(et);
        return (NULL);
    }

    if ((rs == NULL) || (rs->rs_disable != 0)) {
        if (error)
            *error = ENOSPC;
        NET_EPOCH_EXIT(et);
        return (NULL);
    }
    if (rs->rs_flags & RS_IS_DEFF) {
        /* We need to find the real interface */
        struct ifnet *tifp;

        tifp = rt_find_real_interface(ifp, inp, error);
        if (tifp == NULL) {
            if (rs->rs_disable && error)
                *error = ENOTSUP;
            NET_EPOCH_EXIT(et);
            return (NULL);
        }
        KASSERT((tifp != ifp),
            ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
             ifp, inp, tifp));
        ifp = tifp;
        goto use_real_interface;
    }
    if (rs->rs_flow_limit &&
        ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
        if (error)
            *error = ENOSPC;
        NET_EPOCH_EXIT(et);
        return (NULL);
    }
    rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
    if (rte) {
        err = in_pcbattach_txrtlmt(inp, oifp,
            inp->inp_flowtype,
            inp->inp_flowid,
            rte->rate,
            &inp->inp_snd_tag);
        if (err) {
            /* Failed to attach */
            if (error)
                *error = err;
            rte = NULL;
        } else {
            KASSERT((inp->inp_snd_tag != NULL),
                ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
                 inp, rte, (unsigned long long)rte->rate, rs));
#ifdef INET
            counter_u64_add(rate_limit_new, 1);
#endif
        }
    }
    if (rte) {
        /*
         * We use an atomic here for accounting so we don't have to
         * use locks when freeing.
         */
        atomic_add_64(&rs->rs_flows_using, 1);
    }
    NET_EPOCH_EXIT(et);
    return (rte);
}
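
/*
 * The acquire/release pattern used above in miniature: readers bump
 * the count with an atomic while inside an epoch section, and the
 * destructor re-checks the count under the lock because readers are
 * not otherwise synchronized (see the comment in rs_destroy()). A
 * compressed userland sketch with C11 atomics (names are stand-ins):
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
    atomic_long users;
    bool dead;
};

/* Reader side: done under an epoch/RCU read section in the kernel. */
static void
obj_acquire(struct obj *o)
{
    atomic_fetch_add(&o->users, 1);
}

/* Writer side: only safe to free once no reader slipped in. */
static bool
obj_may_free(struct obj *o)
{
    return (o->dead && atomic_load(&o->users) == 0);
}
#endif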

static void
tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
{
    int error;
    struct tcp_rate_set *rs;
    struct epoch_tracker et;

    if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
        (link_state != LINK_STATE_UP)) {
        /*
         * We only care about an interface going up that is
         * rate-limit capable.
         */
        return;
    }
    NET_EPOCH_ENTER(et);
    mtx_lock(&rs_mtx);
    rs = find_rs_for_ifp(ifp);
    if (rs) {
        /* We already have initialized this guy */
        mtx_unlock(&rs_mtx);
        NET_EPOCH_EXIT(et);
        return;
    }
    mtx_unlock(&rs_mtx);
    NET_EPOCH_EXIT(et);
    rt_setup_new_rs(ifp, &error);
}

static void
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
    struct tcp_rate_set *rs;
    struct epoch_tracker et;
    int i;

    NET_EPOCH_ENTER(et);
    mtx_lock(&rs_mtx);
    rs = find_rs_for_ifp(ifp);
    if (rs) {
        CK_LIST_REMOVE(rs, next);
        rs_number_alive--;
        rs->rs_flags |= RS_IS_DEAD;
        for (i = 0; i < rs->rs_rate_cnt; i++) {
            if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
                in_pcbdetach_tag(rs->rs_rlt[i].tag);
                rs->rs_rlt[i].tag = NULL;
            }
            rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
        }
        if (rs->rs_flows_using == 0)
            rs_defer_destroy(rs);
    }
    mtx_unlock(&rs_mtx);
    NET_EPOCH_EXIT(et);
}

static void
tcp_rl_shutdown(void *arg __unused, int howto __unused)
{
    struct tcp_rate_set *rs, *nrs;
    struct epoch_tracker et;
    int i;

    NET_EPOCH_ENTER(et);
    mtx_lock(&rs_mtx);
    CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
        CK_LIST_REMOVE(rs, next);
        rs_number_alive--;
        rs->rs_flags |= RS_IS_DEAD;
        for (i = 0; i < rs->rs_rate_cnt; i++) {
            if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
                in_pcbdetach_tag(rs->rs_rlt[i].tag);
                rs->rs_rlt[i].tag = NULL;
            }
            rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
        }
        if (rs->rs_flows_using == 0)
            rs_defer_destroy(rs);
    }
    mtx_unlock(&rs_mtx);
    NET_EPOCH_EXIT(et);
}

const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
    struct inpcb *inp = tptoinpcb(tp);
    const struct tcp_hwrate_limit_table *rte;
#ifdef KERN_TLS
    struct ktls_session *tls;
#endif

    INP_WLOCK_ASSERT(inp);

    if (inp->inp_snd_tag == NULL) {
        /*
         * We are setting up a rate for the first time.
         */
        if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
            /* Not supported by the egress */
            if (error)
                *error = ENODEV;
            return (NULL);
        }
#ifdef KERN_TLS
        tls = NULL;
        if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) {
            tls = tptosocket(tp)->so_snd.sb_tls_info;

            if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
                tls->mode != TCP_TLS_MODE_IFNET) {
                if (error)
                    *error = ENODEV;
                return (NULL);
            }
        }
#endif
        rte = rt_setup_rate(inp, ifp, bytes_per_sec, flags, error, lower_rate);
        if (rte)
            rl_increment_using(rte);
#ifdef KERN_TLS
        if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
            /*
             * Fake a route change error to reset the TLS
             * send tag. This will convert the existing
             * tag to a TLS ratelimit tag.
             */
            MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
            ktls_output_eagain(inp, tls);
        }
#endif
    } else {
        /*
         * We are modifying a rate, wrong interface?
         */
        if (error)
            *error = EINVAL;
        rte = NULL;
    }
    if (rte != NULL) {
        tp->t_pacing_rate = rte->rate;
        *error = 0;
    }
    return (rte);
}
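
/*
 * How a TCP stack might consume the API above and below: acquire a
 * rate once, move it as the b/w estimate changes, release it on
 * teardown. A minimal sketch (tp->t_stack_rte is a hypothetical field
 * standing in for wherever the stack keeps its rate entry; error
 * handling is elided):
 */
#if 0
static void
stack_update_hw_pacing(struct tcpcb *tp, struct ifnet *ifp, uint64_t bw)
{
    const struct tcp_hwrate_limit_table *crte = tp->t_stack_rte;
    uint64_t lower;
    int err;

    if (crte == NULL)
        crte = tcp_set_pacing_rate(tp, ifp, bw,
            RS_PACING_GEQ | RS_PACING_SUB_OK, &err, &lower);
    else
        crte = tcp_chg_pacing_rate(crte, tp, ifp, bw,
            RS_PACING_GEQ | RS_PACING_SUB_OK, &err, &lower);
    tp->t_stack_rte = crte;	/* may be NULL if nothing was available */
}

/* On connection teardown (or when giving up on hardware pacing):
 *   if (tp->t_stack_rte != NULL)
 *       tcp_rel_pacing_rate(tp->t_stack_rte, tp);
 */
#endif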

const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
    struct inpcb *inp = tptoinpcb(tp);
    const struct tcp_hwrate_limit_table *nrte;
    const struct tcp_rate_set *rs;
#ifdef KERN_TLS
    struct ktls_session *tls = NULL;
#endif
    int err;

    INP_WLOCK_ASSERT(inp);

    if (crte == NULL) {
        /* Wrong interface */
        if (error)
            *error = EINVAL;
        return (NULL);
    }

#ifdef KERN_TLS
    if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) {
        tls = tptosocket(tp)->so_snd.sb_tls_info;
        if (tls->mode != TCP_TLS_MODE_IFNET)
            tls = NULL;
        else if (tls->snd_tag != NULL &&
            tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
            if (!tls->reset_pending) {
                /*
                 * NIC probably doesn't support
                 * ratelimit TLS tags if it didn't
                 * allocate one when an existing rate
                 * was present, so ignore.
                 */
                tcp_rel_pacing_rate(crte, tp);
                if (error)
                    *error = EOPNOTSUPP;
                return (NULL);
            }

            /*
             * The send tag is being converted, so set the
             * rate limit on the inpcb tag. There is a
             * race that the new NIC send tag might use
             * the current rate instead of this one.
             */
            tls = NULL;
        }
    }
#endif
    if (inp->inp_snd_tag == NULL) {
        /* Wrong interface */
        tcp_rel_pacing_rate(crte, tp);
        if (error)
            *error = EINVAL;
        return (NULL);
    }
    rs = crte->ptbl;
    if ((rs->rs_flags & RS_IS_DEAD) ||
        (crte->flags & HDWRPACE_IFPDEPARTED)) {
        /* Release the rate, and try anew */

        tcp_rel_pacing_rate(crte, tp);
        nrte = tcp_set_pacing_rate(tp, ifp,
            bytes_per_sec, flags, error, lower_rate);
        return (nrte);
    }
    nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
    if (nrte == crte) {
        /* No change */
        if (error)
            *error = 0;
        return (crte);
    }
    if (nrte == NULL) {
        /* Release the old rate */
        if (error)
            *error = ENOENT;
        tcp_rel_pacing_rate(crte, tp);
        return (NULL);
    }
    rl_decrement_using(crte);
    rl_increment_using(nrte);
    /* Change rates to our new entry */
#ifdef KERN_TLS
    if (tls != NULL)
        err = ktls_modify_txrtlmt(tls, nrte->rate);
    else
#endif
        err = in_pcbmodify_txrtlmt(inp, nrte->rate);
    if (err) {
        struct tcp_rate_set *lrs;
        uint64_t pre;

        rl_decrement_using(nrte);
        lrs = __DECONST(struct tcp_rate_set *, rs);
        pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
        /* Do we still have a snd-tag attached? */
        if (inp->inp_snd_tag)
            in_pcbdetach_txrtlmt(inp);

        if (pre == 1) {
            struct epoch_tracker et;

            NET_EPOCH_ENTER(et);
            mtx_lock(&rs_mtx);
            /*
             * Is it dead?
             */
            if (lrs->rs_flags & RS_IS_DEAD)
                rs_defer_destroy(lrs);
            mtx_unlock(&rs_mtx);
            NET_EPOCH_EXIT(et);
        }
        if (error)
            *error = err;
        return (NULL);
    } else {
#ifdef INET
        counter_u64_add(rate_limit_chg, 1);
#endif
    }
    if (error)
        *error = 0;
    tp->t_pacing_rate = nrte->rate;
    return (nrte);
}

void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
{
    struct inpcb *inp = tptoinpcb(tp);
    const struct tcp_rate_set *crs;
    struct tcp_rate_set *rs;
    uint64_t pre;

    INP_WLOCK_ASSERT(inp);

    tp->t_pacing_rate = -1;
    crs = crte->ptbl;
    /*
     * Now we must break the const
     * in order to release our refcount.
     */
    rs = __DECONST(struct tcp_rate_set *, crs);
    rl_decrement_using(crte);
    pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
    if (pre == 1) {
        struct epoch_tracker et;

        NET_EPOCH_ENTER(et);
        mtx_lock(&rs_mtx);
        /*
         * Is it dead?
         */
        if (rs->rs_flags & RS_IS_DEAD)
            rs_defer_destroy(rs);
        mtx_unlock(&rs_mtx);
        NET_EPOCH_EXIT(et);
    }

    /*
     * XXX: If this connection is using ifnet TLS, should we
     * switch it to using an unlimited rate, or perhaps use
     * ktls_output_eagain() to reset the send tag to a plain
     * TLS tag?
     */
    in_pcbdetach_txrtlmt(inp);
}

#define ONE_POINT_TWO_MEG 150000	/* 1.2 megabits in bytes */
#define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
#define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
#define MAX_MSS_SENT 43			/* 43 mss = 43 x 1500 = 64,500 bytes */

static void
tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
    uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
    uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
{
    if (tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = segsiz;
        log.u_bbr.flex2 = new_tso;
        log.u_bbr.flex3 = time_between;
        log.u_bbr.flex4 = calc_time_between;
        log.u_bbr.flex5 = segs;
        log.u_bbr.flex6 = res_div;
        log.u_bbr.flex7 = mult;
        log.u_bbr.flex8 = mod;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.cur_del_rate = bw;
        log.u_bbr.delRate = hw_rate;
        TCP_LOG_EVENTP(tp, NULL,
            &tptosocket(tp)->so_rcv,
            &tptosocket(tp)->so_snd,
            TCP_HDWR_PACE_SIZE, 0,
            0, &log, false, &tv);
    }
}

uint32_t
tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err)
{
    /*
     * We use the google formula to calculate the
     * TSO size. I.E.
     * bw < 24Meg
     *   tso = 2mss
     * else
     *   tso = min(bw/1000, 64k)
     *
     * Note for these calculations we ignore the
     * packet overhead (enet hdr, ip hdr and tcp hdr).
     */
    uint64_t lentim, res, bytes;
    uint32_t new_tso, min_tso_segs;

    bytes = bw / 1000;
    if (bytes > (64 * 1000))
        bytes = 64 * 1000;
    /* Round up */
    new_tso = (bytes + segsiz - 1) / segsiz;
    if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
        min_tso_segs = 1;
    else
        min_tso_segs = 2;
    if (rs_floor_mss && (new_tso < rs_floor_mss))
        new_tso = rs_floor_mss;
    else if (new_tso < min_tso_segs)
        new_tso = min_tso_segs;
    if (new_tso > MAX_MSS_SENT)
        new_tso = MAX_MSS_SENT;
    new_tso *= segsiz;
    tcp_log_pacing_size(tp, bw, segsiz, new_tso,
        0, 0, 0, 0, 0, 0, 1);
    /*
     * If we are not doing hardware pacing
     * then we are done.
     */
    if (te == NULL) {
        if (err)
            *err = 0;
        return (new_tso);
    }
    /*
     * For hardware pacing we look at the
     * rate you are sending at and compare
     * that to the rate you have in hardware.
     *
     * If the hardware rate is slower than your
     * software rate then you are in error and
     * we will build a queue in our hardware which
     * is probably not desired; in such a case
     * just return the non-hardware TSO size.
     *
     * If the rate in hardware is faster (which
     * it should be) then look at how long it
     * takes to send one ethernet segment size at
     * your b/w and compare that to the time it
     * takes to send at the rate you had selected.
     *
     * If your time is greater (which we hope it is)
     * we get the delta between the two, and then
     * divide that into your pacing time. This tells
     * us how many MSS you can send down at once (rounded up).
     *
     * Note we also double this value if the b/w is over
     * 100Mbps. If it's over 500meg we just set you to the
     * max (43 segments).
     */
    if (te->rate > FIVE_HUNDRED_MBPS)
        goto max;
    if (te->rate == bw) {
        /* We are pacing at exactly the hdwr rate */
max:
        tcp_log_pacing_size(tp, bw, segsiz, new_tso,
            te->rate, te->time_between, (uint32_t)0,
            (segsiz * MAX_MSS_SENT), 0, 0, 3);
        return (segsiz * MAX_MSS_SENT);
    }
    lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
    res = lentim / bw;
    if (res > te->time_between) {
        uint32_t delta, segs, res_div;

        res_div = ((res * num_of_waits_allowed) + wait_time_floor);
        delta = res - te->time_between;
        segs = (res_div + delta - 1)/delta;
        if (segs < min_tso_segs)
            segs = min_tso_segs;
        if (segs < rs_hw_floor_mss)
            segs = rs_hw_floor_mss;
        if (segs > MAX_MSS_SENT)
            segs = MAX_MSS_SENT;
        segs *= segsiz;
        tcp_log_pacing_size(tp, bw, segsiz, new_tso,
            te->rate, te->time_between, (uint32_t)res,
            segs, res_div, 1, 3);
        if (err)
            *err = 0;
        if (segs < new_tso) {
            /* unexpected ? */
            return (new_tso);
        } else {
            return (segs);
        }
    } else {
        /*
         * Your time is smaller which means
         * we will grow a queue on our
         * hardware. Send back the non-hardware
         * rate.
         */
        tcp_log_pacing_size(tp, bw, segsiz, new_tso,
            te->rate, te->time_between, (uint32_t)res,
            0, 0, 0, 4);
        if (err)
            *err = -1;
        return (new_tso);
    }
}
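
/*
 * Worked numbers for the software (non-hardware) TSO sizing above,
 * as a minimal userland sketch of the same arithmetic (the bandwidth
 * and segment size are example values only):
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t bw = 3638750;	/* ~29.1Mbps in bytes per second */
    uint32_t segsiz = 1448;
    uint64_t bytes = bw / 1000;	/* google formula: bw/1000... */
    uint32_t tso;

    if (bytes > (64 * 1000))	/* ...capped at 64k */
        bytes = 64 * 1000;
    tso = (bytes + segsiz - 1) / segsiz;	/* round up: 3 segments */
    if (tso < 2)		/* enforce the 2-MSS minimum */
        tso = 2;
    if (tso > 43)		/* the MAX_MSS_SENT clamp */
        tso = 43;
    printf("burst = %u bytes (%u MSS)\n", tso * segsiz, tso);
    return (0);
}
#endif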

uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
{
    struct epoch_tracker et;
    struct tcp_rate_set *rs;
    uint64_t rate_ret;

    NET_EPOCH_ENTER(et);
use_next_interface:
    rs = find_rs_for_ifp(ifp);
    if (rs == NULL) {
        /* This interface does not do ratelimiting */
        rate_ret = 0;
    } else if (rs->rs_flags & RS_IS_DEFF) {
        /* We need to find the real interface */
        struct ifnet *tifp;

        tifp = rt_find_real_interface(ifp, inp, NULL);
        if (tifp == NULL) {
            NET_EPOCH_EXIT(et);
            return (0);
        }
        ifp = tifp;
        goto use_next_interface;
    } else {
        /* Lets return the highest rate this guy has */
        rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
    }
    NET_EPOCH_EXIT(et);
    return (rate_ret);
}

static eventhandler_tag rl_ifnet_departs;
static eventhandler_tag rl_ifnet_arrives;
static eventhandler_tag rl_shutdown_start;

static void
tcp_rs_init(void *st __unused)
{
    CK_LIST_INIT(&int_rs);
    rs_number_alive = 0;
    rs_number_dead = 0;
    mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
    rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
        tcp_rl_ifnet_departure,
        NULL, EVENTHANDLER_PRI_ANY);
    rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
        tcp_rl_ifnet_link,
        NULL, EVENTHANDLER_PRI_ANY);
    rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
        tcp_rl_shutdown, NULL,
        SHUTDOWN_PRI_FIRST);
    printf("TCP_ratelimit: Is now initialized\n");
}

SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
#endif