/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2020
 * Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#define TCPSTATES		/* for logging */
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

/*
 * The following preferred table will seem weird to
 * the casual viewer. Why do we not have any rates below
 * 1Mbps? Why do we have a rate at 1.44Mbps called common?
 * Why do the rates cluster in the 1-100Mbps range more
 * than others? Why does the table jump around at the beginning
 * and then rise more consistently?
 *
 * Let me try to answer those questions. A lot of
 * this is dependent on the hardware. We have three basic
 * supporters of rate limiting:
 *
 * Chelsio - Supporting 16 configurable rates.
 * Mlx  - c4 supporting 13 fixed rates.
 * Mlx  - c5 & c6 supporting 127 configurable rates.
 *
 * The c4 is why we have a common rate that is available
 * in all rate tables. This is a selected rate from the
 * c4 table and we ensure it is available in all ratelimit
 * tables. This way the tcp_ratelimit code has an assured
 * rate it should always be able to get. This answers a
 * couple of the questions above.
 *
 * So what about the rest? Well, the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system. The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons:
 * a) So we can discover more b/w
 * and
 * b) So we can send a block of MSS's down and then
 *    have the software timer go off after the previous
 *    send is completely out of the hardware.
 *
 * But when we do <b> we don't want the delay after the
 * last packet sent by the hardware to be excessively
 * long (to reach our desired rate).
 *
 * So let me give an example for clarity.
 *
 * Lets assume that the tcp stack sees that 29,110,000 bps is
 * what the bw of the path is. The stack would select the
 * rate 31Mbps. 31Mbps means that each send that is done
 * by the hardware will cause a 390 micro-second gap between
 * the packets sent at that rate. For 29,110,000 bps we
 * would need a 416 micro-second gap between each send.
 *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
 * My testing has shown that both cards are also using this
 * as their basis, i.e. the full payload size of the ethernet frame.
 * The TCP stack caller needs to be aware of this and make the
 * appropriate overhead calculations be included in its choices.
 *
 * Now, continuing our example, we pick a MSS size based on the
 * delta between the two rates (416 - 390) divided into the rate
 * we really wish to send at, rounded up. That results in a
 * send of 17 MSS's at once. The hardware then will
 * run out of data in a single 17MSS send in 6,630 micro-seconds.
 *
 * On the other hand the software pacer will send more data
 * in 7,072 micro-seconds. This means that we will refill
 * the hardware 52 microseconds after it would have sent
 * next if it had not run out of data. This is a win since we are
 * only sending every 7ms or so and yet all the packets are spaced on
 * the wire with 94% of what they should be, and only
 * the last packet is delayed extra to make up for the
 * difference.
 *
 * Note that the above formula has two important caveats.
 * If we are (b/w wise) above 100Mbps we double the result
 * of the MSS calculation. The second caveat is if we are 500Mbps
 * or more we just send the maximum MSS at once, i.e. 45MSS. At
 * the higher b/w's even the cards have limits to what times (timer granularity)
 * they can insert between packets and start to send more than one
 * packet at a time on the wire.
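 *
 * Checking that example's arithmetic: a full frame is 1514 bytes,
 * i.e. 12,112 bits. At 31Mbps the hardware gap is
 * 12,112 / 31,000,000 ~= 390 micro-seconds, and at 29,110,000 bps
 * the desired gap is 12,112 / 29,110,000 ~= 416 micro-seconds. The
 * hardware would have sent an 18th packet at 18 x 390 = 7,020
 * micro-seconds, while the software pacer refills at 17 x 416 =
 * 7,072 micro-seconds, which is where the 52 micro-second figure
 * comes from (and 390/416 is the 94% spacing noted above).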
 *
 */
#define COMMON_RATE 180500
const uint64_t desired_rates[] = {
	122500,			/* 1Mbps - rate 1 */
	180500,			/* 1.44Mbps - rate 2, the common rate */
	375000,			/* 3Mbps - rate 3 */
	625000,			/* 5Mbps - rate 4 */
	1250000,		/* 10Mbps - rate 5 */
	1875000,		/* 15Mbps - rate 6 */
	2500000,		/* 20Mbps - rate 7 */
	3125000,		/* 25Mbps - rate 8 */
	3750000,		/* 30Mbps - rate 9 */
	4375000,		/* 35Mbps - rate 10 */
	5000000,		/* 40Mbps - rate 11 */
	6250000,		/* 50Mbps - rate 12 */
	12500000,		/* 100Mbps - rate 13 */
	25000000,		/* 200Mbps - rate 14 */
	50000000,		/* 400Mbps - rate 15 */
	100000000,		/* 800Mbps - rate 16 */
	5625000,		/* 45Mbps - rate 17 */
	6875000,		/* 55Mbps - rate 18 */
	7500000,		/* 60Mbps - rate 19 */
	8125000,		/* 65Mbps - rate 20 */
	8750000,		/* 70Mbps - rate 21 */
	9375000,		/* 75Mbps - rate 22 */
	10000000,		/* 80Mbps - rate 23 */
	10625000,		/* 85Mbps - rate 24 */
	11250000,		/* 90Mbps - rate 25 */
	11875000,		/* 95Mbps - rate 26 */
	12500000,		/* 100Mbps - rate 27 */
	13750000,		/* 110Mbps - rate 28 */
	15000000,		/* 120Mbps - rate 29 */
	16250000,		/* 130Mbps - rate 30 */
	17500000,		/* 140Mbps - rate 31 */
	18750000,		/* 150Mbps - rate 32 */
	20000000,		/* 160Mbps - rate 33 */
	21250000,		/* 170Mbps - rate 34 */
	22500000,		/* 180Mbps - rate 35 */
	23750000,		/* 190Mbps - rate 36 */
	26250000,		/* 210Mbps - rate 37 */
	27500000,		/* 220Mbps - rate 38 */
	28750000,		/* 230Mbps - rate 39 */
	30000000,		/* 240Mbps - rate 40 */
	31250000,		/* 250Mbps - rate 41 */
	34375000,		/* 275Mbps - rate 42 */
	37500000,		/* 300Mbps - rate 43 */
	40625000,		/* 325Mbps - rate 44 */
	43750000,		/* 350Mbps - rate 45 */
	46875000,		/* 375Mbps - rate 46 */
	53125000,		/* 425Mbps - rate 47 */
	56250000,		/* 450Mbps - rate 48 */
	59375000,		/* 475Mbps - rate 49 */
	62500000,		/* 500Mbps - rate 50 */
	68750000,		/* 550Mbps - rate 51 */
	75000000,		/* 600Mbps - rate 52 */
	81250000,		/* 650Mbps - rate 53 */
	87500000,		/* 700Mbps - rate 54 */
	93750000,		/* 750Mbps - rate 55 */
	106250000,		/* 850Mbps - rate 56 */
	112500000,		/* 900Mbps - rate 57 */
	125000000,		/* 1Gbps - rate 58 */
	156250000,		/* 1.25Gbps - rate 59 */
	187500000,		/* 1.5Gbps - rate 60 */
	218750000,		/* 1.75Gbps - rate 61 */
	250000000,		/* 2Gbps - rate 62 */
	281250000,		/* 2.25Gbps - rate 63 */
	312500000,		/* 2.5Gbps - rate 64 */
	343750000,		/* 2.75Gbps - rate 65 */
	375000000,		/* 3Gbps - rate 66 */
	500000000,		/* 4Gbps - rate 67 */
	625000000,		/* 5Gbps - rate 68 */
	750000000,		/* 6Gbps - rate 69 */
	875000000,		/* 7Gbps - rate 70 */
	1000000000,		/* 8Gbps - rate 71 */
	1125000000,		/* 9Gbps - rate 72 */
	1250000000,		/* 10Gbps - rate 73 */
	1875000000,		/* 15Gbps - rate 74 */
	2500000000		/* 20Gbps - rate 75 */
};

#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number of rates that are in order
				 * at the beginning of the table;
				 * above this a sort is required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The point in our table where
				 * we fill in the second ordered
				 * group (index-wise this means -1).
				 */
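/*
 * Layout note (illustrative): indices 0-15 of desired_rates[] form
 * the first ordered group (1Mbps up through 800Mbps), and indices 16
 * through the end form a second ordered group (45Mbps up through
 * 20Gbps). When an adapter can accept more than RS_ORDERED_COUNT
 * rates, populate_canned_table() below merges the two groups into
 * one sorted table.
 */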
#define ALL_HARDWARE_RATES 1004	/*
				 * 1Meg - 1Gig in 1 Meg steps
				 * plus 100k, 200k and 500k and
				 * 10Gig
				 */

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000

static struct head_tcp_rate_set int_rs;
static struct mtx rs_mtx;
uint32_t rs_number_alive;
uint32_t rs_number_dead;
static uint32_t rs_floor_mss = 0;
static uint32_t wait_time_floor = 8000;	/* 8 ms */
static uint32_t rs_hw_floor_mss = 16;
static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
    &rs_floor_mss, 0,
    "Number of MSS that will override the normal minimums (0 means don't enforce)");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
    &wait_time_floor, 2000,
    "As b/w increases, what is the wait floor we are willing to wait at the end?");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
    &num_of_waits_allowed, 1,
    "How many time blocks on the end should software pacing be willing to wait?");

SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
    &rs_hw_floor_mss, 16,
    "Number of MSS that is the minimum for hardware pacing");

static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
	/*
	 * Add sysctl entries for this interface.
	 */
	if (rs->rs_flags & RS_INTF_NO_SUP) {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RD,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	} else {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RW,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	}
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "minseg", CTLFLAG_RW,
	    &rs->rs_min_seg, 0,
	    "What is the minimum we need to send on this interface?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flow_limit", CTLFLAG_RW,
	    &rs->rs_flow_limit, 0,
	    "What is the limit for number of flows (0=unlimited)?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "highest", CTLFLAG_RD,
	    &rs->rs_highest_valid, 0,
	    "Highest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "lowest", CTLFLAG_RD,
	    &rs->rs_lowest_valid, 0,
	    "Lowest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flags", CTLFLAG_RD,
	    &rs->rs_flags, 0,
	    "What flags are on the entry?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "numrates", CTLFLAG_RD,
	    &rs->rs_rate_cnt, 0,
	    "How many rates are there?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flows_using", CTLFLAG_RD,
	    &rs->rs_flows_using, 0,
	    "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
		/* Let's display the rates */
		int i;
		struct sysctl_oid *rl_rates;
		struct sysctl_oid *rl_rate_num;
		char rate_num[16];

		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO,
		    "rate",
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "Ratelist");
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			sprintf(rate_num, "%d", i);
			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rates),
			    OID_AUTO,
			    rate_num,
			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
			    "Individual Rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "flags", CTLFLAG_RD,
			    &rs->rs_rlt[i].flags, 0,
			    "Flags on this rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "pacetime", CTLFLAG_RD,
			    &rs->rs_rlt[i].time_between, 0,
			    "Time hardware inserts between 1500 byte sends");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "rate", CTLFLAG_RD,
			    &rs->rs_rlt[i].rate,
			    "Rate in bytes per second");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "using", CTLFLAG_RD,
			    &rs->rs_rlt[i].using,
			    "Number of flows using");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "enobufs", CTLFLAG_RD,
			    &rs->rs_rlt[i].rs_num_enobufs,
			    "Number of enobufs logged on this rate");
		}
	}
#endif
}

static void
rs_destroy(epoch_context_t ctx)
{
	struct tcp_rate_set *rs;
	bool do_free_rs;

	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);

	mtx_lock(&rs_mtx);
	rs->rs_flags &= ~RS_FUNERAL_SCHD;
	/*
	 * In theory it's possible (but unlikely)
	 * that while the
	 * delete was occurring
	 * and we were applying the DEAD flag
	 * someone slipped in and found the
	 * interface in a lookup. While we
	 * were deciding rs_flows_using was 0 and
	 * scheduling the epoch_call, the other
	 * thread incremented rs_flows_using. This
	 * is because users have a pointer and
	 * we only use rs_flows_using in an
	 * atomic fashion, i.e. the other entities
	 * are not protected. To ensure this did
	 * not occur, we check rs_flows_using here
	 * before deleting.
	 */
	do_free_rs = (rs->rs_flows_using == 0);
	rs_number_dead--;
	mtx_unlock(&rs_mtx);

	if (do_free_rs) {
		sysctl_ctx_free(&rs->sysctl_ctx);
		free(rs->rs_rlt, M_TCPPACE);
		free(rs, M_TCPPACE);
	}
}

static void
rs_defer_destroy(struct tcp_rate_set *rs)
{

	mtx_assert(&rs_mtx, MA_OWNED);

	/* Check if already pending. */
	if (rs->rs_flags & RS_FUNERAL_SCHD)
		return;

	rs_number_dead++;

	/* Set flag to only defer once. */
	rs->rs_flags |= RS_FUNERAL_SCHD;
	NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
}

#ifdef INET
extern counter_u64_t rate_limit_new;
extern counter_u64_t rate_limit_chg;
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
	int error;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = cfg_rate,
		.rate_limit.flags = M_NOWAIT,
	};

	error = m_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
	/*
	 * The internal table is "special", it
	 * is two separate ordered tables that
	 * must be merged. We get here when the
	 * adapter specifies a number of rates that
	 * covers both ranges in the table in some
	 * form.
	 */
	int i, at_low, at_high;
	uint8_t low_disabled = 0, high_disabled = 0;

	for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
		rs->rs_rlt[i].flags = 0;
		rs->rs_rlt[i].time_between = 0;
		if ((low_disabled == 0) &&
		    (high_disabled ||
		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
			rs->rs_rlt[i].rate = rate_table_act[at_low];
			at_low++;
			if (at_low == RS_NEXT_ORDER_GROUP)
				low_disabled = 1;
		} else if (high_disabled == 0) {
			rs->rs_rlt[i].rate = rate_table_act[at_high];
			at_high++;
			if (at_high == MAX_HDWR_RATES)
				high_disabled = 1;
		}
	}
}
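/*
 * For example (illustrative): with both groups of desired_rates[]
 * in play, the merge above takes ... 35Mbps and 40Mbps from the
 * first group, 45Mbps from the second, 50Mbps from the first,
 * 55Mbps through 95Mbps from the second, and so on, producing one
 * sorted table.
 */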
static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
	struct tcp_rate_set *rs;
	const uint64_t *rate_table_act;
	uint64_t lentim, res;
	size_t sz;
	uint32_t hash_type;
	int i;
	struct if_ratelimit_query_results rl;
	struct sysctl_oid *rl_sysctl_root;
	struct epoch_tracker et;
	/*
	 * We expect to enter with the
	 * mutex locked.
	 */

	if (ifp->if_ratelimit_query == NULL) {
		/*
		 * We can do nothing if we cannot
		 * get a query back from the driver.
		 */
		printf("Warning:No query functions for %s:%d-- failed\n",
		    ifp->if_dname, ifp->if_dunit);
		return (NULL);
	}
	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
	if (rs == NULL) {
		if (error)
			*error = ENOMEM;
		printf("Warning:No memory for malloc of tcp_rate_set\n");
		return (NULL);
	}
	memset(&rl, 0, sizeof(rl));
	rl.flags = RT_NOSUPPORT;
	ifp->if_ratelimit_query(ifp, &rl);
	if (rl.flags & RT_IS_UNUSABLE) {
		/*
		 * The interface does not really support
		 * rate-limiting.
		 */
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_INTF_NO_SUP;
		rs->rs_disable = 1;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_IS_DEFF;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
		/* Mellanox C4 likely */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_highest_valid = 0;
		rs->rs_flow_limit = rl.max_flows;
		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
		rs->rs_disable = 0;
		rate_table_act = rl.rate_table;
	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
		/* Chelsio, C5 and C6 of Mellanox? */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_disable = 0;
		rs->rs_flow_limit = rl.max_flows;
		rate_table_act = desired_rates;
		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
			/*
			 * Our desired table is not big
			 * enough, do what we can.
			 */
			rs->rs_rate_cnt = MAX_HDWR_RATES;
		}
		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
			rs->rs_flags = RS_IS_INTF;
		else
			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
	} else {
		free(rs, M_TCPPACE);
		return (NULL);
	}
	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
	if (rs->rs_rlt == NULL) {
		if (error)
			*error = ENOMEM;
bail:
		free(rs, M_TCPPACE);
		return (NULL);
	}
	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
		/*
		 * The interface supports all
		 * the rates we could possibly want.
		 */
		uint64_t rat;

		rs->rs_rlt[0].rate = 12500;	/* 100k */
		rs->rs_rlt[1].rate = 25000;	/* 200k */
		rs->rs_rlt[2].rate = 62500;	/* 500k */
		/*
		 * Note 125000 == 1Megabit.
		 * Populate 1Meg - 1000Meg.
		 */
		for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
			rs->rs_rlt[i].rate = rat;
			rat += 125000;
		}
		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
	} else if (rs->rs_flags & RS_INT_TBL) {
		/* We populate this in a special way */
		populate_canned_table(rs, rate_table_act);
	} else {
		/*
		 * Just copy in the rates from
		 * the table, it is in order.
		 */
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			rs->rs_rlt[i].rate = rate_table_act[i];
			rs->rs_rlt[i].time_between = 0;
			rs->rs_rlt[i].flags = 0;
		}
	}
	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
		/*
		 * We go backwards through the list so that if we can't get
		 * a rate and fail to init one, we have at least a chance of
		 * getting the highest one.
		 */
		rs->rs_rlt[i].ptbl = rs;
		rs->rs_rlt[i].tag = NULL;
		rs->rs_rlt[i].using = 0;
		rs->rs_rlt[i].rs_num_enobufs = 0;
		/*
		 * Calculate the time between.
		 */
		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
		res = lentim / rs->rs_rlt[i].rate;
		if (res > 0)
			rs->rs_rlt[i].time_between = res;
		else
			rs->rs_rlt[i].time_between = 1;
		if (rs->rs_flags & RS_NO_PRE) {
			rs->rs_rlt[i].flags = HDWRPACE_INITED;
			rs->rs_lowest_valid = i;
		} else {
			int err;

			if ((rl.flags & RT_IS_SETUP_REQ) &&
			    (ifp->if_ratelimit_query)) {
				err = ifp->if_ratelimit_setup(ifp,
				    rs->rs_rlt[i].rate, i);
				if (err)
					goto handle_err;
			}
#ifdef RSS
			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
			hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
			err = rl_attach_txrtlmt(ifp,
			    hash_type,
			    (i + 1),
			    rs->rs_rlt[i].rate,
			    &rs->rs_rlt[i].tag);
			if (err) {
handle_err:
				if (i == (rs->rs_rate_cnt - 1)) {
					/*
					 * Huh - first rate and we can't get
					 * it?
					 */
					free(rs->rs_rlt, M_TCPPACE);
					if (error)
						*error = err;
					goto bail;
				} else {
					if (error)
						*error = err;
				}
				break;
			} else {
				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
				rs->rs_lowest_valid = i;
			}
		}
	}
	/* Did we get at least 1 rate?
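	 * The loop above walked from the highest index down, so the
	 * highest entry was the first one tried; if even it lacks
	 * HDWRPACE_INITED then nothing was set up.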
	 */
	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
	else {
		free(rs->rs_rlt, M_TCPPACE);
		goto bail;
	}
	rs_number_alive++;
	sysctl_ctx_init(&rs->sysctl_ctx);
	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
	    OID_AUTO,
	    rs->rs_ifp->if_xname,
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "");
	rl_add_syctl_entries(rl_sysctl_root, rs);
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	return (rs);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
	int i;

	mbits_per_sec = (bytes_per_sec * 8);	/* Note: despite the name, this is bits per second. */
	if (flags & RS_PACING_LT) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			/*
			 * Smaller than 1Meg, only
			 * 3 entries can match it.
			 */
			previous_rate = 0;
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
					arte = &rs->rs_rlt[i];
				}
				previous_rate = rs->rs_rlt[i].rate;
			}
			goto done;
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/*
			 * Larger than 1G (the majority of
			 * our table).
			 */
			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			else
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
			goto done;
		}
		/*
		 * If we reach here it's in our table (between 1Meg and
		 * 1000Meg): take the rounded-down mbits per second (adding
		 * one megabit if it was not exact), and from this we can
		 * calculate the index in the table.
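		 *
		 * For example (illustrative): a request equivalent to
		 * 24.5Mbps gives ind_calc = 24, which is not exact so it
		 * becomes 25; adding 2 yields index 27. Since index 3 holds
		 * the 1Mbps rate (index i holds (i - 2) Mbps), index 27
		 * holds the 25Mbps rate.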
		 */
		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
			ind_calc++;
		/* our table is offset by 3, we add 2 */
		ind_calc += 2;
		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
			/* This should not happen */
			ind_calc = ALL_HARDWARE_RATES-1;
		}
		if ((ind_calc >= rs->rs_lowest_valid) &&
		    (ind_calc <= rs->rs_highest_valid)) {
			rte = &rs->rs_rlt[ind_calc];
			if (ind_calc >= 1)
				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
		}
	} else if (flags & RS_PACING_EXACT_MATCH) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec == rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				}
			}
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/* > 1Gbps only one rate */
			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
				/* It's 10G, wow */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* Ok it must be an exact meg (it's between 1Meg and 1Gig) */
			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
				/* It's an exact Mbps */
				ind_calc += 2;
				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
					/* This should not happen */
					ind_calc = ALL_HARDWARE_RATES-1;
				}
				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
					rte = &rs->rs_rlt[ind_calc];
			}
		}
	} else {
		/* we want greater than the requested rate */
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
			for (i = 2; i >= rs->rs_lowest_valid; i--) {
				if (bytes_per_sec < rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else if ((flags & RS_PACING_GEQ) &&
				    (bytes_per_sec == rs->rs_rlt[i].rate)) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else {
					arte = &rs->rs_rlt[i]; /* new alternate */
				}
			}
		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* Our top rate is larger than the request */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* It matches our top rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
				/* The top rate is an alternative */
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
		} else {
			/* It's in our range 1Meg - 1Gig */
			if (flags & RS_PACING_GEQ) {
				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
						/* This should not happen */
						ind_calc = (ALL_HARDWARE_RATES-1);
					}
					rte = &rs->rs_rlt[ind_calc];
					if (ind_calc >= 1)
						previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
				}
				goto done;
			}
			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
			ind_calc += 2;
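			/*
			 * ind_calc now points at the first whole-Mbps
			 * entry at or above the request.
			 */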
			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
				/* This should not happen */
				ind_calc = ALL_HARDWARE_RATES-1;
			}
			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
				rte = &rs->rs_rlt[ind_calc];
				if (ind_calc >= 1)
					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
			}
		}
	}
done:
	if ((rte == NULL) &&
	    (arte != NULL) &&
	    (flags & RS_PACING_SUB_OK)) {
		/* We can use the substitute */
		rte = arte;
	}
	if (lower_rate)
		*lower_rate = previous_rate;
	return (rte);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	/**
	 * Hunt the rate table with the restrictions in flags and find a
	 * suitable rate if possible.
	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
	 * RS_PACING_GT     - must be greater than.
	 * RS_PACING_GEQ    - must be greater than or equal.
	 * RS_PACING_LT     - must be less than.
	 * RS_PACING_SUB_OK - If we don't meet criteria a
	 *                    substitute is ok.
	 */
	int i, matched;
	struct tcp_hwrate_limit_table *rte = NULL;
	uint64_t previous_rate = 0;

	if ((rs->rs_flags & RS_INT_TBL) &&
	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
		/*
		 * Here we don't want to paw through
		 * a big table, we have everything
		 * from 1Meg - 1000Meg in 1Meg increments.
		 * Use an alternate method to "lookup".
		 */
		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
	}
	if ((flags & RS_PACING_LT) ||
	    (flags & RS_PACING_EXACT_MATCH)) {
		/*
		 * For exact and less than we go forward through the table.
		 * This way when we find one larger we stop (exact was a
		 * toss up).
		 */
		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
			if ((flags & RS_PACING_EXACT_MATCH) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			} else if ((flags & RS_PACING_LT) &&
			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			}
			previous_rate = rs->rs_rlt[i].rate;
			if (bytes_per_sec > rs->rs_rlt[i].rate)
				break;
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_LT) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the lowest) */
			rte = &rs->rs_rlt[rs->rs_lowest_valid];
		}
	} else {
		/*
		 * Here we go backward through the table so that we can find
		 * the one greater in theory faster (but it's probably a
		 * wash).
		 */
		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
			if (rs->rs_rlt[i].rate > bytes_per_sec) {
				/* A possible candidate */
				rte = &rs->rs_rlt[i];
			}
			if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				/* An exact match and we want equal */
				matched = 1;
				rte = &rs->rs_rlt[i];
				break;
			} else if (rte) {
				/*
				 * Found one that is larger, but don't
				 * stop; there may be a closer match.
				 */
				matched = 1;
			}
			if (rs->rs_rlt[i].rate < bytes_per_sec) {
				/*
				 * We found a table entry that is smaller;
				 * stop, there will be none greater or equal.
				 */
				if (lower_rate != NULL)
					*lower_rate = rs->rs_rlt[i].rate;
				break;
			}
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the highest) */
			rte = &rs->rs_rlt[rs->rs_highest_valid];
		}
	}
	return (rte);
}

static struct ifnet *
rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
{
	struct ifnet *tifp;
	struct m_snd_tag *tag, *ntag;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = inp->inp_flowid,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = COMMON_RATE,
		.rate_limit.flags = M_NOWAIT,
	};
	int err;
#ifdef RSS
	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
#else
	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
#endif
	err = m_snd_tag_alloc(ifp, &params, &tag);
	if (err) {
		/* Failed to setup a tag? */
		if (error)
			*error = err;
		return (NULL);
	}
	ntag = tag;
	while (ntag->sw->next_snd_tag != NULL) {
		ntag = ntag->sw->next_snd_tag(ntag);
	}
	tifp = ntag->ifp;
	m_snd_tag_rele(tag);
	return (tifp);
}

static void
rl_increment_using(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_add_long(&decon_rte->using, 1);
}

static void
rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_subtract_long(&decon_rte->using, 1);
}

void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_add_long(&decon_rte->rs_num_enobufs, 1);
}

/*
 * Do NOT take the __noinline out of the
 * find_rs_for_ifp() function. If you inline
 * it, the compiler bug will show up in
 * rt_setup_rate(). For some reason the compiler thinks
 * the list can never be empty. The consequence of
 * this will be a crash when we dereference NULL
 * if an ifp is removed just as a hw rate limit
 * is attempted. If you are working on the compiler
 * and want to "test" this go ahead and take the noinline
 * out, otherwise let sleeping dogs lie until such time
 * as we get a compiler fix 10/2/20 -- RRS
 */
static __noinline struct tcp_rate_set *
find_rs_for_ifp(struct ifnet *ifp)
{
	struct tcp_rate_set *rs;

	CK_LIST_FOREACH(rs, &int_rs, next) {
		if ((rs->rs_ifp == ifp) &&
		    (rs->rs_if_dunit == ifp->if_dunit)) {
			/* Ok we found it */
			return (rs);
		}
	}
	return (NULL);
}
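/*
 * rt_setup_rate() ties the pieces above together: it resolves the
 * rate set for the interface (chasing lagg/deferred interfaces via
 * rt_find_real_interface()), finds a suitable rate entry and attaches
 * a send tag to the inpcb.
 */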
static const struct tcp_hwrate_limit_table *
rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
    uint32_t flags, int *error, uint64_t *lower_rate)
{
	/* First lets find the interface if it exists */
	const struct tcp_hwrate_limit_table *rte;
	/*
	 * So why is rs volatile? This is to defeat a
	 * compiler bug wherein the compiler is convinced
	 * that rs can never be NULL (which is not true). Because
	 * of its conviction it nicely optimizes out the
	 * (rs == NULL) check below, which means that if you
	 * get a NULL back you dereference it.
	 */
	volatile struct tcp_rate_set *rs;
	struct epoch_tracker et;
	struct ifnet *oifp = ifp;
	int err;

	NET_EPOCH_ENTER(et);
use_real_interface:
	rs = find_rs_for_ifp(ifp);
	if ((rs == NULL) ||
	    (rs->rs_flags & RS_INTF_NO_SUP) ||
	    (rs->rs_flags & RS_IS_DEAD)) {
		/*
		 * This means we got a packet *before*
		 * the IF-UP was processed below, <or>
		 * while or after we already received an interface
		 * departed event. In either case we really don't
		 * want to do anything with pacing; in
		 * the departing case the packet is not
		 * going to go very far. The new case
		 * might be arguable, but it's impossible
		 * to tell from the departing case.
		 */
		if (error)
			*error = ENODEV;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}

	if ((rs == NULL) || (rs->rs_disable != 0)) {
		if (error)
			*error = ENOSPC;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}
	if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, error);
		if (tifp == NULL) {
			if (rs->rs_disable && error)
				*error = ENOTSUP;
			NET_EPOCH_EXIT(et);
			return (NULL);
		}
		KASSERT((tifp != ifp),
		    ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
		     ifp, inp, tifp));
		ifp = tifp;
		goto use_real_interface;
	}
	if (rs->rs_flow_limit &&
	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
		if (error)
			*error = ENOSPC;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}
	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
	if (rte) {
		err = in_pcbattach_txrtlmt(inp, oifp,
		    inp->inp_flowtype,
		    inp->inp_flowid,
		    rte->rate,
		    &inp->inp_snd_tag);
		if (err) {
			/* Failed to attach */
			if (error)
				*error = err;
			rte = NULL;
		} else {
			KASSERT((inp->inp_snd_tag != NULL),
			    ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
			     inp, rte, (unsigned long long)rte->rate, rs));
#ifdef INET
			counter_u64_add(rate_limit_new, 1);
#endif
		}
	}
	if (rte) {
		/*
		 * We use an atomic here for accounting so we don't have to
		 * use locks when freeing.
		 */
		atomic_add_64(&rs->rs_flows_using, 1);
	}
	NET_EPOCH_EXIT(et);
	return (rte);
}

static void
tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
{
	int error;
	struct tcp_rate_set *rs;
	struct epoch_tracker et;

	if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
	    (link_state != LINK_STATE_UP)) {
		/*
		 * We only care about a rate-limit capable
		 * interface going up.
		 */
		return;
	}
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	rs = find_rs_for_ifp(ifp);
	if (rs) {
		/* We already have initialized this guy */
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return;
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	rt_setup_new_rs(ifp, &error);
}

static void
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
	struct tcp_rate_set *rs;
	struct epoch_tracker et;
	int i;

	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	rs = find_rs_for_ifp(ifp);
	if (rs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				in_pcbdetach_tag(rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0)
			rs_defer_destroy(rs);
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
}

static void
tcp_rl_shutdown(void *arg __unused, int howto __unused)
{
	struct tcp_rate_set *rs, *nrs;
	struct epoch_tracker et;
	int i;

	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				in_pcbdetach_tag(rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0)
			rs_defer_destroy(rs);
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
}

const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	const struct tcp_hwrate_limit_table *rte;
#ifdef KERN_TLS
	struct ktls_session *tls;
#endif

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (tp->t_inpcb->inp_snd_tag == NULL) {
		/*
		 * We are setting up a rate for the first time.
		 */
		if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
			/* Not supported by the egress */
			if (error)
				*error = ENODEV;
			return (NULL);
		}
#ifdef KERN_TLS
		tls = NULL;
		if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
			tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;

			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
			    tls->mode != TCP_TLS_MODE_IFNET) {
				if (error)
					*error = ENODEV;
				return (NULL);
			}
		}
#endif
		rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error, lower_rate);
		if (rte)
			rl_increment_using(rte);
#ifdef KERN_TLS
		if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
			/*
			 * Fake a route change error to reset the TLS
			 * send tag. This will convert the existing
			 * tag to a TLS ratelimit tag.
			 */
			MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
			ktls_output_eagain(tp->t_inpcb, tls);
		}
#endif
	} else {
		/*
		 * We are modifying a rate, wrong interface?
		 */
		if (error)
			*error = EINVAL;
		rte = NULL;
	}
	if (rte != NULL) {
		tp->t_pacing_rate = rte->rate;
		*error = 0;
	}
	return (rte);
}

const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	const struct tcp_hwrate_limit_table *nrte;
	const struct tcp_rate_set *rs;
#ifdef KERN_TLS
	struct ktls_session *tls = NULL;
#endif
	int err;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (crte == NULL) {
		/* Wrong interface */
		if (error)
			*error = EINVAL;
		return (NULL);
	}

#ifdef KERN_TLS
	if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
		tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
		if (tls->mode != TCP_TLS_MODE_IFNET)
			tls = NULL;
		else if (tls->snd_tag != NULL &&
		    tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
			if (!tls->reset_pending) {
				/*
				 * NIC probably doesn't support
				 * ratelimit TLS tags if it didn't
				 * allocate one when an existing rate
				 * was present, so ignore.
				 */
				tcp_rel_pacing_rate(crte, tp);
				if (error)
					*error = EOPNOTSUPP;
				return (NULL);
			}

			/*
			 * The send tag is being converted, so set the
			 * rate limit on the inpcb tag. There is a
			 * race that the new NIC send tag might use
			 * the current rate instead of this one.
			 */
			tls = NULL;
		}
	}
#endif
	if (tp->t_inpcb->inp_snd_tag == NULL) {
		/* Wrong interface */
		tcp_rel_pacing_rate(crte, tp);
		if (error)
			*error = EINVAL;
		return (NULL);
	}
	rs = crte->ptbl;
	if ((rs->rs_flags & RS_IS_DEAD) ||
	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
		/* Release the rate, and try anew */

		tcp_rel_pacing_rate(crte, tp);
		nrte = tcp_set_pacing_rate(tp, ifp,
		    bytes_per_sec, flags, error, lower_rate);
		return (nrte);
	}
	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
	if (nrte == crte) {
		/* No change */
		if (error)
			*error = 0;
		return (crte);
	}
	if (nrte == NULL) {
		/* Release the old rate */
		if (error)
			*error = ENOENT;
		tcp_rel_pacing_rate(crte, tp);
		return (NULL);
	}
	rl_decrement_using(crte);
	rl_increment_using(nrte);
	/* Change rates to our new entry */
#ifdef KERN_TLS
	if (tls != NULL)
		err = ktls_modify_txrtlmt(tls, nrte->rate);
	else
#endif
		err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
	if (err) {
		struct tcp_rate_set *lrs;
		uint64_t pre;

		rl_decrement_using(nrte);
		lrs = __DECONST(struct tcp_rate_set *, rs);
		pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
		/* Do we still have a snd-tag attached? */
		if (tp->t_inpcb->inp_snd_tag)
			in_pcbdetach_txrtlmt(tp->t_inpcb);

		if (pre == 1) {
			struct epoch_tracker et;

			NET_EPOCH_ENTER(et);
			mtx_lock(&rs_mtx);
			/*
			 * Is it dead?
			 */
			if (lrs->rs_flags & RS_IS_DEAD)
				rs_defer_destroy(lrs);
			mtx_unlock(&rs_mtx);
			NET_EPOCH_EXIT(et);
		}
		if (error)
			*error = err;
		return (NULL);
	} else {
#ifdef INET
		counter_u64_add(rate_limit_chg, 1);
#endif
	}
	if (error)
		*error = 0;
	tp->t_pacing_rate = nrte->rate;
	return (nrte);
}

void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
{
	const struct tcp_rate_set *crs;
	struct tcp_rate_set *rs;
	uint64_t pre;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	tp->t_pacing_rate = -1;
	crs = crte->ptbl;
	/*
	 * Now we must break the const
	 * in order to release our refcount.
	 */
	rs = __DECONST(struct tcp_rate_set *, crs);
	rl_decrement_using(crte);
	pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
	if (pre == 1) {
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		/*
		 * Is it dead?
		 */
		if (rs->rs_flags & RS_IS_DEAD)
			rs_defer_destroy(rs);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
	}

	/*
	 * XXX: If this connection is using ifnet TLS, should we
	 * switch it to using an unlimited rate, or perhaps use
	 * ktls_output_eagain() to reset the send tag to a plain
	 * TLS tag?
	 */
	in_pcbdetach_txrtlmt(tp->t_inpcb);
}

#define ONE_POINT_TWO_MEG 150000	/* 1.2 megabits in bytes per second */
#define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
#define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
#define MAX_MSS_SENT 43			/* 43 mss = 43 x 1500 = 64,500 bytes */

static void
tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
    uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
    uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = segsiz;
		log.u_bbr.flex2 = new_tso;
		log.u_bbr.flex3 = time_between;
		log.u_bbr.flex4 = calc_time_between;
		log.u_bbr.flex5 = segs;
		log.u_bbr.flex6 = res_div;
		log.u_bbr.flex7 = mult;
		log.u_bbr.flex8 = mod;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.cur_del_rate = bw;
		log.u_bbr.delRate = hw_rate;
		TCP_LOG_EVENTP(tp, NULL,
		    &tp->t_inpcb->inp_socket->so_rcv,
		    &tp->t_inpcb->inp_socket->so_snd,
		    TCP_HDWR_PACE_SIZE, 0,
		    0, &log, false, &tv);
	}
}

uint32_t
tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err)
{
	/*
	 * We use the Google formula to calculate the
	 * TSO size, i.e.:
	 * bw < 24Meg
	 *   tso = 2mss
	 * else
	 *   tso = min(bw/1000, 64k)
	 *
	 * Note for these calculations we ignore the
	 * packet overhead (enet hdr, ip hdr and tcp hdr).
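	 *
	 * Worked example (illustrative, assuming a 1448 byte MSS): at
	 * bw = 100Mbps = 12,500,000 bytes/sec, bw/1000 is 12,500 bytes,
	 * which rounds up to 9 segments. At 1Gbps the 64k clamp applies
	 * and ceil(64,000/1448) = 45 segments, which the MAX_MSS_SENT
	 * clamp below cuts to 43.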
	 */
	uint64_t lentim, res, bytes;
	uint32_t new_tso, min_tso_segs;

	bytes = bw / 1000;
	if (bytes > (64 * 1000))
		bytes = 64 * 1000;
	/* Round up */
	new_tso = (bytes + segsiz - 1) / segsiz;
	if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
		min_tso_segs = 1;
	else
		min_tso_segs = 2;
	if (rs_floor_mss && (new_tso < rs_floor_mss))
		new_tso = rs_floor_mss;
	else if (new_tso < min_tso_segs)
		new_tso = min_tso_segs;
	if (new_tso > MAX_MSS_SENT)
		new_tso = MAX_MSS_SENT;
	new_tso *= segsiz;
	tcp_log_pacing_size(tp, bw, segsiz, new_tso,
	    0, 0, 0, 0, 0, 0, 1);
	/*
	 * If we are not doing hardware pacing
	 * then we are done.
	 */
	if (te == NULL) {
		if (err)
			*err = 0;
		return (new_tso);
	}
	/*
	 * For hardware pacing we look at the
	 * rate you are sending at and compare
	 * that to the rate you have in hardware.
	 *
	 * If the hardware rate is slower than your
	 * software rate then you are in error and
	 * we will build a queue in our hardware which
	 * is probably not desired, in such a case
	 * just return the non-hardware TSO size.
	 *
	 * If the rate in hardware is faster (which
	 * it should be) then look at how long it
	 * takes to send one ethernet segment size at
	 * your b/w and compare that to the time it
	 * takes to send at the rate you had selected.
	 *
	 * If your time is greater (which we hope it is)
	 * we get the delta between the two, and then
	 * divide that into your pacing time. This tells
	 * us how many MSS you can send down at once (rounded up).
	 *
	 * Note we also double this value if the b/w is over
	 * 100Mbps. If it's over 500Mbps we just set you to the
	 * max (43 segments).
	 */
	if (te->rate > FIVE_HUNDRED_MBPS)
		goto max;
	if (te->rate == bw) {
		/* We are pacing at exactly the hdwr rate */
max:
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)0,
		    (segsiz * MAX_MSS_SENT), 0, 0, 3);
		return (segsiz * MAX_MSS_SENT);
	}
	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
	res = lentim / bw;
	if (res > te->time_between) {
		uint32_t delta, segs, res_div;

		res_div = ((res * num_of_waits_allowed) + wait_time_floor);
		delta = res - te->time_between;
		segs = (res_div + delta - 1)/delta;
		if (segs < min_tso_segs)
			segs = min_tso_segs;
		if (segs < rs_hw_floor_mss)
			segs = rs_hw_floor_mss;
		if (segs > MAX_MSS_SENT)
			segs = MAX_MSS_SENT;
		segs *= segsiz;
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)res,
		    segs, res_div, 1, 3);
		if (err)
			*err = 0;
		if (segs < new_tso) {
			/* unexpected ? */
			return (new_tso);
		} else {
			return (segs);
		}
	} else {
		/*
		 * Your time is smaller, which means
		 * we will grow a queue on our
		 * hardware. Send back the non-hardware
		 * rate.
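		 *
		 * (For the good case above, a worked example with the
		 * defaults wait_time_floor = 8000 and num_of_waits_allowed
		 * = 1: res = 416 and time_between = 390 give delta = 26
		 * and res_div = 8,416, so segs = ceil(8,416/26) = 324,
		 * which the MAX_MSS_SENT clamp cuts to 43.)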
		 */
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)res,
		    0, 0, 0, 4);
		if (err)
			*err = -1;
		return (new_tso);
	}
}

uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
{
	struct epoch_tracker et;
	struct tcp_rate_set *rs;
	uint64_t rate_ret;

	NET_EPOCH_ENTER(et);
use_next_interface:
	rs = find_rs_for_ifp(ifp);
	if (rs == NULL) {
		/* This interface does not do ratelimiting */
		rate_ret = 0;
	} else if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, NULL);
		if (tifp == NULL) {
			NET_EPOCH_EXIT(et);
			return (0);
		}
		ifp = tifp;
		goto use_next_interface;
	} else {
		/* Let's return the highest rate this guy has */
		rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
	}
	NET_EPOCH_EXIT(et);
	return (rate_ret);
}

static eventhandler_tag rl_ifnet_departs;
static eventhandler_tag rl_ifnet_arrives;
static eventhandler_tag rl_shutdown_start;

static void
tcp_rs_init(void *st __unused)
{
	CK_LIST_INIT(&int_rs);
	rs_number_alive = 0;
	rs_number_dead = 0;
	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    tcp_rl_ifnet_departure,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
	    tcp_rl_ifnet_link,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    tcp_rl_shutdown, NULL,
	    SHUTDOWN_PRI_FIRST);
	printf("TCP_ratelimit: Is now initialized\n");
}

SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
#endif