/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2020
 * Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#define TCPSTATES		/* for logging */
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

/*
 * The following preferred table will seem weird to
 * the casual viewer. Why do we not have any rates below
 * 1Mbps? Why do we have a rate at 1.44Mbps called common?
 * Why do the rates cluster in the 1-100Mbps range more
 * than others? Why does the table jump around at the beginning
 * and then rise more consistently?
 *
 * Let me try to answer those questions. A lot of
 * this is dependent on the hardware. We have three basic
 * supporters of rate limiting:
 *
 * Chelsio - Supporting 16 configurable rates.
 * Mlx  - c4 supporting 13 fixed rates.
 * Mlx  - c5 & c6 supporting 127 configurable rates.
 *
 * The c4 is why we have a common rate that is available
 * in all rate tables. This is a selected rate from the
 * c4 table and we assure it is available in all ratelimit
 * tables. This way the tcp_ratelimit code has an assured
 * rate it should always be able to get. This answers a
 * couple of the questions above.
 *
 * So what about the rest? Well, the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system. The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons:
 * a) So we can discover more b/w
 * and
 * b) So we can send a block of MSS's down and then
 *    have the software timer go off after the previous
 *    send is completely out of the hardware.
 *
 * But when we do <b> we don't want the delay
 * between the last packet sent by the hardware and the
 * next send to be excessively long (to reach our desired rate).
 *
 * So let me give an example for clarity.
 *
 * Lets assume that the tcp stack sees that 29,110,000 bps is
 * what the bw of the path is. The stack would select the
 * rate 31Mbps. 31Mbps means that each send that is done
 * by the hardware will cause a 390 micro-second gap between
 * the packets sent at that rate. For 29,110,000 bps we
 * would need a 416 micro-second gap between each send.
 *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
 * My testing has shown that both cards are also using this
 * as their basis i.e. full payload size of the ethernet frame.
 * The TCP stack caller needs to be aware of this and make the
 * appropriate overhead calculations be included in its choices.
 *
 * Now, continuing our example, we pick a MSS size based on the
 * delta between the two rates (416 - 390) divided into the rate
 * we really wish to send at rounded up. That results in a MSS
 * send of 17 mss's at once. The hardware then will
 * run out of data in a single 17MSS send in 6,630 micro-seconds.
 *
 * On the other hand the software pacer will send more data
 * in 7,072 micro-seconds. This means that we will refill
 * the hardware 52 microseconds after it would have sent
 * next if it had not run out of data. This is a win since we are
 * only sending every 7ms or so and yet all the packets are spaced on
 * the wire with 94% of what they should be and only
 * the last packet is delayed extra to make up for the
 * difference.
 *
 * Note that the above formula has two important caveats.
 * If we are (b/w wise) over 100Mbps we double the result
 * of the MSS calculation. The second caveat is if we are 500Mbps
 * or more we just send the maximum MSS at once i.e. 45MSS. At
 * the higher b/w's even the cards have limits to what times (timer granularity)
 * they can insert between packets and start to send more than one
 * packet at a time on the wire.
 *
 */
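/*
 * To make the example above concrete, here is a compiled-out,
 * illustrative sketch of the arithmetic with the same numbers:
 * a full 1514 byte frame, a 31Mbps hardware rate (3,875,000
 * bytes/sec in table terms) and a desired software rate of
 * 29,110,000 bps (3,638,750 bytes/sec). The trailing "+ 1" is
 * what pushes the 16 even blocks up to the 17 mss the text
 * arrives at.
 */
#if 0
static uint64_t
example_gap_and_burst(void)
{
	uint64_t hw_rate = 3875000;		/* 31Mbps in bytes/sec */
	uint64_t sw_rate = 3638750;		/* 29.11Mbps in bytes/sec */
	uint64_t lentim = 1514ULL * 1000000;	/* frame size * usecs/sec */
	uint64_t hw_gap = lentim / hw_rate;	/* 390 usecs between sends */
	uint64_t sw_gap = lentim / sw_rate;	/* 416 usecs between sends */
	uint64_t delta = sw_gap - hw_gap;	/* 26 usecs */

	/* 416 / 26 = 16, plus one so the refill lands late: 17 mss. */
	return (sw_gap / delta + 1);
}
#endif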
#define COMMON_RATE 180500
const uint64_t desired_rates[] = {
	122500,			/* 1Mbps - rate 1 */
	180500,			/* 1.44Mbps - rate 2 common rate */
	375000,			/* 3Mbps - rate 3 */
	625000,			/* 5Mbps - rate 4 */
	1250000,		/* 10Mbps - rate 5 */
	1875000,		/* 15Mbps - rate 6 */
	2500000,		/* 20Mbps - rate 7 */
	3125000,		/* 25Mbps - rate 8 */
	3750000,		/* 30Mbps - rate 9 */
	4375000,		/* 35Mbps - rate 10 */
	5000000,		/* 40Mbps - rate 11 */
	6250000,		/* 50Mbps - rate 12 */
	12500000,		/* 100Mbps - rate 13 */
	25000000,		/* 200Mbps - rate 14 */
	50000000,		/* 400Mbps - rate 15 */
	100000000,		/* 800Mbps - rate 16 */
	5625000,		/* 45Mbps - rate 17 */
	6875000,		/* 55Mbps - rate 18 */
	7500000,		/* 60Mbps - rate 19 */
	8125000,		/* 65Mbps - rate 20 */
	8750000,		/* 70Mbps - rate 21 */
	9375000,		/* 75Mbps - rate 22 */
	10000000,		/* 80Mbps - rate 23 */
	10625000,		/* 85Mbps - rate 24 */
	11250000,		/* 90Mbps - rate 25 */
	11875000,		/* 95Mbps - rate 26 */
	12500000,		/* 100Mbps - rate 27 */
	13750000,		/* 110Mbps - rate 28 */
	15000000,		/* 120Mbps - rate 29 */
	16250000,		/* 130Mbps - rate 30 */
	17500000,		/* 140Mbps - rate 31 */
	18750000,		/* 150Mbps - rate 32 */
	20000000,		/* 160Mbps - rate 33 */
	21250000,		/* 170Mbps - rate 34 */
	22500000,		/* 180Mbps - rate 35 */
	23750000,		/* 190Mbps - rate 36 */
	26250000,		/* 210Mbps - rate 37 */
	27500000,		/* 220Mbps - rate 38 */
	28750000,		/* 230Mbps - rate 39 */
	30000000,		/* 240Mbps - rate 40 */
	31250000,		/* 250Mbps - rate 41 */
	34375000,		/* 275Mbps - rate 42 */
	37500000,		/* 300Mbps - rate 43 */
	40625000,		/* 325Mbps - rate 44 */
	43750000,		/* 350Mbps - rate 45 */
	46875000,		/* 375Mbps - rate 46 */
	53125000,		/* 425Mbps - rate 47 */
	56250000,		/* 450Mbps - rate 48 */
	59375000,		/* 475Mbps - rate 49 */
	62500000,		/* 500Mbps - rate 50 */
	68750000,		/* 550Mbps - rate 51 */
	75000000,		/* 600Mbps - rate 52 */
	81250000,		/* 650Mbps - rate 53 */
	87500000,		/* 700Mbps - rate 54 */
	93750000,		/* 750Mbps - rate 55 */
	106250000,		/* 850Mbps - rate 56 */
	112500000,		/* 900Mbps - rate 57 */
	125000000,		/* 1Gbps - rate 58 */
	156250000,		/* 1.25Gbps - rate 59 */
	187500000,		/* 1.5Gbps - rate 60 */
	218750000,		/* 1.75Gbps - rate 61 */
	250000000,		/* 2Gbps - rate 62 */
	281250000,		/* 2.25Gbps - rate 63 */
	312500000,		/* 2.5Gbps - rate 64 */
	343750000,		/* 2.75Gbps - rate 65 */
	375000000,		/* 3Gbps - rate 66 */
	500000000,		/* 4Gbps - rate 67 */
	625000000,		/* 5Gbps - rate 68 */
	750000000,		/* 6Gbps - rate 69 */
	875000000,		/* 7Gbps - rate 70 */
	1000000000,		/* 8Gbps - rate 71 */
	1125000000,		/* 9Gbps - rate 72 */
	1250000000,		/* 10Gbps - rate 73 */
	1875000000,		/* 15Gbps - rate 74 */
	2500000000		/* 20Gbps - rate 75 */
};

#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number that are in order
				 * at the beginning of the table,
				 * over this a sort is required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The point in our table where
				 * we come fill in a second ordered
				 * group (index wise means -1).
				 */
#define ALL_HARDWARE_RATES 1004 /*
				 * 1Meg - 1Gig in 1 Meg steps
				 * plus 100k, 200k and 500k and
				 * 10Gig
				 */

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000
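/*
 * With ALL_HARDWARE_RATES the internal table holds 100k, 200k
 * and 500k in slots 0-2, then 1Meg - 1000Meg in 1Meg steps in
 * slots 3-1002, with 10Gig in the last slot. A compiled-out
 * sketch of that index mapping (illustrative only; the real
 * lookups below also honor the pacing flags):
 */
#if 0
static int
all_rates_index(uint64_t bytes_per_sec)
{
	uint64_t mbits = bytes_per_sec * 8;

	if (mbits < RS_ONE_MEGABIT_PERSEC) {
		/* Slot 0 = 100k, 1 = 200k, 2 = 500k. */
		return (bytes_per_sec <= 12500 ? 0 :
		    (bytes_per_sec <= 25000 ? 1 : 2));
	}
	if (mbits <= RS_ONE_GIGABIT_PERSEC) {
		/* 1Meg lands on slot 3, hence "offset by 3, add 2". */
		return ((int)(mbits / RS_ONE_MEGABIT_PERSEC) + 2);
	}
	/* Everything larger maps to the single 10Gig slot. */
	return (ALL_HARDWARE_RATES - 1);
}
#endif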
static struct head_tcp_rate_set int_rs;
static struct mtx rs_mtx;
uint32_t rs_number_alive;
uint32_t rs_number_dead;
static uint32_t rs_floor_mss = 0;
static uint32_t wait_time_floor = 8000;	/* 8 ms */
static uint32_t rs_hw_floor_mss = 16;
static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
    &rs_floor_mss, 0,
    "Number of MSS that will override the normal minimums (0 means don't enforce)");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
    &wait_time_floor, 2000,
    "If b/w increases, what is the wait floor we are willing to wait at the end?");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
    &num_of_waits_allowed, 1,
    "How many time blocks on the end should software pacing be willing to wait?");

SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
    &rs_hw_floor_mss, 16,
    "Number of mss that is the minimum for hardware pacing");


static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
	/*
	 * Add sysctl entries for this interface.
	 */
	if (rs->rs_flags & RS_INTF_NO_SUP) {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RD,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	} else {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RW,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	}
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "minseg", CTLFLAG_RW,
	    &rs->rs_min_seg, 0,
	    "What is the minimum we need to send on this interface?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flow_limit", CTLFLAG_RW,
	    &rs->rs_flow_limit, 0,
	    "What is the limit for number of flows (0=unlimited)?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "highest", CTLFLAG_RD,
	    &rs->rs_highest_valid, 0,
	    "Highest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "lowest", CTLFLAG_RD,
	    &rs->rs_lowest_valid, 0,
	    "Lowest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flags", CTLFLAG_RD,
	    &rs->rs_flags, 0,
	    "What flags are on the entry?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "numrates", CTLFLAG_RD,
	    &rs->rs_rate_cnt, 0,
	    "How many rates are there?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flows_using", CTLFLAG_RD,
	    &rs->rs_flows_using, 0,
	    "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
		/* Lets display the rates */
		int i;
		struct sysctl_oid *rl_rates;
		struct sysctl_oid *rl_rate_num;
		char rate_num[16];

		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO,
		    "rate",
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "Ratelist");
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			sprintf(rate_num, "%d", i);
			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rates),
			    OID_AUTO,
			    rate_num,
			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
			    "Individual Rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "flags", CTLFLAG_RD,
			    &rs->rs_rlt[i].flags, 0,
			    "Flags on this rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "pacetime", CTLFLAG_RD,
			    &rs->rs_rlt[i].time_between, 0,
			    "Time hardware inserts between 1500 byte sends");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "rate", CTLFLAG_RD,
			    &rs->rs_rlt[i].rate,
			    "Rate in bytes per second");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "using", CTLFLAG_RD,
			    &rs->rs_rlt[i].using,
			    "Number of flows using");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "enobufs", CTLFLAG_RD,
			    &rs->rs_rlt[i].rs_num_enobufs,
			    "Number of enobufs logged on this rate");
		}
	}
#endif
}

static void
rs_destroy(epoch_context_t ctx)
{
	struct tcp_rate_set *rs;
	bool do_free_rs;

	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);

	mtx_lock(&rs_mtx);
	rs->rs_flags &= ~RS_FUNERAL_SCHD;
	/*
	 * In theory it is possible (but unlikely)
	 * that while the
	 * delete was occurring
	 * and we were applying the DEAD flag
	 * someone slipped in and found the
	 * interface in a lookup. While we
	 * decided rs_flows_using was 0 and were
	 * scheduling the epoch_call, the other
	 * thread incremented rs_flows_using. This
	 * is because users have a pointer and
	 * we only use the rs_flows_using in an
	 * atomic fashion, i.e. the other entities
	 * are not protected. To assure this did
	 * not occur, we check rs_flows_using here
	 * before deleting.
	 */
	do_free_rs = (rs->rs_flows_using == 0);
	rs_number_dead--;
	mtx_unlock(&rs_mtx);

	if (do_free_rs) {
		sysctl_ctx_free(&rs->sysctl_ctx);
		free(rs->rs_rlt, M_TCPPACE);
		free(rs, M_TCPPACE);
	}
}

static void
rs_defer_destroy(struct tcp_rate_set *rs)
{

	mtx_assert(&rs_mtx, MA_OWNED);

	/* Check if already pending. */
	if (rs->rs_flags & RS_FUNERAL_SCHD)
		return;

	rs_number_dead++;

	/* Set flag to only defer once. */
	rs->rs_flags |= RS_FUNERAL_SCHD;
	NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
}

#ifdef INET
extern counter_u64_t rate_limit_new;
extern counter_u64_t rate_limit_chg;
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
	int error;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = cfg_rate,
		.rate_limit.flags = M_NOWAIT,
	};

	error = m_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
	/*
	 * The internal table is "special", it
	 * is two separate ordered tables that
	 * must be merged. We get here when the
	 * adapter specifies a number of rates that
	 * covers both ranges in the table in some
	 * form.
	 */
	int i, at_low, at_high;
	uint8_t low_disabled = 0, high_disabled = 0;

	for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
		rs->rs_rlt[i].flags = 0;
		rs->rs_rlt[i].time_between = 0;
		if ((low_disabled == 0) &&
		    (high_disabled ||
		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
			rs->rs_rlt[i].rate = rate_table_act[at_low];
			at_low++;
			if (at_low == RS_NEXT_ORDER_GROUP)
				low_disabled = 1;
		} else if (high_disabled == 0) {
			rs->rs_rlt[i].rate = rate_table_act[at_high];
			at_high++;
			if (at_high == MAX_HDWR_RATES)
				high_disabled = 1;
		}
	}
}
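/*
 * A compiled-out miniature of the merge loop above, with the two
 * ordered groups {1, 4, 9} and {2, 3, 10} standing in for the two
 * halves of desired_rates[] (illustrative only):
 */
#if 0
static void
merge_example(void)
{
	const uint64_t tbl[6] = { 1, 4, 9, 2, 3, 10 };
	uint64_t out[6];
	int i, at_low = 0, at_high = 3;
	uint8_t low_disabled = 0, high_disabled = 0;

	for (i = 0; i < 6; i++) {
		if ((low_disabled == 0) &&
		    (high_disabled || (tbl[at_low] < tbl[at_high]))) {
			out[i] = tbl[at_low++];
			if (at_low == 3)	/* RS_NEXT_ORDER_GROUP */
				low_disabled = 1;
		} else if (high_disabled == 0) {
			out[i] = tbl[at_high++];
			if (at_high == 6)	/* MAX_HDWR_RATES */
				high_disabled = 1;
		}
	}
	/* out[] now holds { 1, 2, 3, 4, 9, 10 }. */
}
#endif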

static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
	struct tcp_rate_set *rs;
	const uint64_t *rate_table_act;
	uint64_t lentim, res;
	size_t sz;
	uint32_t hash_type;
	int i;
	struct if_ratelimit_query_results rl;
	struct sysctl_oid *rl_sysctl_root;
	struct epoch_tracker et;
	/*
	 * We expect to enter with the
	 * mutex locked.
	 */

	if (ifp->if_ratelimit_query == NULL) {
		/*
		 * We can do nothing if we cannot
		 * get a query back from the driver.
		 */
		printf("Warning: No query functions for %s:%d -- failed\n",
		    ifp->if_dname, ifp->if_dunit);
		return (NULL);
	}
	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
	if (rs == NULL) {
		if (error)
			*error = ENOMEM;
		printf("Warning: No memory for malloc of tcp_rate_set\n");
		return (NULL);
	}
	memset(&rl, 0, sizeof(rl));
	rl.flags = RT_NOSUPPORT;
	ifp->if_ratelimit_query(ifp, &rl);
	if (rl.flags & RT_IS_UNUSABLE) {
		/*
		 * The interface does not really support
		 * the rate-limiting.
		 */
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_INTF_NO_SUP;
		rs->rs_disable = 1;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_IS_DEFF;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
		/* Mellanox C4 likely */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_highest_valid = 0;
		rs->rs_flow_limit = rl.max_flows;
		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
		rs->rs_disable = 0;
		rate_table_act = rl.rate_table;
	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
		/* Chelsio, C5 and C6 of Mellanox? */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_disable = 0;
		rs->rs_flow_limit = rl.max_flows;
		rate_table_act = desired_rates;
		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
			/*
			 * Our desired table is not big
			 * enough, do what we can.
			 */
			rs->rs_rate_cnt = MAX_HDWR_RATES;
		}
		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
			rs->rs_flags = RS_IS_INTF;
		else
			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
	} else {
		free(rs, M_TCPPACE);
		return (NULL);
	}
	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
	if (rs->rs_rlt == NULL) {
		if (error)
			*error = ENOMEM;
bail:
		free(rs, M_TCPPACE);
		return (NULL);
	}
	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
		/*
		 * The interface supports all
		 * the rates we could possibly want.
		 */
		uint64_t rat;

		rs->rs_rlt[0].rate = 12500;	/* 100k */
		rs->rs_rlt[1].rate = 25000;	/* 200k */
		rs->rs_rlt[2].rate = 62500;	/* 500k */
		/*
		 * Note 125000 == 1Megabit.
		 * Populate 1Meg - 1000meg.
		 */
		for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
			rs->rs_rlt[i].rate = rat;
			rat += 125000;
		}
		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
	} else if (rs->rs_flags & RS_INT_TBL) {
		/* We populate this in a special way */
		populate_canned_table(rs, rate_table_act);
	} else {
		/*
		 * Just copy in the rates from
		 * the table, it is in order.
		 */
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			rs->rs_rlt[i].rate = rate_table_act[i];
			rs->rs_rlt[i].time_between = 0;
			rs->rs_rlt[i].flags = 0;
		}
	}
	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
		/*
		 * We go backwards through the list so that if we can't get
		 * a rate and fail to init one, we have at least a chance of
		 * getting the highest one.
		 */
		rs->rs_rlt[i].ptbl = rs;
		rs->rs_rlt[i].tag = NULL;
		rs->rs_rlt[i].using = 0;
		rs->rs_rlt[i].rs_num_enobufs = 0;
		/*
		 * Calculate the time between.
		 */
		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
		res = lentim / rs->rs_rlt[i].rate;
		if (res > 0)
			rs->rs_rlt[i].time_between = res;
		else
			rs->rs_rlt[i].time_between = 1;
		if (rs->rs_flags & RS_NO_PRE) {
			rs->rs_rlt[i].flags = HDWRPACE_INITED;
			rs->rs_lowest_valid = i;
		} else {
			int err;

			if ((rl.flags & RT_IS_SETUP_REQ) &&
			    (ifp->if_ratelimit_query)) {
				err = ifp->if_ratelimit_setup(ifp,
				    rs->rs_rlt[i].rate, i);
				if (err)
					goto handle_err;
			}
#ifdef RSS
			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
			hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
			err = rl_attach_txrtlmt(ifp,
			    hash_type,
			    (i + 1),
			    rs->rs_rlt[i].rate,
			    &rs->rs_rlt[i].tag);
			if (err) {
handle_err:
				if (i == (rs->rs_rate_cnt - 1)) {
					/*
					 * Huh - first rate and we can't get
					 * it?
					 */
					free(rs->rs_rlt, M_TCPPACE);
					if (error)
						*error = err;
					goto bail;
				} else {
					if (error)
						*error = err;
				}
				break;
			} else {
				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
				rs->rs_lowest_valid = i;
			}
		}
	}
	/* Did we get at least 1 rate? */
	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
	else {
		free(rs->rs_rlt, M_TCPPACE);
		goto bail;
	}
	rs_number_alive++;
	sysctl_ctx_init(&rs->sysctl_ctx);
	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
	    OID_AUTO,
	    rs->rs_ifp->if_xname,
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "");
	rl_add_syctl_entries(rl_sysctl_root, rs);
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	return (rs);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
	int i;

	mbits_per_sec = (bytes_per_sec * 8);
	if (flags & RS_PACING_LT) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			/*
			 * Smaller than 1Meg, only
			 * 3 entries can match it.
			 */
			previous_rate = 0;
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
					arte = &rs->rs_rlt[i];
				}
				previous_rate = rs->rs_rlt[i].rate;
			}
			goto done;
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/*
			 * Larger than 1G (beyond the majority of
			 * our table).
			 */
			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			else
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
			goto done;
		}
		/*
		 * If we reach here it is in our table (between 1Meg - 1000Meg);
		 * just take the rounded down mbits per second, and add
		 * 1Megabit to it, from this we can calculate
		 * the index in the table.
		 */
		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
			ind_calc++;
		/* our table is offset by 3, we add 2 */
		ind_calc += 2;
		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
			/* This should not happen */
			ind_calc = ALL_HARDWARE_RATES-1;
		}
		if ((ind_calc >= rs->rs_lowest_valid) &&
		    (ind_calc <= rs->rs_highest_valid)) {
			rte = &rs->rs_rlt[ind_calc];
			if (ind_calc >= 1)
				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
		}
	} else if (flags & RS_PACING_EXACT_MATCH) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec == rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				}
			}
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/* > 1Gbps only one rate */
			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
				/* Its 10G wow */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* Ok it must be an exact meg (it is between 1G and 1Meg) */
			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
				/* its an exact Mbps */
				ind_calc += 2;
				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
					/* This should not happen */
					ind_calc = ALL_HARDWARE_RATES-1;
				}
				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
					rte = &rs->rs_rlt[ind_calc];
			}
		}
	} else {
		/* we want greater than the requested rate */
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			arte = &rs->rs_rlt[3];	/* set alternate to 1Meg */
			for (i = 2; i >= rs->rs_lowest_valid; i--) {
				if (bytes_per_sec < rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else if ((flags & RS_PACING_GEQ) &&
					   (bytes_per_sec == rs->rs_rlt[i].rate)) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else {
					arte = &rs->rs_rlt[i]; /* new alternate */
				}
			}
		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* Our top rate is larger than the request */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if ((flags & RS_PACING_GEQ) &&
				   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
				   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* It matches our top rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
				/* The top rate is an alternative */
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
		} else {
			/* It is in our range 1Meg - 1Gig */
			if (flags & RS_PACING_GEQ) {
				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
						/* This should not happen */
						ind_calc = (ALL_HARDWARE_RATES-1);
					}
					rte = &rs->rs_rlt[ind_calc];
					if (ind_calc >= 1)
						previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
				}
				goto done;
			}
			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
			ind_calc += 2;
			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
				/* This should not happen */
				ind_calc = ALL_HARDWARE_RATES-1;
			}
			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
				rte = &rs->rs_rlt[ind_calc];
				if (ind_calc >= 1)
					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
			}
		}
	}
done:
	if ((rte == NULL) &&
	    (arte != NULL) &&
	    (flags & RS_PACING_SUB_OK)) {
		/* We can use the substitute */
		rte = arte;
	}
	if (lower_rate)
		*lower_rate = previous_rate;
	return (rte);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	/**
	 * Hunt the rate table with the restrictions in flags and find a
	 * suitable rate if possible.
	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
	 * RS_PACING_GT - must be greater than.
	 * RS_PACING_GEQ - must be greater than or equal.
	 * RS_PACING_LT - must be less than.
	 * RS_PACING_SUB_OK - If we don't meet criteria a
	 *                    substitute is ok.
	 */
	int i, matched;
	struct tcp_hwrate_limit_table *rte = NULL;
	uint64_t previous_rate = 0;

	if ((rs->rs_flags & RS_INT_TBL) &&
	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
		/*
		 * Here we don't want to paw through
		 * a big table, we have everything
		 * from 1Meg - 1000Meg in 1Meg increments.
		 * Use an alternate method to "lookup".
		 */
		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
	}
	if ((flags & RS_PACING_LT) ||
	    (flags & RS_PACING_EXACT_MATCH)) {
		/*
		 * For exact and less than we go forward through the table.
		 * This way when we find one larger we stop (exact was a
		 * toss up).
		 */
		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
			if ((flags & RS_PACING_EXACT_MATCH) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			} else if ((flags & RS_PACING_LT) &&
			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			}
			previous_rate = rs->rs_rlt[i].rate;
			if (bytes_per_sec > rs->rs_rlt[i].rate)
				break;
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_LT) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the lowest) */
			rte = &rs->rs_rlt[rs->rs_lowest_valid];
		}
	} else {
		/*
		 * Here we go backward through the table so that we can find
		 * the one greater in theory faster (but it is probably a
		 * wash).
		 */
		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
			if (rs->rs_rlt[i].rate > bytes_per_sec) {
				/* A possible candidate */
				rte = &rs->rs_rlt[i];
			}
			if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				/* An exact match and we want equal */
				matched = 1;
				rte = &rs->rs_rlt[i];
				break;
			} else if (rte) {
				/*
				 * Found one that is larger than, but don't
				 * stop, there may be a closer match.
				 */
				matched = 1;
			}
			if (rs->rs_rlt[i].rate < bytes_per_sec) {
				/*
				 * We found a table entry that is smaller,
				 * stop; there will be none greater or equal.
				 */
				if (lower_rate != NULL)
					*lower_rate = rs->rs_rlt[i].rate;
				break;
			}
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the highest) */
			rte = &rs->rs_rlt[rs->rs_highest_valid];
		}
	}
	return (rte);
}
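/*
 * A compiled-out sketch of how a caller typically drives the
 * flags above (illustrative only; "rs" stands in for a live
 * rate set, and the rates are the header comment's example):
 */
#if 0
static void
example_rate_lookup(const struct tcp_rate_set *rs)
{
	const struct tcp_hwrate_limit_table *rte;
	uint64_t lower = 0;

	/* Next rate at or above 29,110,000 bps, substitute allowed. */
	rte = tcp_find_suitable_rate(rs, 3638750,
	    RS_PACING_GEQ | RS_PACING_SUB_OK, &lower);
	/* Demand the exact table entry; NULL if it is not there. */
	rte = tcp_find_suitable_rate(rs, 3875000,
	    RS_PACING_EXACT_MATCH, NULL);
}
#endif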

static struct ifnet *
rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
{
	struct ifnet *tifp;
	struct m_snd_tag *tag, *ntag;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = inp->inp_flowid,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = COMMON_RATE,
		.rate_limit.flags = M_NOWAIT,
	};
	int err;
#ifdef RSS
	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
#else
	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
#endif
	err = m_snd_tag_alloc(ifp, &params, &tag);
	if (err) {
		/* Failed to setup a tag? */
		if (error)
			*error = err;
		return (NULL);
	}
	ntag = tag;
	while (ntag->sw->next_snd_tag != NULL) {
		ntag = ntag->sw->next_snd_tag(ntag);
	}
	tifp = ntag->ifp;
	m_snd_tag_rele(tag);
	return (tifp);
}

static void
rl_increment_using(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_add_long(&decon_rte->using, 1);
}

static void
rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_subtract_long(&decon_rte->using, 1);
}

void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_add_long(&decon_rte->rs_num_enobufs, 1);
}

/*
 * Do NOT take the __noinline out of the
 * find_rs_for_ifp() function. If you do, inlining
 * it into rt_setup_rate() will show you a
 * compiler bug. For some reason the compiler thinks
 * the list can never be empty. The consequence of
 * this will be a crash when we dereference NULL
 * if an ifp is removed just as a hw rate limit
 * is attempted. If you are working on the compiler
 * and want to "test" this go ahead and take the noinline
 * out; otherwise let sleeping dogs lie until such time
 * as we get a compiler fix 10/2/20 -- RRS
 */
static __noinline struct tcp_rate_set *
find_rs_for_ifp(struct ifnet *ifp)
{
	struct tcp_rate_set *rs;

	CK_LIST_FOREACH(rs, &int_rs, next) {
		if ((rs->rs_ifp == ifp) &&
		    (rs->rs_if_dunit == ifp->if_dunit)) {
			/* Ok we found it */
			return (rs);
		}
	}
	return (NULL);
}


static const struct tcp_hwrate_limit_table *
rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
    uint32_t flags, int *error, uint64_t *lower_rate)
{
	/* First lets find the interface if it exists */
	const struct tcp_hwrate_limit_table *rte;
	/*
	 * So why is rs volatile? This is to defeat a
	 * compiler bug wherein the compiler is convinced
	 * that rs can never be NULL (which is not true). Because
	 * of its conviction it nicely optimizes out the "if (rs == NULL)"
	 * below, which means if you get a NULL back you dereference it.
	 */
	volatile struct tcp_rate_set *rs;
	struct epoch_tracker et;
	struct ifnet *oifp = ifp;
	int err;

	NET_EPOCH_ENTER(et);
use_real_interface:
	rs = find_rs_for_ifp(ifp);
	if ((rs == NULL) ||
	    (rs->rs_flags & RS_INTF_NO_SUP) ||
	    (rs->rs_flags & RS_IS_DEAD)) {
		/*
		 * This means we got a packet *before*
		 * the IF-UP was processed below, <or>
		 * while or after we already received an interface
		 * departed event. In either case we really don't
		 * want to do anything with pacing; in
		 * the departing case the packet is not
		 * going to go very far. The new case
		 * might be arguable, but it is impossible
		 * to tell from the departing case.
		 */
		if (error)
			*error = ENODEV;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}

	if ((rs == NULL) || (rs->rs_disable != 0)) {
		if (error)
			*error = ENOSPC;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}
	if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, error);
		if (tifp == NULL) {
			if (rs->rs_disable && error)
				*error = ENOTSUP;
			NET_EPOCH_EXIT(et);
			return (NULL);
		}
		KASSERT((tifp != ifp),
		    ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
		     ifp, inp, tifp));
		ifp = tifp;
		goto use_real_interface;
	}
	if (rs->rs_flow_limit &&
	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
		if (error)
			*error = ENOSPC;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}
	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
	if (rte) {
		err = in_pcbattach_txrtlmt(inp, oifp,
		    inp->inp_flowtype,
		    inp->inp_flowid,
		    rte->rate,
		    &inp->inp_snd_tag);
		if (err) {
			/* Failed to attach */
			if (error)
				*error = err;
			rte = NULL;
		} else {
			KASSERT((inp->inp_snd_tag != NULL),
			    ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
			     inp, rte, (unsigned long long)rte->rate, rs));
#ifdef INET
			counter_u64_add(rate_limit_new, 1);
#endif
		}
	}
	if (rte) {
		/*
		 * We use an atomic here for accounting so we don't have to
		 * use locks when freeing.
		 */
		atomic_add_64(&rs->rs_flows_using, 1);
	}
	NET_EPOCH_EXIT(et);
	return (rte);
}

static void
tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
{
	int error;
	struct tcp_rate_set *rs;
	struct epoch_tracker et;

	if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
	    (link_state != LINK_STATE_UP)) {
		/*
		 * We only care about an interface going up that is
		 * rate-limit capable.
		 */
		return;
	}
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	rs = find_rs_for_ifp(ifp);
	if (rs) {
		/* We already have initialized this guy */
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return;
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	rt_setup_new_rs(ifp, &error);
}

static void
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
	struct tcp_rate_set *rs;
	struct epoch_tracker et;
	int i;

	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	rs = find_rs_for_ifp(ifp);
	if (rs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				in_pcbdetach_tag(rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0)
			rs_defer_destroy(rs);
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
}

static void
tcp_rl_shutdown(void *arg __unused, int howto __unused)
{
	struct tcp_rate_set *rs, *nrs;
	struct epoch_tracker et;
	int i;

	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				in_pcbdetach_tag(rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0)
			rs_defer_destroy(rs);
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
}

const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	const struct tcp_hwrate_limit_table *rte;
#ifdef KERN_TLS
	struct ktls_session *tls;
#endif

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (tp->t_inpcb->inp_snd_tag == NULL) {
		/*
		 * We are setting up a rate for the first time.
		 */
		if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
			/* Not supported by the egress */
			if (error)
				*error = ENODEV;
			return (NULL);
		}
#ifdef KERN_TLS
		tls = NULL;
		if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
			tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;

			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
			    tls->mode != TCP_TLS_MODE_IFNET) {
				if (error)
					*error = ENODEV;
				return (NULL);
			}
		}
#endif
		rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error, lower_rate);
		if (rte)
			rl_increment_using(rte);
#ifdef KERN_TLS
		if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
			/*
			 * Fake a route change error to reset the TLS
			 * send tag. This will convert the existing
			 * tag to a TLS ratelimit tag.
			 */
			MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
			ktls_output_eagain(tp->t_inpcb, tls);
		}
#endif
	} else {
		/*
		 * We are modifying a rate, wrong interface?
		 */
		if (error)
			*error = EINVAL;
		rte = NULL;
	}
	if (rte != NULL) {
		tp->t_pacing_rate = rte->rate;
		*error = 0;
	}
	return (rte);
}

const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	const struct tcp_hwrate_limit_table *nrte;
	const struct tcp_rate_set *rs;
#ifdef KERN_TLS
	struct ktls_session *tls = NULL;
#endif
	int err;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (crte == NULL) {
		/* Wrong interface */
		if (error)
			*error = EINVAL;
		return (NULL);
	}

#ifdef KERN_TLS
	if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
		tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
		if (tls->mode != TCP_TLS_MODE_IFNET)
			tls = NULL;
		else if (tls->snd_tag != NULL &&
		    tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
			if (!tls->reset_pending) {
				/*
				 * NIC probably doesn't support
				 * ratelimit TLS tags if it didn't
				 * allocate one when an existing rate
				 * was present, so ignore.
				 */
				tcp_rel_pacing_rate(crte, tp);
				if (error)
					*error = EOPNOTSUPP;
				return (NULL);
			}

			/*
			 * The send tag is being converted, so set the
			 * rate limit on the inpcb tag. There is a
			 * race that the new NIC send tag might use
			 * the current rate instead of this one.
			 */
			tls = NULL;
		}
	}
#endif
	if (tp->t_inpcb->inp_snd_tag == NULL) {
		/* Wrong interface */
		tcp_rel_pacing_rate(crte, tp);
		if (error)
			*error = EINVAL;
		return (NULL);
	}
	rs = crte->ptbl;
	if ((rs->rs_flags & RS_IS_DEAD) ||
	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
		/* Release the rate, and try anew */

		tcp_rel_pacing_rate(crte, tp);
		nrte = tcp_set_pacing_rate(tp, ifp,
		    bytes_per_sec, flags, error, lower_rate);
		return (nrte);
	}
	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
	if (nrte == crte) {
		/* No change */
		if (error)
			*error = 0;
		return (crte);
	}
	if (nrte == NULL) {
		/* Release the old rate */
		if (error)
			*error = ENOENT;
		tcp_rel_pacing_rate(crte, tp);
		return (NULL);
	}
	rl_decrement_using(crte);
	rl_increment_using(nrte);
	/* Change rates to our new entry */
#ifdef KERN_TLS
	if (tls != NULL)
		err = ktls_modify_txrtlmt(tls, nrte->rate);
	else
#endif
		err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
	if (err) {
		struct tcp_rate_set *lrs;
		uint64_t pre;

		rl_decrement_using(nrte);
		lrs = __DECONST(struct tcp_rate_set *, rs);
		pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
		/* Do we still have a snd-tag attached? */
		if (tp->t_inpcb->inp_snd_tag)
			in_pcbdetach_txrtlmt(tp->t_inpcb);

		if (pre == 1) {
			struct epoch_tracker et;

			NET_EPOCH_ENTER(et);
			mtx_lock(&rs_mtx);
			/*
			 * Is it dead?
			 */
			if (lrs->rs_flags & RS_IS_DEAD)
				rs_defer_destroy(lrs);
			mtx_unlock(&rs_mtx);
			NET_EPOCH_EXIT(et);
		}
		if (error)
			*error = err;
		return (NULL);
	} else {
#ifdef INET
		counter_u64_add(rate_limit_chg, 1);
#endif
	}
	if (error)
		*error = 0;
	tp->t_pacing_rate = nrte->rate;
	return (nrte);
}

void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
{
	const struct tcp_rate_set *crs;
	struct tcp_rate_set *rs;
	uint64_t pre;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	tp->t_pacing_rate = -1;
	crs = crte->ptbl;
	/*
	 * Now we must break the const
	 * in order to release our refcount.
	 */
	rs = __DECONST(struct tcp_rate_set *, crs);
	rl_decrement_using(crte);
	pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
	if (pre == 1) {
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		/*
		 * Is it dead?
		 */
		if (rs->rs_flags & RS_IS_DEAD)
			rs_defer_destroy(rs);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
	}

	/*
	 * XXX: If this connection is using ifnet TLS, should we
	 * switch it to using an unlimited rate, or perhaps use
	 * ktls_output_eagain() to reset the send tag to a plain
	 * TLS tag?
	 */
	in_pcbdetach_txrtlmt(tp->t_inpcb);
}
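/*
 * A compiled-out sketch of the lifecycle the TCP stack drives
 * through the three entry points above: acquire a rate, move it
 * as the b/w estimate changes, release it on teardown. The tp,
 * ifp and rates are placeholders (illustrative only).
 */
#if 0
static void
example_pacing_lifecycle(struct tcpcb *tp, struct ifnet *ifp)
{
	const struct tcp_hwrate_limit_table *crte;
	uint64_t lower;
	int err;

	/* First estimate: ask for something at or above 29,110,000 bps. */
	crte = tcp_set_pacing_rate(tp, ifp, 3638750,
	    RS_PACING_GEQ | RS_PACING_SUB_OK, &err, &lower);
	/* The estimate doubled: move the existing tag to a higher rate. */
	if (crte != NULL)
		crte = tcp_chg_pacing_rate(crte, tp, ifp, 7277500,
		    RS_PACING_GEQ | RS_PACING_SUB_OK, &err, &lower);
	/* Done with hardware pacing. */
	if (crte != NULL)
		tcp_rel_pacing_rate(crte, tp);
}
#endif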

#define ONE_POINT_TWO_MEG 150000	/* 1.2 megabits in bytes */
#define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
#define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
#define MAX_MSS_SENT 43			/* 43 mss = 43 x 1500 = 64,500 bytes */

static void
tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
    uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
    uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = segsiz;
		log.u_bbr.flex2 = new_tso;
		log.u_bbr.flex3 = time_between;
		log.u_bbr.flex4 = calc_time_between;
		log.u_bbr.flex5 = segs;
		log.u_bbr.flex6 = res_div;
		log.u_bbr.flex7 = mult;
		log.u_bbr.flex8 = mod;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.cur_del_rate = bw;
		log.u_bbr.delRate = hw_rate;
		TCP_LOG_EVENTP(tp, NULL,
		    &tp->t_inpcb->inp_socket->so_rcv,
		    &tp->t_inpcb->inp_socket->so_snd,
		    TCP_HDWR_PACE_SIZE, 0,
		    0, &log, false, &tv);
	}
}

uint32_t
tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err)
{
	/*
	 * We use the google formula to calculate the
	 * TSO size. I.E.
	 * bw < 24Meg
	 *   tso = 2mss
	 * else
	 *   tso = min(bw/1000, 64k)
	 *
	 * Note for these calculations we ignore the
	 * packet overhead (enet hdr, ip hdr and tcp hdr).
	 */
	uint64_t lentim, res, bytes;
	uint32_t new_tso, min_tso_segs;

	bytes = bw / 1000;
	if (bytes > (64 * 1000))
		bytes = 64 * 1000;
	/* Round up */
	new_tso = (bytes + segsiz - 1) / segsiz;
	if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
		min_tso_segs = 1;
	else
		min_tso_segs = 2;
	if (rs_floor_mss && (new_tso < rs_floor_mss))
		new_tso = rs_floor_mss;
	else if (new_tso < min_tso_segs)
		new_tso = min_tso_segs;
	if (new_tso > MAX_MSS_SENT)
		new_tso = MAX_MSS_SENT;
	new_tso *= segsiz;
	tcp_log_pacing_size(tp, bw, segsiz, new_tso,
	    0, 0, 0, 0, 0, 0, 1);
	/*
	 * If we are not doing hardware pacing
	 * then we are done.
	 */
	if (te == NULL) {
		if (err)
			*err = 0;
		return (new_tso);
	}
	/*
	 * For hardware pacing we look at the
	 * rate you are sending at and compare
	 * that to the rate you have in hardware.
	 *
	 * If the hardware rate is slower than your
	 * software rate then you are in error and
	 * we will build a queue in our hardware which
	 * is probably not desired, in such a case
	 * just return the non-hardware TSO size.
	 *
	 * If the rate in hardware is faster (which
	 * it should be) then look at how long it
	 * takes to send one ethernet segment size at
	 * your b/w and compare that to the time it
	 * takes to send at the rate you had selected.
	 *
	 * If your time is greater (which we hope it is)
	 * we get the delta between the two, and then
	 * divide that into your pacing time. This tells
	 * us how many MSS you can send down at once (rounded up).
	 *
	 * Note we also double this value if the b/w is over
	 * 100Mbps. If it is over 500meg we just set you to the
	 * max (43 segments).
	 */
	if (te->rate > FIVE_HUNDRED_MBPS)
		goto max;
	if (te->rate == bw) {
		/* We are pacing at exactly the hdwr rate */
max:
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)0,
		    (segsiz * MAX_MSS_SENT), 0, 0, 3);
		return (segsiz * MAX_MSS_SENT);
	}
	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
	res = lentim / bw;
	if (res > te->time_between) {
		uint32_t delta, segs, res_div;

		res_div = ((res * num_of_waits_allowed) + wait_time_floor);
		delta = res - te->time_between;
		segs = (res_div + delta - 1) / delta;
		if (segs < min_tso_segs)
			segs = min_tso_segs;
		if (segs < rs_hw_floor_mss)
			segs = rs_hw_floor_mss;
		if (segs > MAX_MSS_SENT)
			segs = MAX_MSS_SENT;
		segs *= segsiz;
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)res,
		    segs, res_div, 1, 3);
		if (err)
			*err = 0;
		if (segs < new_tso) {
			/* unexpected ? */
			return (new_tso);
		} else {
			return (segs);
		}
	} else {
		/*
		 * Your time is smaller which means
		 * we will grow a queue on our
		 * hardware. Send back the non-hardware
		 * rate.
		 */
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)res,
		    0, 0, 0, 4);
		if (err)
			*err = -1;
		return (new_tso);
	}
}
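/*
 * Working the hardware branch above with the header comment's
 * numbers (b/w 29,110,000 bps against a 31Mbps hardware rate) and
 * the default knobs (num_of_waits_allowed = 1, wait_time_floor =
 * 8000), assuming ETHERNET_SEGMENT_SIZE is the full 1514 byte
 * frame -- a compiled-out, illustrative sketch:
 */
#if 0
static void
example_burst_math(void)
{
	uint64_t lentim = 1514ULL * USECS_IN_SECOND;
	uint64_t res = lentim / 3638750;	/* 416 usecs per frame at b/w */
	uint32_t time_between = 390;		/* hardware gap at 31Mbps */
	uint32_t delta = res - time_between;	/* 26 usecs */
	uint32_t res_div = (res * 1) + 8000;	/* 8416 */
	uint32_t segs = (res_div + delta - 1) / delta;	/* 324... */

	/* ...which the clamps above cut back to MAX_MSS_SENT (43). */
	(void)segs;
}
#endif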

uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
{
	struct epoch_tracker et;
	struct tcp_rate_set *rs;
	uint64_t rate_ret;

	NET_EPOCH_ENTER(et);
use_next_interface:
	rs = find_rs_for_ifp(ifp);
	if (rs == NULL) {
		/* This interface does not do ratelimiting */
		rate_ret = 0;
	} else if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, NULL);
		if (tifp == NULL) {
			NET_EPOCH_EXIT(et);
			return (0);
		}
		ifp = tifp;
		goto use_next_interface;
	} else {
		/* Lets return the highest rate this guy has */
		rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
	}
	NET_EPOCH_EXIT(et);
	return (rate_ret);
}

static eventhandler_tag rl_ifnet_departs;
static eventhandler_tag rl_ifnet_arrives;
static eventhandler_tag rl_shutdown_start;

static void
tcp_rs_init(void *st __unused)
{
	CK_LIST_INIT(&int_rs);
	rs_number_alive = 0;
	rs_number_dead = 0;
	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    tcp_rl_ifnet_departure,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
	    tcp_rl_ifnet_link,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    tcp_rl_shutdown, NULL,
	    SHUTDOWN_PRI_FIRST);
	printf("TCP_ratelimit: Is now initialized\n");
}

SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
#endif