1 /*- 2 * 3 * SPDX-License-Identifier: BSD-3-Clause 4 * 5 * Copyright (c) 2018-2020 6 * Netflix Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 */ 30 /** 31 * Author: Randall Stewart <rrs@netflix.com> 32 */ 33 34 #include <sys/cdefs.h> 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_ipsec.h" 38 #include "opt_ratelimit.h" 39 #include <sys/param.h> 40 #include <sys/kernel.h> 41 #include <sys/malloc.h> 42 #include <sys/mbuf.h> 43 #include <sys/socket.h> 44 #include <sys/socketvar.h> 45 #include <sys/sysctl.h> 46 #include <sys/eventhandler.h> 47 #include <sys/mutex.h> 48 #include <sys/ck.h> 49 #include <net/if.h> 50 #include <net/if_var.h> 51 #include <net/if_private.h> 52 #include <netinet/in.h> 53 #include <netinet/in_pcb.h> 54 #define TCPSTATES /* for logging */ 55 #include <netinet/tcp_var.h> 56 #include <netinet/tcp_hpts.h> 57 #include <netinet/tcp_log_buf.h> 58 #include <netinet/tcp_ratelimit.h> 59 #ifndef USECS_IN_SECOND 60 #define USECS_IN_SECOND 1000000 61 #endif 62 /* 63 * For the purposes of each send, what is the size 64 * of an ethernet frame. 65 */ 66 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory"); 67 #ifdef RATELIMIT 68 69 /* 70 * The following preferred table will seem weird to 71 * the casual viewer. Why do we not have any rates below 72 * 1Mbps? Why do we have a rate at 1.44Mbps called common? 73 * Why do the rates cluster in the 1-100Mbps range more 74 * than others? Why does the table jump around at the beginnign 75 * and then be more consistently raising? 76 * 77 * Let me try to answer those questions. A lot of 78 * this is dependant on the hardware. We have three basic 79 * supporters of rate limiting 80 * 81 * Chelsio - Supporting 16 configurable rates. 82 * Mlx - c4 supporting 13 fixed rates. 83 * Mlx - c5 & c6 supporting 127 configurable rates. 84 * 85 * The c4 is why we have a common rate that is available 86 * in all rate tables. This is a selected rate from the 87 * c4 table and we assure its available in all ratelimit 88 * tables. This way the tcp_ratelimit code has an assured 89 * rate it should always be able to get. 
This answers a
 * couple of the questions above.
 *
 * So what about the rest? Well, the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system.  The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons:
 * a) So we can discover more b/w
 * and
 * b) So we can send a block of MSS's down and then
 *    have the software timer go off after the previous
 *    send is completely out of the hardware.
 *
 * But when we do (b) we don't want the delay
 * after the last packet sent by the hardware to be
 * excessively long (to reach our desired rate).
 *
 * So let me give an example for clarity.
 *
 * Let's assume that the tcp stack sees that 29,110,000 bps is
 * what the bw of the path is. The stack would select the
 * rate 31Mbps. 31Mbps means that each send that is done
 * by the hardware will cause a 390 micro-second gap between
 * the packets sent at that rate. For 29,110,000 bps we
 * would need a 416 micro-second gap between each send.
 *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
 * My testing has shown that both cards are also using this
 * as their basis i.e. full payload size of the ethernet frame.
 * The TCP stack caller needs to be aware of this and include the
 * appropriate overhead calculations in its choices.
 *
 * Now, continuing our example, we pick an MSS count based on the
 * delta between the two gaps (416 - 390) divided into the gap of the
 * rate we really wish to send at (416), rounded up.  That results in
 * a send of 17 MSS's at once. The hardware then will
 * run out of data in a single 17MSS send in 6,630 micro-seconds.
/* The one rate that is present in every rate table (entry 2 below). */
#define COMMON_RATE 180500
/*
 * Preferred hardware rates, in bytes per second (e.g. 1250000 == 10Mbps).
 *
 * The table is two independently ascending groups: the first
 * RS_ORDERED_COUNT (16) entries, followed by a second group that starts at
 * index RS_NEXT_ORDER_GROUP. populate_canned_table() merges the two groups
 * when an adapter offers more than RS_ORDERED_COUNT rates.
 *
 * NOTE(review): 12500000 (100Mbps) appears in both groups (index 12 and
 * "rate 28"), so the merged table carries it twice; the "rate N" labels
 * also skip 18. Confirm whether the duplicate is intentional.
 */
const uint64_t desired_rates[] = {
	122500,			/* ~1Mbps (980kbps) - rate 1 */
	180500,			/* 1.44Mbps - rate 2  common rate */
	375000,			/* 3Mbps    - rate 3 */
	625000,			/* 5Mbps    - rate 4 */
	1250000,		/* 10Mbps   - rate 5 */
	1875000,		/* 15Mbps   - rate 6 */
	2500000,		/* 20Mbps   - rate 7 */
	3125000,		/* 25Mbps   - rate 8 */
	3750000,		/* 30Mbps   - rate 9 */
	4375000,		/* 35Mbps   - rate 10 */
	5000000,		/* 40Mbps   - rate 11 */
	6250000,		/* 50Mbps   - rate 12 */
	12500000,		/* 100Mbps  - rate 13 */
	25000000,		/* 200Mbps  - rate 14 */
	50000000,		/* 400Mbps  - rate 15 */
	100000000,		/* 800Mbps  - rate 16 */
	/* Second ascending group starts here (index RS_NEXT_ORDER_GROUP). */
	5625000,		/* 45Mbps   - rate 17 */
	6875000,		/* 55Mbps   - rate 19 */
	7500000,		/* 60Mbps   - rate 20 */
	8125000,		/* 65Mbps   - rate 21 */
	8750000,		/* 70Mbps   - rate 22 */
	9375000,		/* 75Mbps   - rate 23 */
	10000000,		/* 80Mbps   - rate 24 */
	10625000,		/* 85Mbps   - rate 25 */
	11250000,		/* 90Mbps   - rate 26 */
	11875000,		/* 95Mbps   - rate 27 */
	12500000,		/* 100Mbps  - rate 28 (duplicate of rate 13) */
	13750000,		/* 110Mbps  - rate 29 */
	15000000,		/* 120Mbps  - rate 30 */
	16250000,		/* 130Mbps  - rate 31 */
	17500000,		/* 140Mbps  - rate 32 */
	18750000,		/* 150Mbps  - rate 33 */
	20000000,		/* 160Mbps  - rate 34 */
	21250000,		/* 170Mbps  - rate 35 */
	22500000,		/* 180Mbps  - rate 36 */
	23750000,		/* 190Mbps  - rate 37 */
	26250000,		/* 210Mbps  - rate 38 */
	27500000,		/* 220Mbps  - rate 39 */
	28750000,		/* 230Mbps  - rate 40 */
	30000000,       	/* 240Mbps  - rate 41 */
	31250000,		/* 250Mbps  - rate 42 */
	34375000,		/* 275Mbps  - rate 43 */
	37500000,		/* 300Mbps  - rate 44 */
	40625000,		/* 325Mbps  - rate 45 */
	43750000,		/* 350Mbps  - rate 46 */
	46875000,		/* 375Mbps  - rate 47 */
	53125000,		/* 425Mbps  - rate 48 */
	56250000,		/* 450Mbps  - rate 49 */
	59375000,		/* 475Mbps  - rate 50 */
	62500000,		/* 500Mbps  - rate 51 */
	68750000,		/* 550Mbps  - rate 52 */
	75000000,		/* 600Mbps  - rate 53 */
	81250000,		/* 650Mbps  - rate 54 */
	87500000,		/* 700Mbps  - rate 55 */
	93750000,		/* 750Mbps  - rate 56 */
	106250000,		/* 850Mbps  - rate 57 */
	112500000,		/* 900Mbps  - rate 58 */
	125000000,		/* 1Gbps    - rate 59 */
	156250000,		/* 1.25Gbps - rate 60 */
	187500000,		/* 1.5Gbps  - rate 61 */
	218750000,		/* 1.75Gbps - rate 62 */
	250000000,		/* 2Gbps    - rate 63 */
	281250000,		/* 2.25Gbps - rate 64 */
	312500000,		/* 2.5Gbps  - rate 65 */
	343750000,		/* 2.75Gbps - rate 66 */
	375000000,		/* 3Gbps    - rate 67 */
	500000000,		/* 4Gbps    - rate 68 */
	625000000,		/* 5Gbps    - rate 69 */
	750000000,		/* 6Gbps    - rate 70 */
	875000000,		/* 7Gbps    - rate 71 */
	1000000000,		/* 8Gbps    - rate 72 */
	1125000000,		/* 9Gbps    - rate 73 */
	1250000000,		/* 10Gbps   - rate 74 */
	1875000000,		/* 15Gbps   - rate 75 */
	2500000000		/* 20Gbps   - rate 76 */
};

/* Number of entries in desired_rates[]. */
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
table, 232 * over this a sort is required. 233 */ 234 #define RS_NEXT_ORDER_GROUP 16 /* 235 * The point in our table where 236 * we come fill in a second ordered 237 * group (index wise means -1). 238 */ 239 #define ALL_HARDWARE_RATES 1004 /* 240 * 1Meg - 1Gig in 1 Meg steps 241 * plus 100, 200k and 500k and 242 * 10Gig 243 */ 244 245 #define RS_ONE_MEGABIT_PERSEC 1000000 246 #define RS_ONE_GIGABIT_PERSEC 1000000000 247 #define RS_TEN_GIGABIT_PERSEC 10000000000 248 249 static struct head_tcp_rate_set int_rs; 250 static struct mtx rs_mtx; 251 uint32_t rs_number_alive; 252 uint32_t rs_number_dead; 253 static uint32_t rs_floor_mss = 0; 254 static uint32_t wait_time_floor = 8000; /* 8 ms */ 255 static uint32_t rs_hw_floor_mss = 16; 256 static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */ 257 258 static uint32_t mss_divisor = RL_DEFAULT_DIVISOR; 259 static uint32_t even_num_segs = 1; 260 static uint32_t even_threshold = 4; 261 262 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 263 "TCP Ratelimit stats"); 264 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW, 265 &rs_number_alive, 0, 266 "Number of interfaces initialized for ratelimiting"); 267 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, 268 &rs_number_dead, 0, 269 "Number of interfaces departing from ratelimiting"); 270 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW, 271 &rs_floor_mss, 0, 272 "Number of MSS that will override the normal minimums (0 means don't enforce)"); 273 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW, 274 &wait_time_floor, 2000, 275 "Has b/w increases what is the wait floor we are willing to wait at the end?"); 276 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW, 277 &num_of_waits_allowed, 1, 278 "How many time blocks on the end should software pacing be willing to wait?"); 279 280 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW, 281 
&rs_hw_floor_mss, 16, 282 "Number of mss that are a minum for hardware pacing?"); 283 284 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, divisor, CTLFLAG_RW, 285 &mss_divisor, RL_DEFAULT_DIVISOR, 286 "The value divided into bytes per second to help establish mss size"); 287 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, even, CTLFLAG_RW, 288 &even_num_segs, 1, 289 "Do we round mss size up to an even number of segments for delayed ack"); 290 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, eventhresh, CTLFLAG_RW, 291 &even_threshold, 4, 292 "At what number of mss do we start rounding up to an even number of mss?"); 293 294 static void 295 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) 296 { 297 /* 298 * Add sysctl entries for thus interface. 299 */ 300 if (rs->rs_flags & RS_INTF_NO_SUP) { 301 SYSCTL_ADD_S32(&rs->sysctl_ctx, 302 SYSCTL_CHILDREN(rl_sysctl_root), 303 OID_AUTO, "disable", CTLFLAG_RD, 304 &rs->rs_disable, 0, 305 "Disable this interface from new hdwr limiting?"); 306 } else { 307 SYSCTL_ADD_S32(&rs->sysctl_ctx, 308 SYSCTL_CHILDREN(rl_sysctl_root), 309 OID_AUTO, "disable", CTLFLAG_RW, 310 &rs->rs_disable, 0, 311 "Disable this interface from new hdwr limiting?"); 312 } 313 SYSCTL_ADD_S32(&rs->sysctl_ctx, 314 SYSCTL_CHILDREN(rl_sysctl_root), 315 OID_AUTO, "minseg", CTLFLAG_RW, 316 &rs->rs_min_seg, 0, 317 "What is the minimum we need to send on this interface?"); 318 SYSCTL_ADD_U64(&rs->sysctl_ctx, 319 SYSCTL_CHILDREN(rl_sysctl_root), 320 OID_AUTO, "flow_limit", CTLFLAG_RW, 321 &rs->rs_flow_limit, 0, 322 "What is the limit for number of flows (0=unlimited)?"); 323 SYSCTL_ADD_S32(&rs->sysctl_ctx, 324 SYSCTL_CHILDREN(rl_sysctl_root), 325 OID_AUTO, "highest", CTLFLAG_RD, 326 &rs->rs_highest_valid, 0, 327 "Highest valid rate"); 328 SYSCTL_ADD_S32(&rs->sysctl_ctx, 329 SYSCTL_CHILDREN(rl_sysctl_root), 330 OID_AUTO, "lowest", CTLFLAG_RD, 331 &rs->rs_lowest_valid, 0, 332 "Lowest valid rate"); 333 SYSCTL_ADD_S32(&rs->sysctl_ctx, 334 
SYSCTL_CHILDREN(rl_sysctl_root), 335 OID_AUTO, "flags", CTLFLAG_RD, 336 &rs->rs_flags, 0, 337 "What lags are on the entry?"); 338 SYSCTL_ADD_S32(&rs->sysctl_ctx, 339 SYSCTL_CHILDREN(rl_sysctl_root), 340 OID_AUTO, "numrates", CTLFLAG_RD, 341 &rs->rs_rate_cnt, 0, 342 "How many rates re there?"); 343 SYSCTL_ADD_U64(&rs->sysctl_ctx, 344 SYSCTL_CHILDREN(rl_sysctl_root), 345 OID_AUTO, "flows_using", CTLFLAG_RD, 346 &rs->rs_flows_using, 0, 347 "How many flows are using this interface now?"); 348 #ifdef DETAILED_RATELIMIT_SYSCTL 349 if (rs->rs_rlt && rs->rs_rate_cnt > 0) { 350 /* Lets display the rates */ 351 int i; 352 struct sysctl_oid *rl_rates; 353 struct sysctl_oid *rl_rate_num; 354 char rate_num[16]; 355 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 356 SYSCTL_CHILDREN(rl_sysctl_root), 357 OID_AUTO, 358 "rate", 359 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 360 "Ratelist"); 361 for( i = 0; i < rs->rs_rate_cnt; i++) { 362 sprintf(rate_num, "%d", i); 363 rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 364 SYSCTL_CHILDREN(rl_rates), 365 OID_AUTO, 366 rate_num, 367 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 368 "Individual Rate"); 369 SYSCTL_ADD_U32(&rs->sysctl_ctx, 370 SYSCTL_CHILDREN(rl_rate_num), 371 OID_AUTO, "flags", CTLFLAG_RD, 372 &rs->rs_rlt[i].flags, 0, 373 "Flags on this rate"); 374 SYSCTL_ADD_U32(&rs->sysctl_ctx, 375 SYSCTL_CHILDREN(rl_rate_num), 376 OID_AUTO, "pacetime", CTLFLAG_RD, 377 &rs->rs_rlt[i].time_between, 0, 378 "Time hardware inserts between 1500 byte sends"); 379 SYSCTL_ADD_LONG(&rs->sysctl_ctx, 380 SYSCTL_CHILDREN(rl_rate_num), 381 OID_AUTO, "rate", CTLFLAG_RD, 382 &rs->rs_rlt[i].rate, 383 "Rate in bytes per second"); 384 SYSCTL_ADD_LONG(&rs->sysctl_ctx, 385 SYSCTL_CHILDREN(rl_rate_num), 386 OID_AUTO, "using", CTLFLAG_RD, 387 &rs->rs_rlt[i].using, 388 "Number of flows using"); 389 SYSCTL_ADD_LONG(&rs->sysctl_ctx, 390 SYSCTL_CHILDREN(rl_rate_num), 391 OID_AUTO, "enobufs", CTLFLAG_RD, 392 &rs->rs_rlt[i].rs_num_enobufs, 393 "Number of enobufs logged on this rate"); 394 395 
} 396 } 397 #endif 398 } 399 400 static void 401 rs_destroy(epoch_context_t ctx) 402 { 403 struct tcp_rate_set *rs; 404 bool do_free_rs; 405 406 rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx); 407 408 mtx_lock(&rs_mtx); 409 rs->rs_flags &= ~RS_FUNERAL_SCHD; 410 /* 411 * In theory its possible (but unlikely) 412 * that while the delete was occuring 413 * and we were applying the DEAD flag 414 * someone slipped in and found the 415 * interface in a lookup. While we 416 * decided rs_flows_using were 0 and 417 * scheduling the epoch_call, the other 418 * thread incremented rs_flow_using. This 419 * is because users have a pointer and 420 * we only use the rs_flows_using in an 421 * atomic fashion, i.e. the other entities 422 * are not protected. To assure this did 423 * not occur, we check rs_flows_using here 424 * before deleting. 425 */ 426 do_free_rs = (rs->rs_flows_using == 0); 427 rs_number_dead--; 428 mtx_unlock(&rs_mtx); 429 430 if (do_free_rs) { 431 sysctl_ctx_free(&rs->sysctl_ctx); 432 free(rs->rs_rlt, M_TCPPACE); 433 free(rs, M_TCPPACE); 434 } 435 } 436 437 static void 438 rs_defer_destroy(struct tcp_rate_set *rs) 439 { 440 441 mtx_assert(&rs_mtx, MA_OWNED); 442 443 /* Check if already pending. */ 444 if (rs->rs_flags & RS_FUNERAL_SCHD) 445 return; 446 447 rs_number_dead++; 448 449 /* Set flag to only defer once. 
*/ 450 rs->rs_flags |= RS_FUNERAL_SCHD; 451 NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx); 452 } 453 454 #ifdef INET 455 extern counter_u64_t rate_limit_new; 456 extern counter_u64_t rate_limit_chg; 457 extern counter_u64_t rate_limit_set_ok; 458 extern counter_u64_t rate_limit_active; 459 extern counter_u64_t rate_limit_alloc_fail; 460 #endif 461 462 static int 463 rl_attach_txrtlmt(struct ifnet *ifp, 464 uint32_t flowtype, 465 int flowid, 466 uint64_t cfg_rate, 467 struct m_snd_tag **tag) 468 { 469 int error; 470 union if_snd_tag_alloc_params params = { 471 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, 472 .rate_limit.hdr.flowid = flowid, 473 .rate_limit.hdr.flowtype = flowtype, 474 .rate_limit.max_rate = cfg_rate, 475 .rate_limit.flags = M_NOWAIT, 476 }; 477 478 error = m_snd_tag_alloc(ifp, ¶ms, tag); 479 #ifdef INET 480 if (error == 0) { 481 counter_u64_add(rate_limit_set_ok, 1); 482 counter_u64_add(rate_limit_active, 1); 483 } else if (error != EOPNOTSUPP) 484 counter_u64_add(rate_limit_alloc_fail, 1); 485 #endif 486 return (error); 487 } 488 489 static void 490 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act) 491 { 492 /* 493 * The internal table is "special", it 494 * is two seperate ordered tables that 495 * must be merged. We get here when the 496 * adapter specifies a number of rates that 497 * covers both ranges in the table in some 498 * form. 
499 */ 500 int i, at_low, at_high; 501 uint8_t low_disabled = 0, high_disabled = 0; 502 503 for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) { 504 rs->rs_rlt[i].flags = 0; 505 rs->rs_rlt[i].time_between = 0; 506 if ((low_disabled == 0) && 507 (high_disabled || 508 (rate_table_act[at_low] < rate_table_act[at_high]))) { 509 rs->rs_rlt[i].rate = rate_table_act[at_low]; 510 at_low++; 511 if (at_low == RS_NEXT_ORDER_GROUP) 512 low_disabled = 1; 513 } else if (high_disabled == 0) { 514 rs->rs_rlt[i].rate = rate_table_act[at_high]; 515 at_high++; 516 if (at_high == MAX_HDWR_RATES) 517 high_disabled = 1; 518 } 519 } 520 } 521 522 static struct tcp_rate_set * 523 rt_setup_new_rs(struct ifnet *ifp, int *error) 524 { 525 struct tcp_rate_set *rs; 526 const uint64_t *rate_table_act; 527 uint64_t lentim, res; 528 size_t sz; 529 uint32_t hash_type; 530 int i; 531 struct if_ratelimit_query_results rl; 532 struct sysctl_oid *rl_sysctl_root; 533 struct epoch_tracker et; 534 /* 535 * We expect to enter with the 536 * mutex locked. 537 */ 538 539 if (ifp->if_ratelimit_query == NULL) { 540 /* 541 * We can do nothing if we cannot 542 * get a query back from the driver. 543 */ 544 printf("Warning:No query functions for %s:%d-- failed\n", 545 ifp->if_dname, ifp->if_dunit); 546 return (NULL); 547 } 548 rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO); 549 if (rs == NULL) { 550 if (error) 551 *error = ENOMEM; 552 printf("Warning:No memory for malloc of tcp_rate_set\n"); 553 return (NULL); 554 } 555 memset(&rl, 0, sizeof(rl)); 556 rl.flags = RT_NOSUPPORT; 557 ifp->if_ratelimit_query(ifp, &rl); 558 if (rl.flags & RT_IS_UNUSABLE) { 559 /* 560 * The interface does not really support 561 * the rate-limiting. 
562 */ 563 memset(rs, 0, sizeof(struct tcp_rate_set)); 564 rs->rs_ifp = ifp; 565 rs->rs_if_dunit = ifp->if_dunit; 566 rs->rs_flags = RS_INTF_NO_SUP; 567 rs->rs_disable = 1; 568 rs_number_alive++; 569 sysctl_ctx_init(&rs->sysctl_ctx); 570 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 571 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 572 OID_AUTO, 573 rs->rs_ifp->if_xname, 574 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 575 ""); 576 rl_add_syctl_entries(rl_sysctl_root, rs); 577 NET_EPOCH_ENTER(et); 578 mtx_lock(&rs_mtx); 579 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 580 mtx_unlock(&rs_mtx); 581 NET_EPOCH_EXIT(et); 582 return (rs); 583 } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { 584 memset(rs, 0, sizeof(struct tcp_rate_set)); 585 rs->rs_ifp = ifp; 586 rs->rs_if_dunit = ifp->if_dunit; 587 rs->rs_flags = RS_IS_DEFF; 588 rs_number_alive++; 589 sysctl_ctx_init(&rs->sysctl_ctx); 590 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 591 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 592 OID_AUTO, 593 rs->rs_ifp->if_xname, 594 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 595 ""); 596 rl_add_syctl_entries(rl_sysctl_root, rs); 597 NET_EPOCH_ENTER(et); 598 mtx_lock(&rs_mtx); 599 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 600 mtx_unlock(&rs_mtx); 601 NET_EPOCH_EXIT(et); 602 return (rs); 603 } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { 604 /* Mellanox C4 likely */ 605 rs->rs_ifp = ifp; 606 rs->rs_if_dunit = ifp->if_dunit; 607 rs->rs_rate_cnt = rl.number_of_rates; 608 rs->rs_min_seg = rl.min_segment_burst; 609 rs->rs_highest_valid = 0; 610 rs->rs_flow_limit = rl.max_flows; 611 rs->rs_flags = RS_IS_INTF | RS_NO_PRE; 612 rs->rs_disable = 0; 613 rate_table_act = rl.rate_table; 614 } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) { 615 /* Chelsio, C5 and C6 of Mellanox? 
*/ 616 rs->rs_ifp = ifp; 617 rs->rs_if_dunit = ifp->if_dunit; 618 rs->rs_rate_cnt = rl.number_of_rates; 619 rs->rs_min_seg = rl.min_segment_burst; 620 rs->rs_disable = 0; 621 rs->rs_flow_limit = rl.max_flows; 622 rate_table_act = desired_rates; 623 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) && 624 (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) { 625 /* 626 * Our desired table is not big 627 * enough, do what we can. 628 */ 629 rs->rs_rate_cnt = MAX_HDWR_RATES; 630 } 631 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT) 632 rs->rs_flags = RS_IS_INTF; 633 else 634 rs->rs_flags = RS_IS_INTF | RS_INT_TBL; 635 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) 636 rs->rs_rate_cnt = ALL_HARDWARE_RATES; 637 } else { 638 free(rs, M_TCPPACE); 639 return (NULL); 640 } 641 sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt; 642 rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT); 643 if (rs->rs_rlt == NULL) { 644 if (error) 645 *error = ENOMEM; 646 bail: 647 free(rs, M_TCPPACE); 648 return (NULL); 649 } 650 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) { 651 /* 652 * The interface supports all 653 * the rates we could possibly want. 654 */ 655 uint64_t rat; 656 657 rs->rs_rlt[0].rate = 12500; /* 100k */ 658 rs->rs_rlt[1].rate = 25000; /* 200k */ 659 rs->rs_rlt[2].rate = 62500; /* 500k */ 660 /* Note 125000 == 1Megabit 661 * populate 1Meg - 1000meg. 662 */ 663 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) { 664 rs->rs_rlt[i].rate = rat; 665 rat += 125000; 666 } 667 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000; 668 } else if (rs->rs_flags & RS_INT_TBL) { 669 /* We populate this in a special way */ 670 populate_canned_table(rs, rate_table_act); 671 } else { 672 /* 673 * Just copy in the rates from 674 * the table, it is in order. 
675 */ 676 for (i=0; i<rs->rs_rate_cnt; i++) { 677 rs->rs_rlt[i].rate = rate_table_act[i]; 678 rs->rs_rlt[i].time_between = 0; 679 rs->rs_rlt[i].flags = 0; 680 } 681 } 682 for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) { 683 /* 684 * We go backwards through the list so that if we can't get 685 * a rate and fail to init one, we have at least a chance of 686 * getting the highest one. 687 */ 688 rs->rs_rlt[i].ptbl = rs; 689 rs->rs_rlt[i].tag = NULL; 690 rs->rs_rlt[i].using = 0; 691 rs->rs_rlt[i].rs_num_enobufs = 0; 692 /* 693 * Calculate the time between. 694 */ 695 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; 696 res = lentim / rs->rs_rlt[i].rate; 697 if (res > 0) 698 rs->rs_rlt[i].time_between = res; 699 else 700 rs->rs_rlt[i].time_between = 1; 701 if (rs->rs_flags & RS_NO_PRE) { 702 rs->rs_rlt[i].flags = HDWRPACE_INITED; 703 rs->rs_lowest_valid = i; 704 } else { 705 int err; 706 707 if ((rl.flags & RT_IS_SETUP_REQ) && 708 (ifp->if_ratelimit_query)) { 709 err = ifp->if_ratelimit_setup(ifp, 710 rs->rs_rlt[i].rate, i); 711 if (err) 712 goto handle_err; 713 } 714 #ifdef RSS 715 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 716 #else 717 hash_type = M_HASHTYPE_OPAQUE_HASH; 718 #endif 719 err = rl_attach_txrtlmt(ifp, 720 hash_type, 721 (i + 1), 722 rs->rs_rlt[i].rate, 723 &rs->rs_rlt[i].tag); 724 if (err) { 725 handle_err: 726 if (i == (rs->rs_rate_cnt - 1)) { 727 /* 728 * Huh - first rate and we can't get 729 * it? 730 */ 731 free(rs->rs_rlt, M_TCPPACE); 732 if (error) 733 *error = err; 734 goto bail; 735 } else { 736 if (error) 737 *error = err; 738 } 739 break; 740 } else { 741 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT; 742 rs->rs_lowest_valid = i; 743 } 744 } 745 } 746 /* Did we get at least 1 rate? 
*/ 747 if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED) 748 rs->rs_highest_valid = rs->rs_rate_cnt - 1; 749 else { 750 free(rs->rs_rlt, M_TCPPACE); 751 goto bail; 752 } 753 rs_number_alive++; 754 sysctl_ctx_init(&rs->sysctl_ctx); 755 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 756 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 757 OID_AUTO, 758 rs->rs_ifp->if_xname, 759 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 760 ""); 761 rl_add_syctl_entries(rl_sysctl_root, rs); 762 NET_EPOCH_ENTER(et); 763 mtx_lock(&rs_mtx); 764 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 765 mtx_unlock(&rs_mtx); 766 NET_EPOCH_EXIT(et); 767 return (rs); 768 } 769 770 /* 771 * For an explanation of why the argument is volatile please 772 * look at the comments around rt_setup_rate(). 773 */ 774 static const struct tcp_hwrate_limit_table * 775 tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs, 776 uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate) 777 { 778 struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; 779 uint64_t mbits_per_sec, ind_calc, previous_rate = 0; 780 int i; 781 782 mbits_per_sec = (bytes_per_sec * 8); 783 if (flags & RS_PACING_LT) { 784 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 785 (rs->rs_lowest_valid <= 2)){ 786 /* 787 * Smaller than 1Meg, only 788 * 3 entries can match it. 789 */ 790 previous_rate = 0; 791 for(i = rs->rs_lowest_valid; i < 3; i++) { 792 if (bytes_per_sec <= rs->rs_rlt[i].rate) { 793 rte = &rs->rs_rlt[i]; 794 break; 795 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { 796 arte = &rs->rs_rlt[i]; 797 } 798 previous_rate = rs->rs_rlt[i].rate; 799 } 800 goto done; 801 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && 802 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ 803 /* 804 * Larger than 1G (the majority of 805 * our table. 
806 */ 807 if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC) 808 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 809 else 810 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 811 previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate; 812 goto done; 813 } 814 /* 815 * If we reach here its in our table (between 1Meg - 1000Meg), 816 * just take the rounded down mbits per second, and add 817 * 1Megabit to it, from this we can calculate 818 * the index in the table. 819 */ 820 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 821 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec) 822 ind_calc++; 823 /* our table is offset by 3, we add 2 */ 824 ind_calc += 2; 825 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 826 /* This should not happen */ 827 ind_calc = ALL_HARDWARE_RATES-1; 828 } 829 if ((ind_calc >= rs->rs_lowest_valid) && 830 (ind_calc <= rs->rs_highest_valid)) { 831 rte = &rs->rs_rlt[ind_calc]; 832 if (ind_calc >= 1) 833 previous_rate = rs->rs_rlt[(ind_calc-1)].rate; 834 } 835 } else if (flags & RS_PACING_EXACT_MATCH) { 836 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 837 (rs->rs_lowest_valid <= 2)){ 838 for(i = rs->rs_lowest_valid; i < 3; i++) { 839 if (bytes_per_sec == rs->rs_rlt[i].rate) { 840 rte = &rs->rs_rlt[i]; 841 break; 842 } 843 } 844 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && 845 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { 846 /* > 1Gbps only one rate */ 847 if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) { 848 /* Its 10G wow */ 849 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 850 } 851 } else { 852 /* Ok it must be a exact meg (its between 1G and 1Meg) */ 853 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 854 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { 855 /* its an exact Mbps */ 856 ind_calc += 2; 857 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 858 /* This should not happen */ 859 ind_calc = ALL_HARDWARE_RATES-1; 860 } 861 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) 862 rte = &rs->rs_rlt[ind_calc]; 863 } 
864 } 865 } else { 866 /* we want greater than the requested rate */ 867 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 868 (rs->rs_lowest_valid <= 2)){ 869 arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */ 870 for (i=2; i>=rs->rs_lowest_valid; i--) { 871 if (bytes_per_sec < rs->rs_rlt[i].rate) { 872 rte = &rs->rs_rlt[i]; 873 if (i >= 1) { 874 previous_rate = rs->rs_rlt[(i-1)].rate; 875 } 876 break; 877 } else if ((flags & RS_PACING_GEQ) && 878 (bytes_per_sec == rs->rs_rlt[i].rate)) { 879 rte = &rs->rs_rlt[i]; 880 if (i >= 1) { 881 previous_rate = rs->rs_rlt[(i-1)].rate; 882 } 883 break; 884 } else { 885 arte = &rs->rs_rlt[i]; /* new alternate */ 886 } 887 } 888 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) { 889 if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && 890 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ 891 /* Our top rate is larger than the request */ 892 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 893 } else if ((flags & RS_PACING_GEQ) && 894 (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && 895 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { 896 /* It matches our top rate */ 897 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 898 } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) { 899 /* The top rate is an alternative */ 900 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 901 } 902 previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate; 903 } else { 904 /* Its in our range 1Meg - 1Gig */ 905 if (flags & RS_PACING_GEQ) { 906 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 907 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { 908 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 909 /* This should not happen */ 910 ind_calc = (ALL_HARDWARE_RATES-1); 911 } 912 rte = &rs->rs_rlt[ind_calc]; 913 if (ind_calc >= 1) 914 previous_rate = rs->rs_rlt[(ind_calc-1)].rate; 915 } 916 goto done; 917 } 918 ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC; 919 ind_calc 
+= 2; 920 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 921 /* This should not happen */ 922 ind_calc = ALL_HARDWARE_RATES-1; 923 } 924 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) { 925 rte = &rs->rs_rlt[ind_calc]; 926 if (ind_calc >= 1) 927 previous_rate = rs->rs_rlt[(ind_calc-1)].rate; 928 } 929 } 930 } 931 done: 932 if ((rte == NULL) && 933 (arte != NULL) && 934 (flags & RS_PACING_SUB_OK)) { 935 /* We can use the substitute */ 936 rte = arte; 937 } 938 if (lower_rate) 939 *lower_rate = previous_rate; 940 return (rte); 941 } 942 943 /* 944 * For an explanation of why the argument is volatile please 945 * look at the comments around rt_setup_rate(). 946 */ 947 static const struct tcp_hwrate_limit_table * 948 tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate) 949 { 950 /** 951 * Hunt the rate table with the restrictions in flags and find a 952 * suitable rate if possible. 953 * RS_PACING_EXACT_MATCH - look for an exact match to rate. 954 * RS_PACING_GT - must be greater than. 955 * RS_PACING_GEQ - must be greater than or equal. 956 * RS_PACING_LT - must be less than. 957 * RS_PACING_SUB_OK - If we don't meet criteria a 958 * substitute is ok. 959 */ 960 int i, matched; 961 struct tcp_hwrate_limit_table *rte = NULL; 962 uint64_t previous_rate = 0; 963 964 if ((rs->rs_flags & RS_INT_TBL) && 965 (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) { 966 /* 967 * Here we don't want to paw thru 968 * a big table, we have everything 969 * from 1Meg - 1000Meg in 1Meg increments. 970 * Use an alternate method to "lookup". 971 */ 972 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate)); 973 } 974 if ((flags & RS_PACING_LT) || 975 (flags & RS_PACING_EXACT_MATCH)) { 976 /* 977 * For exact and less than we go forward through the table. 978 * This way when we find one larger we stop (exact was a 979 * toss up). 
980 */ 981 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) { 982 if ((flags & RS_PACING_EXACT_MATCH) && 983 (bytes_per_sec == rs->rs_rlt[i].rate)) { 984 rte = &rs->rs_rlt[i]; 985 matched = 1; 986 if (lower_rate != NULL) 987 *lower_rate = previous_rate; 988 break; 989 } else if ((flags & RS_PACING_LT) && 990 (bytes_per_sec <= rs->rs_rlt[i].rate)) { 991 rte = &rs->rs_rlt[i]; 992 matched = 1; 993 if (lower_rate != NULL) 994 *lower_rate = previous_rate; 995 break; 996 } 997 previous_rate = rs->rs_rlt[i].rate; 998 if (bytes_per_sec > rs->rs_rlt[i].rate) 999 break; 1000 } 1001 if ((matched == 0) && 1002 (flags & RS_PACING_LT) && 1003 (flags & RS_PACING_SUB_OK)) { 1004 /* Kick in a substitute (the lowest) */ 1005 rte = &rs->rs_rlt[rs->rs_lowest_valid]; 1006 } 1007 } else { 1008 /* 1009 * Here we go backward through the table so that we can find 1010 * the one greater in theory faster (but its probably a 1011 * wash). 1012 */ 1013 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) { 1014 if (rs->rs_rlt[i].rate > bytes_per_sec) { 1015 /* A possible candidate */ 1016 rte = &rs->rs_rlt[i]; 1017 } 1018 if ((flags & RS_PACING_GEQ) && 1019 (bytes_per_sec == rs->rs_rlt[i].rate)) { 1020 /* An exact match and we want equal */ 1021 matched = 1; 1022 rte = &rs->rs_rlt[i]; 1023 break; 1024 } else if (rte) { 1025 /* 1026 * Found one that is larger than but don't 1027 * stop, there may be a more closer match. 1028 */ 1029 matched = 1; 1030 } 1031 if (rs->rs_rlt[i].rate < bytes_per_sec) { 1032 /* 1033 * We found a table entry that is smaller, 1034 * stop there will be none greater or equal. 
1035 */ 1036 if (lower_rate != NULL) 1037 *lower_rate = rs->rs_rlt[i].rate; 1038 break; 1039 } 1040 } 1041 if ((matched == 0) && 1042 (flags & RS_PACING_SUB_OK)) { 1043 /* Kick in a substitute (the highest) */ 1044 rte = &rs->rs_rlt[rs->rs_highest_valid]; 1045 } 1046 } 1047 return (rte); 1048 } 1049 1050 static struct ifnet * 1051 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) 1052 { 1053 struct ifnet *tifp; 1054 struct m_snd_tag *tag, *ntag; 1055 union if_snd_tag_alloc_params params = { 1056 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, 1057 .rate_limit.hdr.flowid = inp->inp_flowid, 1058 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 1059 .rate_limit.max_rate = COMMON_RATE, 1060 .rate_limit.flags = M_NOWAIT, 1061 }; 1062 int err; 1063 #ifdef RSS 1064 params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ? 1065 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4); 1066 #else 1067 params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH; 1068 #endif 1069 err = m_snd_tag_alloc(ifp, ¶ms, &tag); 1070 if (err) { 1071 /* Failed to setup a tag? 
*/ 1072 if (error) 1073 *error = err; 1074 return (NULL); 1075 } 1076 ntag = tag; 1077 while (ntag->sw->next_snd_tag != NULL) { 1078 ntag = ntag->sw->next_snd_tag(ntag); 1079 } 1080 tifp = ntag->ifp; 1081 m_snd_tag_rele(tag); 1082 return (tifp); 1083 } 1084 1085 static void 1086 rl_increment_using(const struct tcp_hwrate_limit_table *rte) 1087 { 1088 struct tcp_hwrate_limit_table *decon_rte; 1089 1090 decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); 1091 atomic_add_long(&decon_rte->using, 1); 1092 } 1093 1094 static void 1095 rl_decrement_using(const struct tcp_hwrate_limit_table *rte) 1096 { 1097 struct tcp_hwrate_limit_table *decon_rte; 1098 1099 decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); 1100 atomic_subtract_long(&decon_rte->using, 1); 1101 } 1102 1103 void 1104 tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte) 1105 { 1106 struct tcp_hwrate_limit_table *decon_rte; 1107 1108 decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); 1109 atomic_add_long(&decon_rte->rs_num_enobufs, 1); 1110 } 1111 1112 /* 1113 * Do NOT take the __noinline out of the 1114 * find_rs_for_ifp() function. If you do the inline 1115 * of it for the rt_setup_rate() will show you a 1116 * compiler bug. For some reason the compiler thinks 1117 * the list can never be empty. The consequence of 1118 * this will be a crash when we dereference NULL 1119 * if an ifp is removed just has a hw rate limit 1120 * is attempted. 
If you are working on the compiler 1121 * and want to "test" this go ahead and take the noinline 1122 * out otherwise let sleeping dogs ly until such time 1123 * as we get a compiler fix 10/2/20 -- RRS 1124 */ 1125 static __noinline struct tcp_rate_set * 1126 find_rs_for_ifp(struct ifnet *ifp) 1127 { 1128 struct tcp_rate_set *rs; 1129 1130 CK_LIST_FOREACH(rs, &int_rs, next) { 1131 if ((rs->rs_ifp == ifp) && 1132 (rs->rs_if_dunit == ifp->if_dunit)) { 1133 /* Ok we found it */ 1134 return (rs); 1135 } 1136 } 1137 return (NULL); 1138 } 1139 1140 1141 static const struct tcp_hwrate_limit_table * 1142 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec, 1143 uint32_t flags, int *error, uint64_t *lower_rate) 1144 { 1145 /* First lets find the interface if it exists */ 1146 const struct tcp_hwrate_limit_table *rte; 1147 /* 1148 * So why is rs volatile? This is to defeat a 1149 * compiler bug where in the compiler is convinced 1150 * that rs can never be NULL (which is not true). Because 1151 * of its conviction it nicely optimizes out the if ((rs == NULL 1152 * below which means if you get a NULL back you dereference it. 1153 */ 1154 volatile struct tcp_rate_set *rs; 1155 struct epoch_tracker et; 1156 struct ifnet *oifp = ifp; 1157 int err; 1158 1159 NET_EPOCH_ENTER(et); 1160 use_real_interface: 1161 rs = find_rs_for_ifp(ifp); 1162 if ((rs == NULL) || 1163 (rs->rs_flags & RS_INTF_NO_SUP) || 1164 (rs->rs_flags & RS_IS_DEAD)) { 1165 /* 1166 * This means we got a packet *before* 1167 * the IF-UP was processed below, <or> 1168 * while or after we already received an interface 1169 * departed event. In either case we really don't 1170 * want to do anything with pacing, in 1171 * the departing case the packet is not 1172 * going to go very far. The new case 1173 * might be arguable, but its impossible 1174 * to tell from the departing case. 
1175 */ 1176 if (error) 1177 *error = ENODEV; 1178 NET_EPOCH_EXIT(et); 1179 return (NULL); 1180 } 1181 1182 if ((rs == NULL) || (rs->rs_disable != 0)) { 1183 if (error) 1184 *error = ENOSPC; 1185 NET_EPOCH_EXIT(et); 1186 return (NULL); 1187 } 1188 if (rs->rs_flags & RS_IS_DEFF) { 1189 /* We need to find the real interface */ 1190 struct ifnet *tifp; 1191 1192 tifp = rt_find_real_interface(ifp, inp, error); 1193 if (tifp == NULL) { 1194 if (rs->rs_disable && error) 1195 *error = ENOTSUP; 1196 NET_EPOCH_EXIT(et); 1197 return (NULL); 1198 } 1199 KASSERT((tifp != ifp), 1200 ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n", 1201 ifp, inp, tifp)); 1202 ifp = tifp; 1203 goto use_real_interface; 1204 } 1205 if (rs->rs_flow_limit && 1206 ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) { 1207 if (error) 1208 *error = ENOSPC; 1209 NET_EPOCH_EXIT(et); 1210 return (NULL); 1211 } 1212 rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate); 1213 if (rte) { 1214 err = in_pcbattach_txrtlmt(inp, oifp, 1215 inp->inp_flowtype, 1216 inp->inp_flowid, 1217 rte->rate, 1218 &inp->inp_snd_tag); 1219 if (err) { 1220 /* Failed to attach */ 1221 if (error) 1222 *error = err; 1223 rte = NULL; 1224 } else { 1225 KASSERT((inp->inp_snd_tag != NULL) , 1226 ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p", 1227 inp, rte, (unsigned long long)rte->rate, rs)); 1228 #ifdef INET 1229 counter_u64_add(rate_limit_new, 1); 1230 #endif 1231 } 1232 } 1233 if (rte) { 1234 /* 1235 * We use an atomic here for accounting so we don't have to 1236 * use locks when freeing. 
1237 */ 1238 atomic_add_64(&rs->rs_flows_using, 1); 1239 } 1240 NET_EPOCH_EXIT(et); 1241 return (rte); 1242 } 1243 1244 static void 1245 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state) 1246 { 1247 int error; 1248 struct tcp_rate_set *rs; 1249 struct epoch_tracker et; 1250 1251 if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) || 1252 (link_state != LINK_STATE_UP)) { 1253 /* 1254 * We only care on an interface going up that is rate-limit 1255 * capable. 1256 */ 1257 return; 1258 } 1259 NET_EPOCH_ENTER(et); 1260 mtx_lock(&rs_mtx); 1261 rs = find_rs_for_ifp(ifp); 1262 if (rs) { 1263 /* We already have initialized this guy */ 1264 mtx_unlock(&rs_mtx); 1265 NET_EPOCH_EXIT(et); 1266 return; 1267 } 1268 mtx_unlock(&rs_mtx); 1269 NET_EPOCH_EXIT(et); 1270 rt_setup_new_rs(ifp, &error); 1271 } 1272 1273 static void 1274 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) 1275 { 1276 struct tcp_rate_set *rs; 1277 struct epoch_tracker et; 1278 int i; 1279 1280 NET_EPOCH_ENTER(et); 1281 mtx_lock(&rs_mtx); 1282 rs = find_rs_for_ifp(ifp); 1283 if (rs) { 1284 CK_LIST_REMOVE(rs, next); 1285 rs_number_alive--; 1286 rs->rs_flags |= RS_IS_DEAD; 1287 for (i = 0; i < rs->rs_rate_cnt; i++) { 1288 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 1289 in_pcbdetach_tag(rs->rs_rlt[i].tag); 1290 rs->rs_rlt[i].tag = NULL; 1291 } 1292 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 1293 } 1294 if (rs->rs_flows_using == 0) 1295 rs_defer_destroy(rs); 1296 } 1297 mtx_unlock(&rs_mtx); 1298 NET_EPOCH_EXIT(et); 1299 } 1300 1301 static void 1302 tcp_rl_shutdown(void *arg __unused, int howto __unused) 1303 { 1304 struct tcp_rate_set *rs, *nrs; 1305 struct epoch_tracker et; 1306 int i; 1307 1308 NET_EPOCH_ENTER(et); 1309 mtx_lock(&rs_mtx); 1310 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { 1311 CK_LIST_REMOVE(rs, next); 1312 rs_number_alive--; 1313 rs->rs_flags |= RS_IS_DEAD; 1314 for (i = 0; i < rs->rs_rate_cnt; i++) { 1315 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 
1316 in_pcbdetach_tag(rs->rs_rlt[i].tag); 1317 rs->rs_rlt[i].tag = NULL; 1318 } 1319 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 1320 } 1321 if (rs->rs_flows_using == 0) 1322 rs_defer_destroy(rs); 1323 } 1324 mtx_unlock(&rs_mtx); 1325 NET_EPOCH_EXIT(et); 1326 } 1327 1328 const struct tcp_hwrate_limit_table * 1329 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, 1330 uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate) 1331 { 1332 struct inpcb *inp = tptoinpcb(tp); 1333 const struct tcp_hwrate_limit_table *rte; 1334 #ifdef KERN_TLS 1335 struct ktls_session *tls; 1336 #endif 1337 1338 INP_WLOCK_ASSERT(inp); 1339 1340 if (inp->inp_snd_tag == NULL) { 1341 /* 1342 * We are setting up a rate for the first time. 1343 */ 1344 if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) { 1345 /* Not supported by the egress */ 1346 if (error) 1347 *error = ENODEV; 1348 return (NULL); 1349 } 1350 #ifdef KERN_TLS 1351 tls = NULL; 1352 if (tp->t_nic_ktls_xmit != 0) { 1353 tls = tptosocket(tp)->so_snd.sb_tls_info; 1354 1355 if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 || 1356 tls->mode != TCP_TLS_MODE_IFNET) { 1357 if (error) 1358 *error = ENODEV; 1359 return (NULL); 1360 } 1361 } 1362 #endif 1363 rte = rt_setup_rate(inp, ifp, bytes_per_sec, flags, error, lower_rate); 1364 if (rte) 1365 rl_increment_using(rte); 1366 #ifdef KERN_TLS 1367 if (rte != NULL && tls != NULL && tls->snd_tag != NULL) { 1368 /* 1369 * Fake a route change error to reset the TLS 1370 * send tag. This will convert the existing 1371 * tag to a TLS ratelimit tag. 1372 */ 1373 MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS); 1374 ktls_output_eagain(inp, tls); 1375 } 1376 #endif 1377 } else { 1378 /* 1379 * We are modifying a rate, wrong interface? 
1380 */ 1381 if (error) 1382 *error = EINVAL; 1383 rte = NULL; 1384 } 1385 if (rte != NULL) { 1386 tp->t_pacing_rate = rte->rate; 1387 *error = 0; 1388 } 1389 return (rte); 1390 } 1391 1392 const struct tcp_hwrate_limit_table * 1393 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, 1394 struct tcpcb *tp, struct ifnet *ifp, 1395 uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate) 1396 { 1397 struct inpcb *inp = tptoinpcb(tp); 1398 const struct tcp_hwrate_limit_table *nrte; 1399 const struct tcp_rate_set *rs; 1400 #ifdef KERN_TLS 1401 struct ktls_session *tls = NULL; 1402 #endif 1403 int err; 1404 1405 INP_WLOCK_ASSERT(inp); 1406 1407 if (crte == NULL) { 1408 /* Wrong interface */ 1409 if (error) 1410 *error = EINVAL; 1411 return (NULL); 1412 } 1413 1414 #ifdef KERN_TLS 1415 if (tp->t_nic_ktls_xmit) { 1416 tls = tptosocket(tp)->so_snd.sb_tls_info; 1417 if (tls->mode != TCP_TLS_MODE_IFNET) 1418 tls = NULL; 1419 else if (tls->snd_tag != NULL && 1420 tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) { 1421 if (!tls->reset_pending) { 1422 /* 1423 * NIC probably doesn't support 1424 * ratelimit TLS tags if it didn't 1425 * allocate one when an existing rate 1426 * was present, so ignore. 1427 */ 1428 tcp_rel_pacing_rate(crte, tp); 1429 if (error) 1430 *error = EOPNOTSUPP; 1431 return (NULL); 1432 } 1433 1434 /* 1435 * The send tag is being converted, so set the 1436 * rate limit on the inpcb tag. There is a 1437 * race that the new NIC send tag might use 1438 * the current rate instead of this one. 
1439 */ 1440 tls = NULL; 1441 } 1442 } 1443 #endif 1444 if (inp->inp_snd_tag == NULL) { 1445 /* Wrong interface */ 1446 tcp_rel_pacing_rate(crte, tp); 1447 if (error) 1448 *error = EINVAL; 1449 return (NULL); 1450 } 1451 rs = crte->ptbl; 1452 if ((rs->rs_flags & RS_IS_DEAD) || 1453 (crte->flags & HDWRPACE_IFPDEPARTED)) { 1454 /* Release the rate, and try anew */ 1455 1456 tcp_rel_pacing_rate(crte, tp); 1457 nrte = tcp_set_pacing_rate(tp, ifp, 1458 bytes_per_sec, flags, error, lower_rate); 1459 return (nrte); 1460 } 1461 nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate); 1462 if (nrte == crte) { 1463 /* No change */ 1464 if (error) 1465 *error = 0; 1466 return (crte); 1467 } 1468 if (nrte == NULL) { 1469 /* Release the old rate */ 1470 if (error) 1471 *error = ENOENT; 1472 tcp_rel_pacing_rate(crte, tp); 1473 return (NULL); 1474 } 1475 rl_decrement_using(crte); 1476 rl_increment_using(nrte); 1477 /* Change rates to our new entry */ 1478 #ifdef KERN_TLS 1479 if (tls != NULL) 1480 err = ktls_modify_txrtlmt(tls, nrte->rate); 1481 else 1482 #endif 1483 err = in_pcbmodify_txrtlmt(inp, nrte->rate); 1484 if (err) { 1485 struct tcp_rate_set *lrs; 1486 uint64_t pre; 1487 1488 rl_decrement_using(nrte); 1489 lrs = __DECONST(struct tcp_rate_set *, rs); 1490 pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1); 1491 /* Do we still have a snd-tag attached? */ 1492 if (inp->inp_snd_tag) 1493 in_pcbdetach_txrtlmt(inp); 1494 1495 if (pre == 1) { 1496 struct epoch_tracker et; 1497 1498 NET_EPOCH_ENTER(et); 1499 mtx_lock(&rs_mtx); 1500 /* 1501 * Is it dead? 
1502 */ 1503 if (lrs->rs_flags & RS_IS_DEAD) 1504 rs_defer_destroy(lrs); 1505 mtx_unlock(&rs_mtx); 1506 NET_EPOCH_EXIT(et); 1507 } 1508 if (error) 1509 *error = err; 1510 return (NULL); 1511 } else { 1512 #ifdef INET 1513 counter_u64_add(rate_limit_chg, 1); 1514 #endif 1515 } 1516 if (error) 1517 *error = 0; 1518 tp->t_pacing_rate = nrte->rate; 1519 return (nrte); 1520 } 1521 1522 void 1523 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp) 1524 { 1525 struct inpcb *inp = tptoinpcb(tp); 1526 const struct tcp_rate_set *crs; 1527 struct tcp_rate_set *rs; 1528 uint64_t pre; 1529 1530 INP_WLOCK_ASSERT(inp); 1531 1532 tp->t_pacing_rate = -1; 1533 crs = crte->ptbl; 1534 /* 1535 * Now we must break the const 1536 * in order to release our refcount. 1537 */ 1538 rs = __DECONST(struct tcp_rate_set *, crs); 1539 rl_decrement_using(crte); 1540 pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); 1541 if (pre == 1) { 1542 struct epoch_tracker et; 1543 1544 NET_EPOCH_ENTER(et); 1545 mtx_lock(&rs_mtx); 1546 /* 1547 * Is it dead? 1548 */ 1549 if (rs->rs_flags & RS_IS_DEAD) 1550 rs_defer_destroy(rs); 1551 mtx_unlock(&rs_mtx); 1552 NET_EPOCH_EXIT(et); 1553 } 1554 1555 /* 1556 * XXX: If this connection is using ifnet TLS, should we 1557 * switch it to using an unlimited rate, or perhaps use 1558 * ktls_output_eagain() to reset the send tag to a plain 1559 * TLS tag? 
1560 */ 1561 in_pcbdetach_txrtlmt(inp); 1562 } 1563 1564 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ 1565 #define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */ 1566 #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */ 1567 #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */ 1568 1569 static void 1570 tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso, 1571 uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between, 1572 uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod) 1573 { 1574 if (tcp_bblogging_on(tp)) { 1575 union tcp_log_stackspecific log; 1576 struct timeval tv; 1577 1578 memset(&log, 0, sizeof(log)); 1579 log.u_bbr.flex1 = segsiz; 1580 log.u_bbr.flex2 = new_tso; 1581 log.u_bbr.flex3 = time_between; 1582 log.u_bbr.flex4 = calc_time_between; 1583 log.u_bbr.flex5 = segs; 1584 log.u_bbr.flex6 = res_div; 1585 log.u_bbr.flex7 = mult; 1586 log.u_bbr.flex8 = mod; 1587 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1588 log.u_bbr.cur_del_rate = bw; 1589 log.u_bbr.delRate = hw_rate; 1590 TCP_LOG_EVENTP(tp, NULL, 1591 &tptosocket(tp)->so_rcv, 1592 &tptosocket(tp)->so_snd, 1593 TCP_HDWR_PACE_SIZE, 0, 1594 0, &log, false, &tv); 1595 } 1596 } 1597 1598 uint32_t 1599 tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss, 1600 const struct tcp_hwrate_limit_table *te, int *err, int divisor) 1601 { 1602 /* 1603 * We use the google formula to calculate the 1604 * TSO size. I.E. 1605 * bw < 24Meg 1606 * tso = 2mss 1607 * else 1608 * tso = min(bw/(div=1000), 64k) 1609 * 1610 * Note for these calculations we ignore the 1611 * packet overhead (enet hdr, ip hdr and tcp hdr). 1612 * We only get the google formula when we have 1613 * divisor = 1000, which is the default for now. 
1614 */ 1615 uint64_t lentim, res, bytes; 1616 uint32_t new_tso, min_tso_segs; 1617 1618 /* It can't be zero */ 1619 if ((divisor == 0) || 1620 (divisor < RL_MIN_DIVISOR)) { 1621 if (mss_divisor) 1622 bytes = bw / mss_divisor; 1623 else 1624 bytes = bw / 1000; 1625 } else 1626 bytes = bw / divisor; 1627 /* We can't ever send more than 65k in a TSO */ 1628 if (bytes > 0xffff) { 1629 bytes = 0xffff; 1630 } 1631 /* Round up */ 1632 new_tso = (bytes + segsiz - 1) / segsiz; 1633 /* Are we enforcing even boundaries? */ 1634 if (even_num_segs && (new_tso & 1) && (new_tso > even_threshold)) 1635 new_tso++; 1636 if (can_use_1mss) 1637 min_tso_segs = 1; 1638 else 1639 min_tso_segs = 2; 1640 if (rs_floor_mss && (new_tso < rs_floor_mss)) 1641 new_tso = rs_floor_mss; 1642 else if (new_tso < min_tso_segs) 1643 new_tso = min_tso_segs; 1644 if (new_tso > MAX_MSS_SENT) 1645 new_tso = MAX_MSS_SENT; 1646 new_tso *= segsiz; 1647 tcp_log_pacing_size(tp, bw, segsiz, new_tso, 1648 0, 0, 0, 0, 0, 0, 1); 1649 /* 1650 * If we are not doing hardware pacing 1651 * then we are done. 1652 */ 1653 if (te == NULL) { 1654 if (err) 1655 *err = 0; 1656 return(new_tso); 1657 } 1658 /* 1659 * For hardware pacing we look at the 1660 * rate you are sending at and compare 1661 * that to the rate you have in hardware. 1662 * 1663 * If the hardware rate is slower than your 1664 * software rate then you are in error and 1665 * we will build a queue in our hardware whic 1666 * is probably not desired, in such a case 1667 * just return the non-hardware TSO size. 1668 * 1669 * If the rate in hardware is faster (which 1670 * it should be) then look at how long it 1671 * takes to send one ethernet segment size at 1672 * your b/w and compare that to the time it 1673 * takes to send at the rate you had selected. 1674 * 1675 * If your time is greater (which we hope it is) 1676 * we get the delta between the two, and then 1677 * divide that into your pacing time. 
This tells 1678 * us how many MSS you can send down at once (rounded up). 1679 * 1680 * Note we also double this value if the b/w is over 1681 * 100Mbps. If its over 500meg we just set you to the 1682 * max (43 segments). 1683 */ 1684 if (te->rate > FIVE_HUNDRED_MBPS) 1685 goto max; 1686 if (te->rate == bw) { 1687 /* We are pacing at exactly the hdwr rate */ 1688 max: 1689 tcp_log_pacing_size(tp, bw, segsiz, new_tso, 1690 te->rate, te->time_between, (uint32_t)0, 1691 (segsiz * MAX_MSS_SENT), 0, 0, 3); 1692 return (segsiz * MAX_MSS_SENT); 1693 } 1694 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; 1695 res = lentim / bw; 1696 if (res > te->time_between) { 1697 uint32_t delta, segs, res_div; 1698 1699 res_div = ((res * num_of_waits_allowed) + wait_time_floor); 1700 delta = res - te->time_between; 1701 segs = (res_div + delta - 1)/delta; 1702 if (segs < min_tso_segs) 1703 segs = min_tso_segs; 1704 if (segs < rs_hw_floor_mss) 1705 segs = rs_hw_floor_mss; 1706 if (segs > MAX_MSS_SENT) 1707 segs = MAX_MSS_SENT; 1708 segs *= segsiz; 1709 tcp_log_pacing_size(tp, bw, segsiz, new_tso, 1710 te->rate, te->time_between, (uint32_t)res, 1711 segs, res_div, 1, 3); 1712 if (err) 1713 *err = 0; 1714 if (segs < new_tso) { 1715 /* unexpected ? */ 1716 return(new_tso); 1717 } else { 1718 return (segs); 1719 } 1720 } else { 1721 /* 1722 * Your time is smaller which means 1723 * we will grow a queue on our 1724 * hardware. Send back the non-hardware 1725 * rate. 
1726 */ 1727 tcp_log_pacing_size(tp, bw, segsiz, new_tso, 1728 te->rate, te->time_between, (uint32_t)res, 1729 0, 0, 0, 4); 1730 if (err) 1731 *err = -1; 1732 return (new_tso); 1733 } 1734 } 1735 1736 uint64_t 1737 tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp) 1738 { 1739 struct epoch_tracker et; 1740 struct tcp_rate_set *rs; 1741 uint64_t rate_ret; 1742 1743 NET_EPOCH_ENTER(et); 1744 use_next_interface: 1745 rs = find_rs_for_ifp(ifp); 1746 if (rs == NULL) { 1747 /* This interface does not do ratelimiting */ 1748 rate_ret = 0; 1749 } else if (rs->rs_flags & RS_IS_DEFF) { 1750 /* We need to find the real interface */ 1751 struct ifnet *tifp; 1752 1753 tifp = rt_find_real_interface(ifp, inp, NULL); 1754 if (tifp == NULL) { 1755 NET_EPOCH_EXIT(et); 1756 return (0); 1757 } 1758 ifp = tifp; 1759 goto use_next_interface; 1760 } else { 1761 /* Lets return the highest rate this guy has */ 1762 rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate; 1763 } 1764 NET_EPOCH_EXIT(et); 1765 return(rate_ret); 1766 } 1767 1768 static eventhandler_tag rl_ifnet_departs; 1769 static eventhandler_tag rl_ifnet_arrives; 1770 static eventhandler_tag rl_shutdown_start; 1771 1772 static void 1773 tcp_rs_init(void *st __unused) 1774 { 1775 CK_LIST_INIT(&int_rs); 1776 rs_number_alive = 0; 1777 rs_number_dead = 0; 1778 mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); 1779 rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, 1780 tcp_rl_ifnet_departure, 1781 NULL, EVENTHANDLER_PRI_ANY); 1782 rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event, 1783 tcp_rl_ifnet_link, 1784 NULL, EVENTHANDLER_PRI_ANY); 1785 rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync, 1786 tcp_rl_shutdown, NULL, 1787 SHUTDOWN_PRI_FIRST); 1788 printf("TCP_ratelimit: Is now initialized\n"); 1789 } 1790 1791 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL); 1792 #endif 1793