1 /*- 2 * 3 * SPDX-License-Identifier: BSD-3-Clause 4 * 5 * Copyright (c) 2018-2020 6 * Netflix Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 */ 30 /** 31 * Author: Randall Stewart <rrs@netflix.com> 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 #include "opt_inet.h" 37 #include "opt_inet6.h" 38 #include "opt_ipsec.h" 39 #include "opt_tcpdebug.h" 40 #include "opt_ratelimit.h" 41 #include <sys/param.h> 42 #include <sys/kernel.h> 43 #include <sys/malloc.h> 44 #include <sys/mbuf.h> 45 #include <sys/socket.h> 46 #include <sys/socketvar.h> 47 #include <sys/sysctl.h> 48 #include <sys/eventhandler.h> 49 #include <sys/mutex.h> 50 #include <sys/ck.h> 51 #include <net/if.h> 52 #include <net/if_var.h> 53 #include <netinet/in.h> 54 #include <netinet/in_pcb.h> 55 #define TCPSTATES /* for logging */ 56 #include <netinet/tcp_var.h> 57 #ifdef INET6 58 #include <netinet6/tcp6_var.h> 59 #endif 60 #include <netinet/tcp_ratelimit.h> 61 #ifndef USECS_IN_SECOND 62 #define USECS_IN_SECOND 1000000 63 #endif 64 /* 65 * For the purposes of each send, what is the size 66 * of an ethernet frame. 67 */ 68 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory"); 69 #ifdef RATELIMIT 70 71 /* 72 * The following preferred table will seem weird to 73 * the casual viewer. Why do we not have any rates below 74 * 1Mbps? Why do we have a rate at 1.44Mbps called common? 75 * Why do the rates cluster in the 1-100Mbps range more 76 * than others? Why does the table jump around at the beginnign 77 * and then be more consistently raising? 78 * 79 * Let me try to answer those questions. A lot of 80 * this is dependant on the hardware. We have three basic 81 * supporters of rate limiting 82 * 83 * Chelsio - Supporting 16 configurable rates. 84 * Mlx - c4 supporting 13 fixed rates. 85 * Mlx - c5 & c6 supporting 127 configurable rates. 86 * 87 * The c4 is why we have a common rate that is available 88 * in all rate tables. This is a selected rate from the 89 * c4 table and we assure its available in all ratelimit 90 * tables. This way the tcp_ratelimit code has an assured 91 * rate it should always be able to get. This answers a 92 * couple of the questions above. 93 * 94 * So what about the rest, well the table is built to 95 * try to get the most out of a joint hardware/software 96 * pacing system. The software pacer will always pick 97 * a rate higher than the b/w that it is estimating 98 * 99 * on the path. This is done for two reasons. 100 * a) So we can discover more b/w 101 * and 102 * b) So we can send a block of MSS's down and then 103 * have the software timer go off after the previous 104 * send is completely out of the hardware. 105 * 106 * But when we do <b> we don't want to have the delay 107 * between the last packet sent by the hardware be 108 * excessively long (to reach our desired rate). 109 * 110 * So let me give an example for clarity. 111 * 112 * Lets assume that the tcp stack sees that 29,110,000 bps is 113 * what the bw of the path is. The stack would select the 114 * rate 31Mbps. 31Mbps means that each send that is done 115 * by the hardware will cause a 390 micro-second gap between 116 * the packets sent at that rate. For 29,110,000 bps we 117 * would need 416 micro-seconds gap between each send. 118 * 119 * Note that are calculating a complete time for pacing 120 * which includes the ethernet, IP and TCP overhead. So 121 * a full 1514 bytes is used for the above calculations. 122 * My testing has shown that both cards are also using this 123 * as their basis i.e. full payload size of the ethernet frame. 124 * The TCP stack caller needs to be aware of this and make the 125 * appropriate overhead calculations be included in its choices. 126 * 127 * Now, continuing our example, we pick a MSS size based on the 128 * delta between the two rates (416 - 390) divided into the rate 129 * we really wish to send at rounded up. That results in a MSS 130 * send of 17 mss's at once. The hardware then will 131 * run out of data in a single 17MSS send in 6,630 micro-seconds. 132 * 133 * On the other hand the software pacer will send more data 134 * in 7,072 micro-seconds. This means that we will refill 135 * the hardware 52 microseconds after it would have sent 136 * next if it had not ran out of data. This is a win since we are 137 * only sending every 7ms or so and yet all the packets are spaced on 138 * the wire with 94% of what they should be and only 139 * the last packet is delayed extra to make up for the 140 * difference. 141 * 142 * Note that the above formula has two important caveat. 143 * If we are above (b/w wise) over 100Mbps we double the result 144 * of the MSS calculation. The second caveat is if we are 500Mbps 145 * or more we just send the maximum MSS at once i.e. 45MSS. At 146 * the higher b/w's even the cards have limits to what times (timer granularity) 147 * they can insert between packets and start to send more than one 148 * packet at a time on the wire. 149 * 150 */ 151 #define COMMON_RATE 180500 152 const uint64_t desired_rates[] = { 153 122500, /* 1Mbps - rate 1 */ 154 180500, /* 1.44Mpbs - rate 2 common rate */ 155 375000, /* 3Mbps - rate 3 */ 156 625000, /* 5Mbps - rate 4 */ 157 875000, /* 7Mbps - rate 5 */ 158 1125000, /* 9Mbps - rate 6 */ 159 1375000, /* 11Mbps - rate 7 */ 160 1625000, /* 13Mbps - rate 8 */ 161 2625000, /* 21Mbps - rate 9 */ 162 3875000, /* 31Mbps - rate 10 */ 163 5125000, /* 41Meg - rate 11 */ 164 12500000, /* 100Mbps - rate 12 */ 165 25000000, /* 200Mbps - rate 13 */ 166 50000000, /* 400Mbps - rate 14 */ 167 63750000, /* 51Mbps - rate 15 */ 168 100000000, /* 800Mbps - rate 16 */ 169 1875000, /* 15Mbps - rate 17 */ 170 2125000, /* 17Mbps - rate 18 */ 171 2375000, /* 19Mbps - rate 19 */ 172 2875000, /* 23Mbps - rate 20 */ 173 3125000, /* 25Mbps - rate 21 */ 174 3375000, /* 27Mbps - rate 22 */ 175 3625000, /* 29Mbps - rate 23 */ 176 4125000, /* 33Mbps - rate 24 */ 177 4375000, /* 35Mbps - rate 25 */ 178 4625000, /* 37Mbps - rate 26 */ 179 4875000, /* 39Mbps - rate 27 */ 180 5375000, /* 43Mbps - rate 28 */ 181 5625000, /* 45Mbps - rate 29 */ 182 5875000, /* 47Mbps - rate 30 */ 183 6125000, /* 49Mbps - rate 31 */ 184 6625000, /* 53Mbps - rate 32 */ 185 6875000, /* 55Mbps - rate 33 */ 186 7125000, /* 57Mbps - rate 34 */ 187 7375000, /* 59Mbps - rate 35 */ 188 7625000, /* 61Mbps - rate 36 */ 189 7875000, /* 63Mbps - rate 37 */ 190 8125000, /* 65Mbps - rate 38 */ 191 8375000, /* 67Mbps - rate 39 */ 192 8625000, /* 69Mbps - rate 40 */ 193 8875000, /* 71Mbps - rate 41 */ 194 9125000, /* 73Mbps - rate 42 */ 195 9375000, /* 75Mbps - rate 43 */ 196 9625000, /* 77Mbps - rate 44 */ 197 9875000, /* 79Mbps - rate 45 */ 198 10125000, /* 81Mbps - rate 46 */ 199 10375000, /* 83Mbps - rate 47 */ 200 10625000, /* 85Mbps - rate 48 */ 201 10875000, /* 87Mbps - rate 49 */ 202 11125000, /* 89Mbps - rate 50 */ 203 11375000, /* 91Mbps - rate 51 */ 204 11625000, /* 93Mbps - rate 52 */ 205 11875000, /* 95Mbps - rate 53 */ 206 13125000, /* 105Mbps - rate 54 */ 207 13750000, /* 110Mbps - rate 55 */ 208 14375000, /* 115Mbps - rate 56 */ 209 15000000, /* 120Mbps - rate 57 */ 210 15625000, /* 125Mbps - rate 58 */ 211 16250000, /* 130Mbps - rate 59 */ 212 16875000, /* 135Mbps - rate 60 */ 213 17500000, /* 140Mbps - rate 61 */ 214 18125000, /* 145Mbps - rate 62 */ 215 18750000, /* 150Mbps - rate 64 */ 216 20000000, /* 160Mbps - rate 65 */ 217 21250000, /* 170Mbps - rate 66 */ 218 22500000, /* 180Mbps - rate 67 */ 219 23750000, /* 190Mbps - rate 68 */ 220 26250000, /* 210Mbps - rate 69 */ 221 27500000, /* 220Mbps - rate 70 */ 222 28750000, /* 230Mbps - rate 71 */ 223 30000000, /* 240Mbps - rate 72 */ 224 31250000, /* 250Mbps - rate 73 */ 225 34375000, /* 275Mbps - rate 74 */ 226 37500000, /* 300Mbps - rate 75 */ 227 40625000, /* 325Mbps - rate 76 */ 228 43750000, /* 350Mbps - rate 77 */ 229 46875000, /* 375Mbps - rate 78 */ 230 53125000, /* 425Mbps - rate 79 */ 231 56250000, /* 450Mbps - rate 80 */ 232 59375000, /* 475Mbps - rate 81 */ 233 62500000, /* 500Mbps - rate 82 */ 234 68750000, /* 550Mbps - rate 83 */ 235 75000000, /* 600Mbps - rate 84 */ 236 81250000, /* 650Mbps - rate 85 */ 237 87500000, /* 700Mbps - rate 86 */ 238 93750000, /* 750Mbps - rate 87 */ 239 106250000, /* 850Mbps - rate 88 */ 240 112500000, /* 900Mbps - rate 89 */ 241 125000000, /* 1Gbps - rate 90 */ 242 156250000, /* 1.25Gps - rate 91 */ 243 187500000, /* 1.5Gps - rate 92 */ 244 218750000, /* 1.75Gps - rate 93 */ 245 250000000, /* 2Gbps - rate 94 */ 246 281250000, /* 2.25Gps - rate 95 */ 247 312500000, /* 2.5Gbps - rate 96 */ 248 343750000, /* 2.75Gbps - rate 97 */ 249 375000000, /* 3Gbps - rate 98 */ 250 500000000, /* 4Gbps - rate 99 */ 251 625000000, /* 5Gbps - rate 100 */ 252 750000000, /* 6Gbps - rate 101 */ 253 875000000, /* 7Gbps - rate 102 */ 254 1000000000, /* 8Gbps - rate 103 */ 255 1125000000, /* 9Gbps - rate 104 */ 256 1250000000, /* 10Gbps - rate 105 */ 257 1875000000, /* 15Gbps - rate 106 */ 258 2500000000 /* 20Gbps - rate 107 */ 259 }; 260 261 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) 262 #define RS_ORDERED_COUNT 16 /* 263 * Number that are in order 264 * at the beginning of the table, 265 * over this a sort is required. 266 */ 267 #define RS_NEXT_ORDER_GROUP 16 /* 268 * The point in our table where 269 * we come fill in a second ordered 270 * group (index wise means -1). 271 */ 272 #define ALL_HARDWARE_RATES 1004 /* 273 * 1Meg - 1Gig in 1 Meg steps 274 * plus 100, 200k and 500k and 275 * 10Gig 276 */ 277 278 #define RS_ONE_MEGABIT_PERSEC 1000000 279 #define RS_ONE_GIGABIT_PERSEC 1000000000 280 #define RS_TEN_GIGABIT_PERSEC 10000000000 281 282 static struct head_tcp_rate_set int_rs; 283 static struct mtx rs_mtx; 284 uint32_t rs_number_alive; 285 uint32_t rs_number_dead; 286 287 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 288 "TCP Ratelimit stats"); 289 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW, 290 &rs_number_alive, 0, 291 "Number of interfaces initialized for ratelimiting"); 292 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, 293 &rs_number_dead, 0, 294 "Number of interfaces departing from ratelimiting"); 295 296 static void 297 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) 298 { 299 /* 300 * Add sysctl entries for thus interface. 301 */ 302 if (rs->rs_flags & RS_INTF_NO_SUP) { 303 SYSCTL_ADD_S32(&rs->sysctl_ctx, 304 SYSCTL_CHILDREN(rl_sysctl_root), 305 OID_AUTO, "disable", CTLFLAG_RD, 306 &rs->rs_disable, 0, 307 "Disable this interface from new hdwr limiting?"); 308 } else { 309 SYSCTL_ADD_S32(&rs->sysctl_ctx, 310 SYSCTL_CHILDREN(rl_sysctl_root), 311 OID_AUTO, "disable", CTLFLAG_RW, 312 &rs->rs_disable, 0, 313 "Disable this interface from new hdwr limiting?"); 314 } 315 SYSCTL_ADD_S32(&rs->sysctl_ctx, 316 SYSCTL_CHILDREN(rl_sysctl_root), 317 OID_AUTO, "minseg", CTLFLAG_RW, 318 &rs->rs_min_seg, 0, 319 "What is the minimum we need to send on this interface?"); 320 SYSCTL_ADD_U64(&rs->sysctl_ctx, 321 SYSCTL_CHILDREN(rl_sysctl_root), 322 OID_AUTO, "flow_limit", CTLFLAG_RW, 323 &rs->rs_flow_limit, 0, 324 "What is the limit for number of flows (0=unlimited)?"); 325 SYSCTL_ADD_S32(&rs->sysctl_ctx, 326 SYSCTL_CHILDREN(rl_sysctl_root), 327 OID_AUTO, "highest", CTLFLAG_RD, 328 &rs->rs_highest_valid, 0, 329 "Highest valid rate"); 330 SYSCTL_ADD_S32(&rs->sysctl_ctx, 331 SYSCTL_CHILDREN(rl_sysctl_root), 332 OID_AUTO, "lowest", CTLFLAG_RD, 333 &rs->rs_lowest_valid, 0, 334 "Lowest valid rate"); 335 SYSCTL_ADD_S32(&rs->sysctl_ctx, 336 SYSCTL_CHILDREN(rl_sysctl_root), 337 OID_AUTO, "flags", CTLFLAG_RD, 338 &rs->rs_flags, 0, 339 "What lags are on the entry?"); 340 SYSCTL_ADD_S32(&rs->sysctl_ctx, 341 SYSCTL_CHILDREN(rl_sysctl_root), 342 OID_AUTO, "numrates", CTLFLAG_RD, 343 &rs->rs_rate_cnt, 0, 344 "How many rates re there?"); 345 SYSCTL_ADD_U64(&rs->sysctl_ctx, 346 SYSCTL_CHILDREN(rl_sysctl_root), 347 OID_AUTO, "flows_using", CTLFLAG_RD, 348 &rs->rs_flows_using, 0, 349 "How many flows are using this interface now?"); 350 #ifdef DETAILED_RATELIMIT_SYSCTL 351 if (rs->rs_rlt && rs->rs_rate_cnt > 0) { 352 /* Lets display the rates */ 353 int i; 354 struct sysctl_oid *rl_rates; 355 struct sysctl_oid *rl_rate_num; 356 char rate_num[16]; 357 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 358 SYSCTL_CHILDREN(rl_sysctl_root), 359 OID_AUTO, 360 "rate", 361 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 362 "Ratelist"); 363 for( i = 0; i < rs->rs_rate_cnt; i++) { 364 sprintf(rate_num, "%d", i); 365 rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 366 SYSCTL_CHILDREN(rl_rates), 367 OID_AUTO, 368 rate_num, 369 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 370 "Individual Rate"); 371 SYSCTL_ADD_U32(&rs->sysctl_ctx, 372 SYSCTL_CHILDREN(rl_rate_num), 373 OID_AUTO, "flags", CTLFLAG_RD, 374 &rs->rs_rlt[i].flags, 0, 375 "Flags on this rate"); 376 SYSCTL_ADD_U32(&rs->sysctl_ctx, 377 SYSCTL_CHILDREN(rl_rate_num), 378 OID_AUTO, "pacetime", CTLFLAG_RD, 379 &rs->rs_rlt[i].time_between, 0, 380 "Time hardware inserts between 1500 byte sends"); 381 SYSCTL_ADD_U64(&rs->sysctl_ctx, 382 SYSCTL_CHILDREN(rl_rate_num), 383 OID_AUTO, "rate", CTLFLAG_RD, 384 &rs->rs_rlt[i].rate, 0, 385 "Rate in bytes per second"); 386 } 387 } 388 #endif 389 } 390 391 static void 392 rs_destroy(epoch_context_t ctx) 393 { 394 struct tcp_rate_set *rs; 395 bool do_free_rs; 396 397 rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx); 398 399 mtx_lock(&rs_mtx); 400 rs->rs_flags &= ~RS_FUNERAL_SCHD; 401 /* 402 * In theory its possible (but unlikely) 403 * that while the delete was occuring 404 * and we were applying the DEAD flag 405 * someone slipped in and found the 406 * interface in a lookup. While we 407 * decided rs_flows_using were 0 and 408 * scheduling the epoch_call, the other 409 * thread incremented rs_flow_using. This 410 * is because users have a pointer and 411 * we only use the rs_flows_using in an 412 * atomic fashion, i.e. the other entities 413 * are not protected. To assure this did 414 * not occur, we check rs_flows_using here 415 * before deleting. 416 */ 417 do_free_rs = (rs->rs_flows_using == 0); 418 rs_number_dead--; 419 mtx_unlock(&rs_mtx); 420 421 if (do_free_rs) { 422 sysctl_ctx_free(&rs->sysctl_ctx); 423 free(rs->rs_rlt, M_TCPPACE); 424 free(rs, M_TCPPACE); 425 } 426 } 427 428 static void 429 rs_defer_destroy(struct tcp_rate_set *rs) 430 { 431 432 mtx_assert(&rs_mtx, MA_OWNED); 433 434 /* Check if already pending. */ 435 if (rs->rs_flags & RS_FUNERAL_SCHD) 436 return; 437 438 rs_number_dead++; 439 440 /* Set flag to only defer once. */ 441 rs->rs_flags |= RS_FUNERAL_SCHD; 442 NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx); 443 } 444 445 #ifdef INET 446 extern counter_u64_t rate_limit_set_ok; 447 extern counter_u64_t rate_limit_active; 448 extern counter_u64_t rate_limit_alloc_fail; 449 #endif 450 451 static int 452 rl_attach_txrtlmt(struct ifnet *ifp, 453 uint32_t flowtype, 454 int flowid, 455 uint64_t cfg_rate, 456 struct m_snd_tag **tag) 457 { 458 int error; 459 union if_snd_tag_alloc_params params = { 460 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, 461 .rate_limit.hdr.flowid = flowid, 462 .rate_limit.hdr.flowtype = flowtype, 463 .rate_limit.max_rate = cfg_rate, 464 .rate_limit.flags = M_NOWAIT, 465 }; 466 467 error = m_snd_tag_alloc(ifp, ¶ms, tag); 468 #ifdef INET 469 if (error == 0) { 470 counter_u64_add(rate_limit_set_ok, 1); 471 counter_u64_add(rate_limit_active, 1); 472 } else if (error != EOPNOTSUPP) 473 counter_u64_add(rate_limit_alloc_fail, 1); 474 #endif 475 return (error); 476 } 477 478 static void 479 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act) 480 { 481 /* 482 * The internal table is "special", it 483 * is two seperate ordered tables that 484 * must be merged. We get here when the 485 * adapter specifies a number of rates that 486 * covers both ranges in the table in some 487 * form. 488 */ 489 int i, at_low, at_high; 490 uint8_t low_disabled = 0, high_disabled = 0; 491 492 for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) { 493 rs->rs_rlt[i].flags = 0; 494 rs->rs_rlt[i].time_between = 0; 495 if ((low_disabled == 0) && 496 (high_disabled || 497 (rate_table_act[at_low] < rate_table_act[at_high]))) { 498 rs->rs_rlt[i].rate = rate_table_act[at_low]; 499 at_low++; 500 if (at_low == RS_NEXT_ORDER_GROUP) 501 low_disabled = 1; 502 } else if (high_disabled == 0) { 503 rs->rs_rlt[i].rate = rate_table_act[at_high]; 504 at_high++; 505 if (at_high == MAX_HDWR_RATES) 506 high_disabled = 1; 507 } 508 } 509 } 510 511 static struct tcp_rate_set * 512 rt_setup_new_rs(struct ifnet *ifp, int *error) 513 { 514 struct tcp_rate_set *rs; 515 const uint64_t *rate_table_act; 516 uint64_t lentim, res; 517 size_t sz; 518 uint32_t hash_type; 519 int i; 520 struct if_ratelimit_query_results rl; 521 struct sysctl_oid *rl_sysctl_root; 522 /* 523 * We expect to enter with the 524 * mutex locked. 525 */ 526 527 if (ifp->if_ratelimit_query == NULL) { 528 /* 529 * We can do nothing if we cannot 530 * get a query back from the driver. 531 */ 532 printf("Warning:No query functions for %s:%d-- failed\n", 533 ifp->if_dname, ifp->if_dunit); 534 return (NULL); 535 } 536 rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO); 537 if (rs == NULL) { 538 if (error) 539 *error = ENOMEM; 540 printf("Warning:No memory for malloc of tcp_rate_set\n"); 541 return (NULL); 542 } 543 memset(&rl, 0, sizeof(rl)); 544 rl.flags = RT_NOSUPPORT; 545 ifp->if_ratelimit_query(ifp, &rl); 546 if (rl.flags & RT_IS_UNUSABLE) { 547 /* 548 * The interface does not really support 549 * the rate-limiting. 550 */ 551 memset(rs, 0, sizeof(struct tcp_rate_set)); 552 rs->rs_ifp = ifp; 553 rs->rs_if_dunit = ifp->if_dunit; 554 rs->rs_flags = RS_INTF_NO_SUP; 555 rs->rs_disable = 1; 556 rs_number_alive++; 557 sysctl_ctx_init(&rs->sysctl_ctx); 558 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 559 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 560 OID_AUTO, 561 rs->rs_ifp->if_xname, 562 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 563 ""); 564 rl_add_syctl_entries(rl_sysctl_root, rs); 565 mtx_lock(&rs_mtx); 566 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 567 mtx_unlock(&rs_mtx); 568 return (rs); 569 } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { 570 memset(rs, 0, sizeof(struct tcp_rate_set)); 571 rs->rs_ifp = ifp; 572 rs->rs_if_dunit = ifp->if_dunit; 573 rs->rs_flags = RS_IS_DEFF; 574 rs_number_alive++; 575 sysctl_ctx_init(&rs->sysctl_ctx); 576 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 577 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 578 OID_AUTO, 579 rs->rs_ifp->if_xname, 580 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 581 ""); 582 rl_add_syctl_entries(rl_sysctl_root, rs); 583 mtx_lock(&rs_mtx); 584 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 585 mtx_unlock(&rs_mtx); 586 return (rs); 587 } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { 588 /* Mellanox C4 likely */ 589 rs->rs_ifp = ifp; 590 rs->rs_if_dunit = ifp->if_dunit; 591 rs->rs_rate_cnt = rl.number_of_rates; 592 rs->rs_min_seg = rl.min_segment_burst; 593 rs->rs_highest_valid = 0; 594 rs->rs_flow_limit = rl.max_flows; 595 rs->rs_flags = RS_IS_INTF | RS_NO_PRE; 596 rs->rs_disable = 0; 597 rate_table_act = rl.rate_table; 598 } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) { 599 /* Chelsio, C5 and C6 of Mellanox? */ 600 rs->rs_ifp = ifp; 601 rs->rs_if_dunit = ifp->if_dunit; 602 rs->rs_rate_cnt = rl.number_of_rates; 603 rs->rs_min_seg = rl.min_segment_burst; 604 rs->rs_disable = 0; 605 rs->rs_flow_limit = rl.max_flows; 606 rate_table_act = desired_rates; 607 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) && 608 (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) { 609 /* 610 * Our desired table is not big 611 * enough, do what we can. 612 */ 613 rs->rs_rate_cnt = MAX_HDWR_RATES; 614 } 615 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT) 616 rs->rs_flags = RS_IS_INTF; 617 else 618 rs->rs_flags = RS_IS_INTF | RS_INT_TBL; 619 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) 620 rs->rs_rate_cnt = ALL_HARDWARE_RATES; 621 } else { 622 free(rs, M_TCPPACE); 623 return (NULL); 624 } 625 sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt; 626 rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT); 627 if (rs->rs_rlt == NULL) { 628 if (error) 629 *error = ENOMEM; 630 bail: 631 free(rs, M_TCPPACE); 632 return (NULL); 633 } 634 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) { 635 /* 636 * The interface supports all 637 * the rates we could possibly want. 638 */ 639 uint64_t rat; 640 641 rs->rs_rlt[0].rate = 12500; /* 100k */ 642 rs->rs_rlt[1].rate = 25000; /* 200k */ 643 rs->rs_rlt[2].rate = 62500; /* 500k */ 644 /* Note 125000 == 1Megabit 645 * populate 1Meg - 1000meg. 646 */ 647 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) { 648 rs->rs_rlt[i].rate = rat; 649 rat += 125000; 650 } 651 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000; 652 } else if (rs->rs_flags & RS_INT_TBL) { 653 /* We populate this in a special way */ 654 populate_canned_table(rs, rate_table_act); 655 } else { 656 /* 657 * Just copy in the rates from 658 * the table, it is in order. 659 */ 660 for (i=0; i<rs->rs_rate_cnt; i++) { 661 rs->rs_rlt[i].rate = rate_table_act[i]; 662 rs->rs_rlt[i].time_between = 0; 663 rs->rs_rlt[i].flags = 0; 664 } 665 } 666 for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) { 667 /* 668 * We go backwards through the list so that if we can't get 669 * a rate and fail to init one, we have at least a chance of 670 * getting the highest one. 671 */ 672 rs->rs_rlt[i].ptbl = rs; 673 rs->rs_rlt[i].tag = NULL; 674 /* 675 * Calculate the time between. 676 */ 677 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; 678 res = lentim / rs->rs_rlt[i].rate; 679 if (res > 0) 680 rs->rs_rlt[i].time_between = res; 681 else 682 rs->rs_rlt[i].time_between = 1; 683 if (rs->rs_flags & RS_NO_PRE) { 684 rs->rs_rlt[i].flags = HDWRPACE_INITED; 685 rs->rs_lowest_valid = i; 686 } else { 687 int err; 688 689 if ((rl.flags & RT_IS_SETUP_REQ) && 690 (ifp->if_ratelimit_query)) { 691 err = ifp->if_ratelimit_setup(ifp, 692 rs->rs_rlt[i].rate, i); 693 if (err) 694 goto handle_err; 695 } 696 #ifdef RSS 697 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 698 #else 699 hash_type = M_HASHTYPE_OPAQUE_HASH; 700 #endif 701 err = rl_attach_txrtlmt(ifp, 702 hash_type, 703 (i + 1), 704 rs->rs_rlt[i].rate, 705 &rs->rs_rlt[i].tag); 706 if (err) { 707 handle_err: 708 if (i == (rs->rs_rate_cnt - 1)) { 709 /* 710 * Huh - first rate and we can't get 711 * it? 712 */ 713 free(rs->rs_rlt, M_TCPPACE); 714 if (error) 715 *error = err; 716 goto bail; 717 } else { 718 if (error) 719 *error = err; 720 } 721 break; 722 } else { 723 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT; 724 rs->rs_lowest_valid = i; 725 } 726 } 727 } 728 /* Did we get at least 1 rate? */ 729 if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED) 730 rs->rs_highest_valid = rs->rs_rate_cnt - 1; 731 else { 732 free(rs->rs_rlt, M_TCPPACE); 733 goto bail; 734 } 735 rs_number_alive++; 736 sysctl_ctx_init(&rs->sysctl_ctx); 737 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 738 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 739 OID_AUTO, 740 rs->rs_ifp->if_xname, 741 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 742 ""); 743 rl_add_syctl_entries(rl_sysctl_root, rs); 744 mtx_lock(&rs_mtx); 745 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 746 mtx_unlock(&rs_mtx); 747 return (rs); 748 } 749 750 static const struct tcp_hwrate_limit_table * 751 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs, 752 uint64_t bytes_per_sec, uint32_t flags) 753 { 754 struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; 755 uint64_t mbits_per_sec, ind_calc; 756 int i; 757 758 mbits_per_sec = (bytes_per_sec * 8); 759 if (flags & RS_PACING_LT) { 760 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 761 (rs->rs_lowest_valid <= 2)){ 762 /* 763 * Smaller than 1Meg, only 764 * 3 entries can match it. 765 */ 766 for(i = rs->rs_lowest_valid; i < 3; i++) { 767 if (bytes_per_sec <= rs->rs_rlt[i].rate) { 768 rte = &rs->rs_rlt[i]; 769 break; 770 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { 771 arte = &rs->rs_rlt[i]; 772 } 773 } 774 goto done; 775 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && 776 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ 777 /* 778 * Larger than 1G (the majority of 779 * our table. 780 */ 781 if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC) 782 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 783 else 784 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 785 goto done; 786 } 787 /* 788 * If we reach here its in our table (between 1Meg - 1000Meg), 789 * just take the rounded down mbits per second, and add 790 * 1Megabit to it, from this we can calculate 791 * the index in the table. 792 */ 793 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 794 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec) 795 ind_calc++; 796 /* our table is offset by 3, we add 2 */ 797 ind_calc += 2; 798 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 799 /* This should not happen */ 800 ind_calc = ALL_HARDWARE_RATES-1; 801 } 802 if ((ind_calc >= rs->rs_lowest_valid) && 803 (ind_calc <= rs->rs_highest_valid)) 804 rte = &rs->rs_rlt[ind_calc]; 805 } else if (flags & RS_PACING_EXACT_MATCH) { 806 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 807 (rs->rs_lowest_valid <= 2)){ 808 for(i = rs->rs_lowest_valid; i < 3; i++) { 809 if (bytes_per_sec == rs->rs_rlt[i].rate) { 810 rte = &rs->rs_rlt[i]; 811 break; 812 } 813 } 814 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && 815 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { 816 /* > 1Gbps only one rate */ 817 if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) { 818 /* Its 10G wow */ 819 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 820 } 821 } else { 822 /* Ok it must be a exact meg (its between 1G and 1Meg) */ 823 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 824 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { 825 /* its an exact Mbps */ 826 ind_calc += 2; 827 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 828 /* This should not happen */ 829 ind_calc = ALL_HARDWARE_RATES-1; 830 } 831 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) 832 rte = &rs->rs_rlt[ind_calc]; 833 } 834 } 835 } else { 836 /* we want greater than the requested rate */ 837 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 838 (rs->rs_lowest_valid <= 2)){ 839 arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */ 840 for (i=2; i>=rs->rs_lowest_valid; i--) { 841 if (bytes_per_sec < rs->rs_rlt[i].rate) { 842 rte = &rs->rs_rlt[i]; 843 break; 844 } else if ((flags & RS_PACING_GEQ) && 845 (bytes_per_sec == rs->rs_rlt[i].rate)) { 846 rte = &rs->rs_rlt[i]; 847 break; 848 } else { 849 arte = &rs->rs_rlt[i]; /* new alternate */ 850 } 851 } 852 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) { 853 if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && 854 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ 855 /* Our top rate is larger than the request */ 856 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 857 } else if ((flags & RS_PACING_GEQ) && 858 (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && 859 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { 860 /* It matches our top rate */ 861 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 862 } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) { 863 /* The top rate is an alternative */ 864 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 865 } 866 } else { 867 /* Its in our range 1Meg - 1Gig */ 868 if (flags & RS_PACING_GEQ) { 869 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 870 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { 871 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 872 /* This should not happen */ 873 ind_calc = (ALL_HARDWARE_RATES-1); 874 } 875 rte = &rs->rs_rlt[ind_calc]; 876 } 877 goto done; 878 } 879 ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC; 880 ind_calc += 2; 881 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 882 /* This should not happen */ 883 ind_calc = ALL_HARDWARE_RATES-1; 884 } 885 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) 886 rte = &rs->rs_rlt[ind_calc]; 887 } 888 } 889 done: 890 if ((rte == NULL) && 891 (arte != NULL) && 892 (flags & RS_PACING_SUB_OK)) { 893 /* We can use the substitute */ 894 rte = arte; 895 } 896 return (rte); 897 } 898 899 static const struct tcp_hwrate_limit_table * 900 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) 901 { 902 /** 903 * Hunt the rate table with the restrictions in flags and find a 904 * suitable rate if possible. 905 * RS_PACING_EXACT_MATCH - look for an exact match to rate. 906 * RS_PACING_GT - must be greater than. 907 * RS_PACING_GEQ - must be greater than or equal. 908 * RS_PACING_LT - must be less than. 909 * RS_PACING_SUB_OK - If we don't meet criteria a 910 * substitute is ok. 911 */ 912 int i, matched; 913 struct tcp_hwrate_limit_table *rte = NULL; 914 915 if ((rs->rs_flags & RS_INT_TBL) && 916 (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) { 917 /* 918 * Here we don't want to paw thru 919 * a big table, we have everything 920 * from 1Meg - 1000Meg in 1Meg increments. 921 * Use an alternate method to "lookup". 922 */ 923 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags)); 924 } 925 if ((flags & RS_PACING_LT) || 926 (flags & RS_PACING_EXACT_MATCH)) { 927 /* 928 * For exact and less than we go forward through the table. 929 * This way when we find one larger we stop (exact was a 930 * toss up). 931 */ 932 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) { 933 if ((flags & RS_PACING_EXACT_MATCH) && 934 (bytes_per_sec == rs->rs_rlt[i].rate)) { 935 rte = &rs->rs_rlt[i]; 936 matched = 1; 937 break; 938 } else if ((flags & RS_PACING_LT) && 939 (bytes_per_sec <= rs->rs_rlt[i].rate)) { 940 rte = &rs->rs_rlt[i]; 941 matched = 1; 942 break; 943 } 944 if (bytes_per_sec > rs->rs_rlt[i].rate) 945 break; 946 } 947 if ((matched == 0) && 948 (flags & RS_PACING_LT) && 949 (flags & RS_PACING_SUB_OK)) { 950 /* Kick in a substitute (the lowest) */ 951 rte = &rs->rs_rlt[rs->rs_lowest_valid]; 952 } 953 } else { 954 /* 955 * Here we go backward through the table so that we can find 956 * the one greater in theory faster (but its probably a 957 * wash). 958 */ 959 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) { 960 if (rs->rs_rlt[i].rate > bytes_per_sec) { 961 /* A possible candidate */ 962 rte = &rs->rs_rlt[i]; 963 } 964 if ((flags & RS_PACING_GEQ) && 965 (bytes_per_sec == rs->rs_rlt[i].rate)) { 966 /* An exact match and we want equal */ 967 matched = 1; 968 rte = &rs->rs_rlt[i]; 969 break; 970 } else if (rte) { 971 /* 972 * Found one that is larger than but don't 973 * stop, there may be a more closer match. 974 */ 975 matched = 1; 976 } 977 if (rs->rs_rlt[i].rate < bytes_per_sec) { 978 /* 979 * We found a table entry that is smaller, 980 * stop there will be none greater or equal. 981 */ 982 break; 983 } 984 } 985 if ((matched == 0) && 986 (flags & RS_PACING_SUB_OK)) { 987 /* Kick in a substitute (the highest) */ 988 rte = &rs->rs_rlt[rs->rs_highest_valid]; 989 } 990 } 991 return (rte); 992 } 993 994 static struct ifnet * 995 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) 996 { 997 struct ifnet *tifp; 998 struct m_snd_tag *tag; 999 union if_snd_tag_alloc_params params = { 1000 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, 1001 .rate_limit.hdr.flowid = 1, 1002 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 1003 .rate_limit.max_rate = COMMON_RATE, 1004 .rate_limit.flags = M_NOWAIT, 1005 }; 1006 int err; 1007 #ifdef RSS 1008 params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ? 1009 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4); 1010 #else 1011 params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH; 1012 #endif 1013 err = m_snd_tag_alloc(ifp, ¶ms, &tag); 1014 if (err) { 1015 /* Failed to setup a tag? */ 1016 if (error) 1017 *error = err; 1018 return (NULL); 1019 } 1020 tifp = tag->ifp; 1021 m_snd_tag_rele(tag); 1022 return (tifp); 1023 } 1024 1025 static const struct tcp_hwrate_limit_table * 1026 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec, 1027 uint32_t flags, int *error) 1028 { 1029 /* First lets find the interface if it exists */ 1030 const struct tcp_hwrate_limit_table *rte; 1031 struct tcp_rate_set *rs; 1032 struct epoch_tracker et; 1033 int err; 1034 1035 NET_EPOCH_ENTER(et); 1036 use_real_interface: 1037 CK_LIST_FOREACH(rs, &int_rs, next) { 1038 /* 1039 * Note we don't look with the lock since we either see a 1040 * new entry or will get one when we try to add it. 1041 */ 1042 if (rs->rs_flags & RS_IS_DEAD) { 1043 /* The dead are not looked at */ 1044 continue; 1045 } 1046 if ((rs->rs_ifp == ifp) && 1047 (rs->rs_if_dunit == ifp->if_dunit)) { 1048 /* Ok we found it */ 1049 break; 1050 } 1051 } 1052 if ((rs == NULL) || 1053 (rs->rs_flags & RS_INTF_NO_SUP) || 1054 (rs->rs_flags & RS_IS_DEAD)) { 1055 /* 1056 * This means we got a packet *before* 1057 * the IF-UP was processed below, <or> 1058 * while or after we already received an interface 1059 * departed event. In either case we really don't 1060 * want to do anything with pacing, in 1061 * the departing case the packet is not 1062 * going to go very far. The new case 1063 * might be arguable, but its impossible 1064 * to tell from the departing case. 1065 */ 1066 if (rs->rs_disable && error) 1067 *error = ENODEV; 1068 NET_EPOCH_EXIT(et); 1069 return (NULL); 1070 } 1071 1072 if ((rs == NULL) || (rs->rs_disable != 0)) { 1073 if (rs->rs_disable && error) 1074 *error = ENOSPC; 1075 NET_EPOCH_EXIT(et); 1076 return (NULL); 1077 } 1078 if (rs->rs_flags & RS_IS_DEFF) { 1079 /* We need to find the real interface */ 1080 struct ifnet *tifp; 1081 1082 tifp = rt_find_real_interface(ifp, inp, error); 1083 if (tifp == NULL) { 1084 if (rs->rs_disable && error) 1085 *error = ENOTSUP; 1086 NET_EPOCH_EXIT(et); 1087 return (NULL); 1088 } 1089 goto use_real_interface; 1090 } 1091 if (rs->rs_flow_limit && 1092 ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) { 1093 if (error) 1094 *error = ENOSPC; 1095 NET_EPOCH_EXIT(et); 1096 return (NULL); 1097 } 1098 rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); 1099 if (rte) { 1100 err = in_pcbattach_txrtlmt(inp, rs->rs_ifp, 1101 inp->inp_flowtype, 1102 inp->inp_flowid, 1103 rte->rate, 1104 &inp->inp_snd_tag); 1105 if (err) { 1106 /* Failed to attach */ 1107 if (error) 1108 *error = err; 1109 rte = NULL; 1110 } 1111 } 1112 if (rte) { 1113 /* 1114 * We use an atomic here for accounting so we don't have to 1115 * use locks when freeing. 1116 */ 1117 atomic_add_64(&rs->rs_flows_using, 1); 1118 } 1119 NET_EPOCH_EXIT(et); 1120 return (rte); 1121 } 1122 1123 static void 1124 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state) 1125 { 1126 int error; 1127 struct tcp_rate_set *rs; 1128 1129 if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) || 1130 (link_state != LINK_STATE_UP)) { 1131 /* 1132 * We only care on an interface going up that is rate-limit 1133 * capable. 1134 */ 1135 return; 1136 } 1137 mtx_lock(&rs_mtx); 1138 CK_LIST_FOREACH(rs, &int_rs, next) { 1139 if ((rs->rs_ifp == ifp) && 1140 (rs->rs_if_dunit == ifp->if_dunit)) { 1141 /* We already have initialized this guy */ 1142 mtx_unlock(&rs_mtx); 1143 return; 1144 } 1145 } 1146 mtx_unlock(&rs_mtx); 1147 rt_setup_new_rs(ifp, &error); 1148 } 1149 1150 static void 1151 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) 1152 { 1153 struct tcp_rate_set *rs, *nrs; 1154 int i; 1155 1156 mtx_lock(&rs_mtx); 1157 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { 1158 if ((rs->rs_ifp == ifp) && 1159 (rs->rs_if_dunit == ifp->if_dunit)) { 1160 CK_LIST_REMOVE(rs, next); 1161 rs_number_alive--; 1162 rs->rs_flags |= RS_IS_DEAD; 1163 for (i = 0; i < rs->rs_rate_cnt; i++) { 1164 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 1165 in_pcbdetach_tag(rs->rs_rlt[i].tag); 1166 rs->rs_rlt[i].tag = NULL; 1167 } 1168 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 1169 } 1170 if (rs->rs_flows_using == 0) 1171 rs_defer_destroy(rs); 1172 break; 1173 } 1174 } 1175 mtx_unlock(&rs_mtx); 1176 } 1177 1178 static void 1179 tcp_rl_shutdown(void *arg __unused, int howto __unused) 1180 { 1181 struct tcp_rate_set *rs, *nrs; 1182 int i; 1183 1184 mtx_lock(&rs_mtx); 1185 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { 1186 CK_LIST_REMOVE(rs, next); 1187 rs_number_alive--; 1188 rs->rs_flags |= RS_IS_DEAD; 1189 for (i = 0; i < rs->rs_rate_cnt; i++) { 1190 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 1191 in_pcbdetach_tag(rs->rs_rlt[i].tag); 1192 rs->rs_rlt[i].tag = NULL; 1193 } 1194 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 1195 } 1196 if (rs->rs_flows_using == 0) 1197 rs_defer_destroy(rs); 1198 } 1199 mtx_unlock(&rs_mtx); 1200 } 1201 1202 const struct tcp_hwrate_limit_table * 1203 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, 1204 uint64_t bytes_per_sec, int flags, int *error) 1205 { 1206 const struct tcp_hwrate_limit_table *rte; 1207 #ifdef KERN_TLS 1208 struct ktls_session *tls; 1209 #endif 1210 1211 INP_WLOCK_ASSERT(tp->t_inpcb); 1212 1213 if (tp->t_inpcb->inp_snd_tag == NULL) { 1214 /* 1215 * We are setting up a rate for the first time. 1216 */ 1217 if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) { 1218 /* Not supported by the egress */ 1219 if (error) 1220 *error = ENODEV; 1221 return (NULL); 1222 } 1223 #ifdef KERN_TLS 1224 tls = NULL; 1225 if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 1226 tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info; 1227 1228 if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 || 1229 tls->mode != TCP_TLS_MODE_IFNET) { 1230 if (error) 1231 *error = ENODEV; 1232 return (NULL); 1233 } 1234 } 1235 #endif 1236 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error); 1237 #ifdef KERN_TLS 1238 if (rte != NULL && tls != NULL && tls->snd_tag != NULL) { 1239 /* 1240 * Fake a route change error to reset the TLS 1241 * send tag. This will convert the existing 1242 * tag to a TLS ratelimit tag. 1243 */ 1244 MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS); 1245 ktls_output_eagain(tp->t_inpcb, tls); 1246 } 1247 #endif 1248 } else { 1249 /* 1250 * We are modifying a rate, wrong interface? 1251 */ 1252 if (error) 1253 *error = EINVAL; 1254 rte = NULL; 1255 } 1256 tp->t_pacing_rate = rte->rate; 1257 *error = 0; 1258 return (rte); 1259 } 1260 1261 const struct tcp_hwrate_limit_table * 1262 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, 1263 struct tcpcb *tp, struct ifnet *ifp, 1264 uint64_t bytes_per_sec, int flags, int *error) 1265 { 1266 const struct tcp_hwrate_limit_table *nrte; 1267 const struct tcp_rate_set *rs; 1268 #ifdef KERN_TLS 1269 struct ktls_session *tls = NULL; 1270 #endif 1271 int is_indirect = 0; 1272 int err; 1273 1274 INP_WLOCK_ASSERT(tp->t_inpcb); 1275 1276 if (crte == NULL) { 1277 /* Wrong interface */ 1278 if (error) 1279 *error = EINVAL; 1280 return (NULL); 1281 } 1282 1283 #ifdef KERN_TLS 1284 if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 1285 tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info; 1286 MPASS(tls->mode == TCP_TLS_MODE_IFNET); 1287 if (tls->snd_tag != NULL && 1288 tls->snd_tag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) { 1289 /* 1290 * NIC probably doesn't support ratelimit TLS 1291 * tags if it didn't allocate one when an 1292 * existing rate was present, so ignore. 1293 */ 1294 if (error) 1295 *error = EOPNOTSUPP; 1296 return (NULL); 1297 } 1298 } 1299 #endif 1300 if (tp->t_inpcb->inp_snd_tag == NULL) { 1301 /* Wrong interface */ 1302 if (error) 1303 *error = EINVAL; 1304 return (NULL); 1305 } 1306 rs = crte->ptbl; 1307 if ((rs->rs_flags & RS_IS_DEAD) || 1308 (crte->flags & HDWRPACE_IFPDEPARTED)) { 1309 /* Release the rate, and try anew */ 1310 re_rate: 1311 tcp_rel_pacing_rate(crte, tp); 1312 nrte = tcp_set_pacing_rate(tp, ifp, 1313 bytes_per_sec, flags, error); 1314 return (nrte); 1315 } 1316 if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) 1317 is_indirect = 1; 1318 else 1319 is_indirect = 0; 1320 if ((is_indirect == 0) && 1321 ((ifp != rs->rs_ifp) || 1322 (ifp->if_dunit != rs->rs_if_dunit))) { 1323 /* 1324 * Something changed, the user is not pointing to the same 1325 * ifp? Maybe a route updated on this guy? 1326 */ 1327 goto re_rate; 1328 } else if (is_indirect) { 1329 /* 1330 * For indirect we have to dig in and find the real interface. 1331 */ 1332 struct ifnet *rifp; 1333 1334 rifp = rt_find_real_interface(ifp, tp->t_inpcb, error); 1335 if (rifp == NULL) { 1336 /* Can't find it? */ 1337 goto re_rate; 1338 } 1339 if ((rifp != rs->rs_ifp) || 1340 (ifp->if_dunit != rs->rs_if_dunit)) { 1341 goto re_rate; 1342 } 1343 } 1344 nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); 1345 if (nrte == crte) { 1346 /* No change */ 1347 if (error) 1348 *error = 0; 1349 return (crte); 1350 } 1351 if (nrte == NULL) { 1352 /* Release the old rate */ 1353 tcp_rel_pacing_rate(crte, tp); 1354 return (NULL); 1355 } 1356 /* Change rates to our new entry */ 1357 #ifdef KERN_TLS 1358 if (tls != NULL) 1359 err = ktls_modify_txrtlmt(tls, nrte->rate); 1360 else 1361 #endif 1362 err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate); 1363 if (err) { 1364 if (error) 1365 *error = err; 1366 return (NULL); 1367 } 1368 if (error) 1369 *error = 0; 1370 tp->t_pacing_rate = nrte->rate; 1371 return (nrte); 1372 } 1373 1374 void 1375 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp) 1376 { 1377 const struct tcp_rate_set *crs; 1378 struct tcp_rate_set *rs; 1379 uint64_t pre; 1380 1381 INP_WLOCK_ASSERT(tp->t_inpcb); 1382 1383 tp->t_pacing_rate = -1; 1384 crs = crte->ptbl; 1385 /* 1386 * Now we must break the const 1387 * in order to release our refcount. 1388 */ 1389 rs = __DECONST(struct tcp_rate_set *, crs); 1390 pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); 1391 if (pre == 1) { 1392 mtx_lock(&rs_mtx); 1393 /* 1394 * Is it dead? 1395 */ 1396 if (rs->rs_flags & RS_IS_DEAD) 1397 rs_defer_destroy(rs); 1398 mtx_unlock(&rs_mtx); 1399 } 1400 1401 /* 1402 * XXX: If this connection is using ifnet TLS, should we 1403 * switch it to using an unlimited rate, or perhaps use 1404 * ktls_output_eagain() to reset the send tag to a plain 1405 * TLS tag? 1406 */ 1407 in_pcbdetach_txrtlmt(tp->t_inpcb); 1408 } 1409 1410 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ 1411 #define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */ 1412 #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */ 1413 #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */ 1414 1415 uint32_t 1416 tcp_get_pacing_burst_size (uint64_t bw, uint32_t segsiz, int can_use_1mss, 1417 const struct tcp_hwrate_limit_table *te, int *err) 1418 { 1419 /* 1420 * We use the google formula to calculate the 1421 * TSO size. I.E. 1422 * bw < 24Meg 1423 * tso = 2mss 1424 * else 1425 * tso = min(bw/1000, 64k) 1426 * 1427 * Note for these calculations we ignore the 1428 * packet overhead (enet hdr, ip hdr and tcp hdr). 1429 */ 1430 uint64_t lentim, res, bytes; 1431 uint32_t new_tso, min_tso_segs; 1432 1433 bytes = bw / 1000; 1434 if (bytes > (64 * 1000)) 1435 bytes = 64 * 1000; 1436 /* Round up */ 1437 new_tso = (bytes + segsiz - 1) / segsiz; 1438 if (can_use_1mss && (bw < ONE_POINT_TWO_MEG)) 1439 min_tso_segs = 1; 1440 else 1441 min_tso_segs = 2; 1442 if (new_tso < min_tso_segs) 1443 new_tso = min_tso_segs; 1444 if (new_tso > MAX_MSS_SENT) 1445 new_tso = MAX_MSS_SENT; 1446 new_tso *= segsiz; 1447 /* 1448 * If we are not doing hardware pacing 1449 * then we are done. 1450 */ 1451 if (te == NULL) { 1452 if (err) 1453 *err = 0; 1454 return(new_tso); 1455 } 1456 /* 1457 * For hardware pacing we look at the 1458 * rate you are sending at and compare 1459 * that to the rate you have in hardware. 1460 * 1461 * If the hardware rate is slower than your 1462 * software rate then you are in error and 1463 * we will build a queue in our hardware whic 1464 * is probably not desired, in such a case 1465 * just return the non-hardware TSO size. 1466 * 1467 * If the rate in hardware is faster (which 1468 * it should be) then look at how long it 1469 * takes to send one ethernet segment size at 1470 * your b/w and compare that to the time it 1471 * takes to send at the rate you had selected. 1472 * 1473 * If your time is greater (which we hope it is) 1474 * we get the delta between the two, and then 1475 * divide that into your pacing time. This tells 1476 * us how many MSS you can send down at once (rounded up). 1477 * 1478 * Note we also double this value if the b/w is over 1479 * 100Mbps. If its over 500meg we just set you to the 1480 * max (43 segments). 1481 */ 1482 if (te->rate > FIVE_HUNDRED_MBPS) 1483 return (segsiz * MAX_MSS_SENT); 1484 if (te->rate == bw) { 1485 /* We are pacing at exactly the hdwr rate */ 1486 return (segsiz * MAX_MSS_SENT); 1487 } 1488 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; 1489 res = lentim / bw; 1490 if (res > te->time_between) { 1491 uint32_t delta, segs; 1492 1493 delta = res - te->time_between; 1494 segs = (res + delta - 1)/delta; 1495 if (te->rate > ONE_HUNDRED_MBPS) 1496 segs *= 2; 1497 if (segs < min_tso_segs) 1498 segs = min_tso_segs; 1499 if (segs > MAX_MSS_SENT) 1500 segs = MAX_MSS_SENT; 1501 segs *= segsiz; 1502 if (err) 1503 *err = 0; 1504 if (segs < new_tso) { 1505 /* unexpected ? */ 1506 return(new_tso); 1507 } else { 1508 return (segs); 1509 } 1510 } else { 1511 /* 1512 * Your time is smaller which means 1513 * we will grow a queue on our 1514 * hardware. Send back the non-hardware 1515 * rate. 1516 */ 1517 if (err) 1518 *err = -1; 1519 return (new_tso); 1520 } 1521 } 1522 1523 static eventhandler_tag rl_ifnet_departs; 1524 static eventhandler_tag rl_ifnet_arrives; 1525 static eventhandler_tag rl_shutdown_start; 1526 1527 static void 1528 tcp_rs_init(void *st __unused) 1529 { 1530 CK_LIST_INIT(&int_rs); 1531 rs_number_alive = 0; 1532 rs_number_dead = 0; 1533 mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); 1534 rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, 1535 tcp_rl_ifnet_departure, 1536 NULL, EVENTHANDLER_PRI_ANY); 1537 rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event, 1538 tcp_rl_ifnet_link, 1539 NULL, EVENTHANDLER_PRI_ANY); 1540 rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync, 1541 tcp_rl_shutdown, NULL, 1542 SHUTDOWN_PRI_FIRST); 1543 printf("TCP_ratelimit: Is now initialized\n"); 1544 } 1545 1546 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL); 1547 #endif 1548