1 /*- 2 * 3 * SPDX-License-Identifier: BSD-3-Clause 4 * 5 * Copyright (c) 2018-2019 6 * Netflix Inc. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 */ 31 /** 32 * Author: Randall Stewart <rrs@netflix.com> 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 #include "opt_inet.h" 38 #include "opt_inet6.h" 39 #include "opt_ipsec.h" 40 #include "opt_tcpdebug.h" 41 #include "opt_ratelimit.h" 42 #include <sys/param.h> 43 #include <sys/kernel.h> 44 #include <sys/malloc.h> 45 #include <sys/mbuf.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/sysctl.h> 49 #include <sys/eventhandler.h> 50 #include <sys/mutex.h> 51 #include <sys/ck.h> 52 #include <net/if.h> 53 #include <net/if_var.h> 54 #include <netinet/in.h> 55 #include <netinet/in_pcb.h> 56 #define TCPSTATES /* for logging */ 57 #include <netinet/tcp_var.h> 58 #ifdef INET6 59 #include <netinet6/tcp6_var.h> 60 #endif 61 #include <netinet/tcp_ratelimit.h> 62 #ifndef USECS_IN_SECOND 63 #define USECS_IN_SECOND 1000000 64 #endif 65 /* 66 * For the purposes of each send, what is the size 67 * of an ethernet frame. 68 */ 69 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory"); 70 #ifdef RATELIMIT 71 72 /* 73 * The following preferred table will seem weird to 74 * the casual viewer. Why do we not have any rates below 75 * 1Mbps? Why do we have a rate at 1.44Mbps called common? 76 * Why do the rates cluster in the 1-100Mbps range more 77 * than others? Why does the table jump around at the beginnign 78 * and then be more consistently raising? 79 * 80 * Let me try to answer those questions. A lot of 81 * this is dependant on the hardware. We have three basic 82 * supporters of rate limiting 83 * 84 * Chelsio - Supporting 16 configurable rates. 85 * Mlx - c4 supporting 13 fixed rates. 86 * Mlx - c5 & c6 supporting 127 configurable rates. 87 * 88 * The c4 is why we have a common rate that is available 89 * in all rate tables. This is a selected rate from the 90 * c4 table and we assure its available in all ratelimit 91 * tables. This way the tcp_ratelimit code has an assured 92 * rate it should always be able to get. This answers a 93 * couple of the questions above. 94 * 95 * So what about the rest, well the table is built to 96 * try to get the most out of a joint hardware/software 97 * pacing system. The software pacer will always pick 98 * a rate higher than the b/w that it is estimating 99 * 100 * on the path. This is done for two reasons. 101 * a) So we can discover more b/w 102 * and 103 * b) So we can send a block of MSS's down and then 104 * have the software timer go off after the previous 105 * send is completely out of the hardware. 106 * 107 * But when we do <b> we don't want to have the delay 108 * between the last packet sent by the hardware be 109 * excessively long (to reach our desired rate). 110 * 111 * So let me give an example for clarity. 112 * 113 * Lets assume that the tcp stack sees that 29,110,000 bps is 114 * what the bw of the path is. The stack would select the 115 * rate 31Mbps. 31Mbps means that each send that is done 116 * by the hardware will cause a 390 micro-second gap between 117 * the packets sent at that rate. For 29,110,000 bps we 118 * would need 416 micro-seconds gap between each send. 119 * 120 * Note that are calculating a complete time for pacing 121 * which includes the ethernet, IP and TCP overhead. So 122 * a full 1514 bytes is used for the above calculations. 123 * My testing has shown that both cards are also using this 124 * as their basis i.e. full payload size of the ethernet frame. 125 * The TCP stack caller needs to be aware of this and make the 126 * appropriate overhead calculations be included in its choices. 127 * 128 * Now, continuing our example, we pick a MSS size based on the 129 * delta between the two rates (416 - 390) divided into the rate 130 * we really wish to send at rounded up. That results in a MSS 131 * send of 17 mss's at once. The hardware then will 132 * run out of data in a single 17MSS send in 6,630 micro-seconds. 133 * 134 * On the other hand the software pacer will send more data 135 * in 7,072 micro-seconds. This means that we will refill 136 * the hardware 52 microseconds after it would have sent 137 * next if it had not ran out of data. This is a win since we are 138 * only sending every 7ms or so and yet all the packets are spaced on 139 * the wire with 94% of what they should be and only 140 * the last packet is delayed extra to make up for the 141 * difference. 142 * 143 * Note that the above formula has two important caveat. 144 * If we are above (b/w wise) over 100Mbps we double the result 145 * of the MSS calculation. The second caveat is if we are 500Mbps 146 * or more we just send the maximum MSS at once i.e. 45MSS. At 147 * the higher b/w's even the cards have limits to what times (timer granularity) 148 * they can insert between packets and start to send more than one 149 * packet at a time on the wire. 150 * 151 */ 152 #define COMMON_RATE 180500 153 const uint64_t desired_rates[] = { 154 122500, /* 1Mbps - rate 1 */ 155 180500, /* 1.44Mpbs - rate 2 common rate */ 156 375000, /* 3Mbps - rate 3 */ 157 625000, /* 5Mbps - rate 4 */ 158 875000, /* 7Mbps - rate 5 */ 159 1125000, /* 9Mbps - rate 6 */ 160 1375000, /* 11Mbps - rate 7 */ 161 1625000, /* 13Mbps - rate 8 */ 162 2625000, /* 21Mbps - rate 9 */ 163 3875000, /* 31Mbps - rate 10 */ 164 5125000, /* 41Meg - rate 11 */ 165 12500000, /* 100Mbps - rate 12 */ 166 25000000, /* 200Mbps - rate 13 */ 167 50000000, /* 400Mbps - rate 14 */ 168 63750000, /* 51Mbps - rate 15 */ 169 100000000, /* 800Mbps - rate 16 */ 170 1875000, /* 15Mbps - rate 17 */ 171 2125000, /* 17Mbps - rate 18 */ 172 2375000, /* 19Mbps - rate 19 */ 173 2875000, /* 23Mbps - rate 20 */ 174 3125000, /* 25Mbps - rate 21 */ 175 3375000, /* 27Mbps - rate 22 */ 176 3625000, /* 29Mbps - rate 23 */ 177 4125000, /* 33Mbps - rate 24 */ 178 4375000, /* 35Mbps - rate 25 */ 179 4625000, /* 37Mbps - rate 26 */ 180 4875000, /* 39Mbps - rate 27 */ 181 5375000, /* 43Mbps - rate 28 */ 182 5625000, /* 45Mbps - rate 29 */ 183 5875000, /* 47Mbps - rate 30 */ 184 6125000, /* 49Mbps - rate 31 */ 185 6625000, /* 53Mbps - rate 32 */ 186 6875000, /* 55Mbps - rate 33 */ 187 7125000, /* 57Mbps - rate 34 */ 188 7375000, /* 59Mbps - rate 35 */ 189 7625000, /* 61Mbps - rate 36 */ 190 7875000, /* 63Mbps - rate 37 */ 191 8125000, /* 65Mbps - rate 38 */ 192 8375000, /* 67Mbps - rate 39 */ 193 8625000, /* 69Mbps - rate 40 */ 194 8875000, /* 71Mbps - rate 41 */ 195 9125000, /* 73Mbps - rate 42 */ 196 9375000, /* 75Mbps - rate 43 */ 197 9625000, /* 77Mbps - rate 44 */ 198 9875000, /* 79Mbps - rate 45 */ 199 10125000, /* 81Mbps - rate 46 */ 200 10375000, /* 83Mbps - rate 47 */ 201 10625000, /* 85Mbps - rate 48 */ 202 10875000, /* 87Mbps - rate 49 */ 203 11125000, /* 89Mbps - rate 50 */ 204 11375000, /* 91Mbps - rate 51 */ 205 11625000, /* 93Mbps - rate 52 */ 206 11875000, /* 95Mbps - rate 53 */ 207 13125000, /* 105Mbps - rate 54 */ 208 13750000, /* 110Mbps - rate 55 */ 209 14375000, /* 115Mbps - rate 56 */ 210 15000000, /* 120Mbps - rate 57 */ 211 15625000, /* 125Mbps - rate 58 */ 212 16250000, /* 130Mbps - rate 59 */ 213 16875000, /* 135Mbps - rate 60 */ 214 17500000, /* 140Mbps - rate 61 */ 215 18125000, /* 145Mbps - rate 62 */ 216 18750000, /* 150Mbps - rate 64 */ 217 20000000, /* 160Mbps - rate 65 */ 218 21250000, /* 170Mbps - rate 66 */ 219 22500000, /* 180Mbps - rate 67 */ 220 23750000, /* 190Mbps - rate 68 */ 221 26250000, /* 210Mbps - rate 69 */ 222 27500000, /* 220Mbps - rate 70 */ 223 28750000, /* 230Mbps - rate 71 */ 224 30000000, /* 240Mbps - rate 72 */ 225 31250000, /* 250Mbps - rate 73 */ 226 34375000, /* 275Mbps - rate 74 */ 227 37500000, /* 300Mbps - rate 75 */ 228 40625000, /* 325Mbps - rate 76 */ 229 43750000, /* 350Mbps - rate 77 */ 230 46875000, /* 375Mbps - rate 78 */ 231 53125000, /* 425Mbps - rate 79 */ 232 56250000, /* 450Mbps - rate 80 */ 233 59375000, /* 475Mbps - rate 81 */ 234 62500000, /* 500Mbps - rate 82 */ 235 68750000, /* 550Mbps - rate 83 */ 236 75000000, /* 600Mbps - rate 84 */ 237 81250000, /* 650Mbps - rate 85 */ 238 87500000, /* 700Mbps - rate 86 */ 239 93750000, /* 750Mbps - rate 87 */ 240 106250000, /* 850Mbps - rate 88 */ 241 112500000, /* 900Mbps - rate 89 */ 242 125000000, /* 1Gbps - rate 90 */ 243 156250000, /* 1.25Gps - rate 91 */ 244 187500000, /* 1.5Gps - rate 92 */ 245 218750000, /* 1.75Gps - rate 93 */ 246 250000000, /* 2Gbps - rate 94 */ 247 281250000, /* 2.25Gps - rate 95 */ 248 312500000, /* 2.5Gbps - rate 96 */ 249 343750000, /* 2.75Gbps - rate 97 */ 250 375000000, /* 3Gbps - rate 98 */ 251 500000000, /* 4Gbps - rate 99 */ 252 625000000, /* 5Gbps - rate 100 */ 253 750000000, /* 6Gbps - rate 101 */ 254 875000000, /* 7Gbps - rate 102 */ 255 1000000000, /* 8Gbps - rate 103 */ 256 1125000000, /* 9Gbps - rate 104 */ 257 1250000000, /* 10Gbps - rate 105 */ 258 1875000000, /* 15Gbps - rate 106 */ 259 2500000000 /* 20Gbps - rate 107 */ 260 }; 261 262 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) 263 #define RS_ORDERED_COUNT 16 /* 264 * Number that are in order 265 * at the beginning of the table, 266 * over this a sort is required. 267 */ 268 #define RS_NEXT_ORDER_GROUP 16 /* 269 * The point in our table where 270 * we come fill in a second ordered 271 * group (index wise means -1). 272 */ 273 #define ALL_HARDWARE_RATES 1004 /* 274 * 1Meg - 1Gig in 1 Meg steps 275 * plus 100, 200k and 500k and 276 * 10Gig 277 */ 278 279 #define RS_ONE_MEGABIT_PERSEC 1000000 280 #define RS_ONE_GIGABIT_PERSEC 1000000000 281 #define RS_TEN_GIGABIT_PERSEC 10000000000 282 283 static struct head_tcp_rate_set int_rs; 284 static struct mtx rs_mtx; 285 uint32_t rs_number_alive; 286 uint32_t rs_number_dead; 287 288 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 289 "TCP Ratelimit stats"); 290 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW, 291 &rs_number_alive, 0, 292 "Number of interfaces initialized for ratelimiting"); 293 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, 294 &rs_number_dead, 0, 295 "Number of interfaces departing from ratelimiting"); 296 297 static void 298 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) 299 { 300 /* 301 * Add sysctl entries for thus interface. 302 */ 303 if (rs->rs_flags & RS_INTF_NO_SUP) { 304 SYSCTL_ADD_S32(&rs->sysctl_ctx, 305 SYSCTL_CHILDREN(rl_sysctl_root), 306 OID_AUTO, "disable", CTLFLAG_RD, 307 &rs->rs_disable, 0, 308 "Disable this interface from new hdwr limiting?"); 309 } else { 310 SYSCTL_ADD_S32(&rs->sysctl_ctx, 311 SYSCTL_CHILDREN(rl_sysctl_root), 312 OID_AUTO, "disable", CTLFLAG_RW, 313 &rs->rs_disable, 0, 314 "Disable this interface from new hdwr limiting?"); 315 } 316 SYSCTL_ADD_S32(&rs->sysctl_ctx, 317 SYSCTL_CHILDREN(rl_sysctl_root), 318 OID_AUTO, "minseg", CTLFLAG_RW, 319 &rs->rs_min_seg, 0, 320 "What is the minimum we need to send on this interface?"); 321 SYSCTL_ADD_U64(&rs->sysctl_ctx, 322 SYSCTL_CHILDREN(rl_sysctl_root), 323 OID_AUTO, "flow_limit", CTLFLAG_RW, 324 &rs->rs_flow_limit, 0, 325 "What is the limit for number of flows (0=unlimited)?"); 326 SYSCTL_ADD_S32(&rs->sysctl_ctx, 327 SYSCTL_CHILDREN(rl_sysctl_root), 328 OID_AUTO, "highest", CTLFLAG_RD, 329 &rs->rs_highest_valid, 0, 330 "Highest valid rate"); 331 SYSCTL_ADD_S32(&rs->sysctl_ctx, 332 SYSCTL_CHILDREN(rl_sysctl_root), 333 OID_AUTO, "lowest", CTLFLAG_RD, 334 &rs->rs_lowest_valid, 0, 335 "Lowest valid rate"); 336 SYSCTL_ADD_S32(&rs->sysctl_ctx, 337 SYSCTL_CHILDREN(rl_sysctl_root), 338 OID_AUTO, "flags", CTLFLAG_RD, 339 &rs->rs_flags, 0, 340 "What lags are on the entry?"); 341 SYSCTL_ADD_S32(&rs->sysctl_ctx, 342 SYSCTL_CHILDREN(rl_sysctl_root), 343 OID_AUTO, "numrates", CTLFLAG_RD, 344 &rs->rs_rate_cnt, 0, 345 "How many rates re there?"); 346 SYSCTL_ADD_U64(&rs->sysctl_ctx, 347 SYSCTL_CHILDREN(rl_sysctl_root), 348 OID_AUTO, "flows_using", CTLFLAG_RD, 349 &rs->rs_flows_using, 0, 350 "How many flows are using this interface now?"); 351 #ifdef DETAILED_RATELIMIT_SYSCTL 352 if (rs->rs_rlt && rs->rs_rate_cnt > 0) { 353 /* Lets display the rates */ 354 int i; 355 struct sysctl_oid *rl_rates; 356 struct sysctl_oid *rl_rate_num; 357 char rate_num[16]; 358 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 359 SYSCTL_CHILDREN(rl_sysctl_root), 360 OID_AUTO, 361 "rate", 362 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 363 "Ratelist"); 364 for( i = 0; i < rs->rs_rate_cnt; i++) { 365 sprintf(rate_num, "%d", i); 366 rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 367 SYSCTL_CHILDREN(rl_rates), 368 OID_AUTO, 369 rate_num, 370 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 371 "Individual Rate"); 372 SYSCTL_ADD_U32(&rs->sysctl_ctx, 373 SYSCTL_CHILDREN(rl_rate_num), 374 OID_AUTO, "flags", CTLFLAG_RD, 375 &rs->rs_rlt[i].flags, 0, 376 "Flags on this rate"); 377 SYSCTL_ADD_U32(&rs->sysctl_ctx, 378 SYSCTL_CHILDREN(rl_rate_num), 379 OID_AUTO, "pacetime", CTLFLAG_RD, 380 &rs->rs_rlt[i].time_between, 0, 381 "Time hardware inserts between 1500 byte sends"); 382 SYSCTL_ADD_U64(&rs->sysctl_ctx, 383 SYSCTL_CHILDREN(rl_rate_num), 384 OID_AUTO, "rate", CTLFLAG_RD, 385 &rs->rs_rlt[i].rate, 0, 386 "Rate in bytes per second"); 387 } 388 } 389 #endif 390 } 391 392 static void 393 rs_destroy(epoch_context_t ctx) 394 { 395 struct tcp_rate_set *rs; 396 bool do_free_rs; 397 398 rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx); 399 400 mtx_lock(&rs_mtx); 401 rs->rs_flags &= ~RS_FUNERAL_SCHD; 402 /* 403 * In theory its possible (but unlikely) 404 * that while the delete was occuring 405 * and we were applying the DEAD flag 406 * someone slipped in and found the 407 * interface in a lookup. While we 408 * decided rs_flows_using were 0 and 409 * scheduling the epoch_call, the other 410 * thread incremented rs_flow_using. This 411 * is because users have a pointer and 412 * we only use the rs_flows_using in an 413 * atomic fashion, i.e. the other entities 414 * are not protected. To assure this did 415 * not occur, we check rs_flows_using here 416 * before deleting. 417 */ 418 do_free_rs = (rs->rs_flows_using == 0); 419 rs_number_dead--; 420 mtx_unlock(&rs_mtx); 421 422 if (do_free_rs) { 423 sysctl_ctx_free(&rs->sysctl_ctx); 424 free(rs->rs_rlt, M_TCPPACE); 425 free(rs, M_TCPPACE); 426 } 427 } 428 429 static void 430 rs_defer_destroy(struct tcp_rate_set *rs) 431 { 432 433 mtx_assert(&rs_mtx, MA_OWNED); 434 435 /* Check if already pending. */ 436 if (rs->rs_flags & RS_FUNERAL_SCHD) 437 return; 438 439 rs_number_dead++; 440 441 /* Set flag to only defer once. */ 442 rs->rs_flags |= RS_FUNERAL_SCHD; 443 NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx); 444 } 445 446 #ifdef INET 447 extern counter_u64_t rate_limit_set_ok; 448 extern counter_u64_t rate_limit_active; 449 extern counter_u64_t rate_limit_alloc_fail; 450 #endif 451 452 static int 453 rl_attach_txrtlmt(struct ifnet *ifp, 454 uint32_t flowtype, 455 int flowid, 456 uint64_t cfg_rate, 457 struct m_snd_tag **tag) 458 { 459 int error; 460 union if_snd_tag_alloc_params params = { 461 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, 462 .rate_limit.hdr.flowid = flowid, 463 .rate_limit.hdr.flowtype = flowtype, 464 .rate_limit.max_rate = cfg_rate, 465 .rate_limit.flags = M_NOWAIT, 466 }; 467 468 if (ifp->if_snd_tag_alloc == NULL) { 469 error = EOPNOTSUPP; 470 } else { 471 error = ifp->if_snd_tag_alloc(ifp, ¶ms, tag); 472 #ifdef INET 473 if (error == 0) { 474 if_ref((*tag)->ifp); 475 counter_u64_add(rate_limit_set_ok, 1); 476 counter_u64_add(rate_limit_active, 1); 477 } else 478 counter_u64_add(rate_limit_alloc_fail, 1); 479 #endif 480 } 481 return (error); 482 } 483 484 static void 485 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act) 486 { 487 /* 488 * The internal table is "special", it 489 * is two seperate ordered tables that 490 * must be merged. We get here when the 491 * adapter specifies a number of rates that 492 * covers both ranges in the table in some 493 * form. 494 */ 495 int i, at_low, at_high; 496 uint8_t low_disabled = 0, high_disabled = 0; 497 498 for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) { 499 rs->rs_rlt[i].flags = 0; 500 rs->rs_rlt[i].time_between = 0; 501 if ((low_disabled == 0) && 502 (high_disabled || 503 (rate_table_act[at_low] < rate_table_act[at_high]))) { 504 rs->rs_rlt[i].rate = rate_table_act[at_low]; 505 at_low++; 506 if (at_low == RS_NEXT_ORDER_GROUP) 507 low_disabled = 1; 508 } else if (high_disabled == 0) { 509 rs->rs_rlt[i].rate = rate_table_act[at_high]; 510 at_high++; 511 if (at_high == MAX_HDWR_RATES) 512 high_disabled = 1; 513 } 514 } 515 } 516 517 static struct tcp_rate_set * 518 rt_setup_new_rs(struct ifnet *ifp, int *error) 519 { 520 struct tcp_rate_set *rs; 521 const uint64_t *rate_table_act; 522 uint64_t lentim, res; 523 size_t sz; 524 uint32_t hash_type; 525 int i; 526 struct if_ratelimit_query_results rl; 527 struct sysctl_oid *rl_sysctl_root; 528 /* 529 * We expect to enter with the 530 * mutex locked. 531 */ 532 533 if (ifp->if_ratelimit_query == NULL) { 534 /* 535 * We can do nothing if we cannot 536 * get a query back from the driver. 537 */ 538 printf("Warning:No query functions for %s:%d-- failed\n", 539 ifp->if_dname, ifp->if_dunit); 540 return (NULL); 541 } 542 rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO); 543 if (rs == NULL) { 544 if (error) 545 *error = ENOMEM; 546 printf("Warning:No memory for malloc of tcp_rate_set\n"); 547 return (NULL); 548 } 549 memset(&rl, 0, sizeof(rl)); 550 rl.flags = RT_NOSUPPORT; 551 ifp->if_ratelimit_query(ifp, &rl); 552 if (rl.flags & RT_IS_UNUSABLE) { 553 /* 554 * The interface does not really support 555 * the rate-limiting. 556 */ 557 memset(rs, 0, sizeof(struct tcp_rate_set)); 558 rs->rs_ifp = ifp; 559 rs->rs_if_dunit = ifp->if_dunit; 560 rs->rs_flags = RS_INTF_NO_SUP; 561 rs->rs_disable = 1; 562 rs_number_alive++; 563 sysctl_ctx_init(&rs->sysctl_ctx); 564 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 565 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 566 OID_AUTO, 567 rs->rs_ifp->if_xname, 568 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 569 ""); 570 rl_add_syctl_entries(rl_sysctl_root, rs); 571 mtx_lock(&rs_mtx); 572 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 573 mtx_unlock(&rs_mtx); 574 return (rs); 575 } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { 576 memset(rs, 0, sizeof(struct tcp_rate_set)); 577 rs->rs_ifp = ifp; 578 rs->rs_if_dunit = ifp->if_dunit; 579 rs->rs_flags = RS_IS_DEFF; 580 rs_number_alive++; 581 sysctl_ctx_init(&rs->sysctl_ctx); 582 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 583 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 584 OID_AUTO, 585 rs->rs_ifp->if_xname, 586 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 587 ""); 588 rl_add_syctl_entries(rl_sysctl_root, rs); 589 mtx_lock(&rs_mtx); 590 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 591 mtx_unlock(&rs_mtx); 592 return (rs); 593 } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { 594 /* Mellanox C4 likely */ 595 rs->rs_ifp = ifp; 596 rs->rs_if_dunit = ifp->if_dunit; 597 rs->rs_rate_cnt = rl.number_of_rates; 598 rs->rs_min_seg = rl.min_segment_burst; 599 rs->rs_highest_valid = 0; 600 rs->rs_flow_limit = rl.max_flows; 601 rs->rs_flags = RS_IS_INTF | RS_NO_PRE; 602 rs->rs_disable = 0; 603 rate_table_act = rl.rate_table; 604 } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) { 605 /* Chelsio, C5 and C6 of Mellanox? */ 606 rs->rs_ifp = ifp; 607 rs->rs_if_dunit = ifp->if_dunit; 608 rs->rs_rate_cnt = rl.number_of_rates; 609 rs->rs_min_seg = rl.min_segment_burst; 610 rs->rs_disable = 0; 611 rs->rs_flow_limit = rl.max_flows; 612 rate_table_act = desired_rates; 613 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) && 614 (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) { 615 /* 616 * Our desired table is not big 617 * enough, do what we can. 618 */ 619 rs->rs_rate_cnt = MAX_HDWR_RATES; 620 } 621 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT) 622 rs->rs_flags = RS_IS_INTF; 623 else 624 rs->rs_flags = RS_IS_INTF | RS_INT_TBL; 625 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) 626 rs->rs_rate_cnt = ALL_HARDWARE_RATES; 627 } else { 628 free(rs, M_TCPPACE); 629 return (NULL); 630 } 631 sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt; 632 rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT); 633 if (rs->rs_rlt == NULL) { 634 if (error) 635 *error = ENOMEM; 636 bail: 637 free(rs, M_TCPPACE); 638 return (NULL); 639 } 640 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) { 641 /* 642 * The interface supports all 643 * the rates we could possibly want. 644 */ 645 uint64_t rat; 646 647 rs->rs_rlt[0].rate = 12500; /* 100k */ 648 rs->rs_rlt[1].rate = 25000; /* 200k */ 649 rs->rs_rlt[2].rate = 62500; /* 500k */ 650 /* Note 125000 == 1Megabit 651 * populate 1Meg - 1000meg. 652 */ 653 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) { 654 rs->rs_rlt[i].rate = rat; 655 rat += 125000; 656 } 657 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000; 658 } else if (rs->rs_flags & RS_INT_TBL) { 659 /* We populate this in a special way */ 660 populate_canned_table(rs, rate_table_act); 661 } else { 662 /* 663 * Just copy in the rates from 664 * the table, it is in order. 665 */ 666 for (i=0; i<rs->rs_rate_cnt; i++) { 667 rs->rs_rlt[i].rate = rate_table_act[i]; 668 rs->rs_rlt[i].time_between = 0; 669 rs->rs_rlt[i].flags = 0; 670 } 671 } 672 for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) { 673 /* 674 * We go backwards through the list so that if we can't get 675 * a rate and fail to init one, we have at least a chance of 676 * getting the highest one. 677 */ 678 rs->rs_rlt[i].ptbl = rs; 679 rs->rs_rlt[i].tag = NULL; 680 /* 681 * Calculate the time between. 682 */ 683 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; 684 res = lentim / rs->rs_rlt[i].rate; 685 if (res > 0) 686 rs->rs_rlt[i].time_between = res; 687 else 688 rs->rs_rlt[i].time_between = 1; 689 if (rs->rs_flags & RS_NO_PRE) { 690 rs->rs_rlt[i].flags = HDWRPACE_INITED; 691 rs->rs_lowest_valid = i; 692 } else { 693 int err; 694 695 if ((rl.flags & RT_IS_SETUP_REQ) && 696 (ifp->if_ratelimit_query)) { 697 err = ifp->if_ratelimit_setup(ifp, 698 rs->rs_rlt[i].rate, i); 699 if (err) 700 goto handle_err; 701 } 702 #ifdef RSS 703 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 704 #else 705 hash_type = M_HASHTYPE_OPAQUE_HASH; 706 #endif 707 err = rl_attach_txrtlmt(ifp, 708 hash_type, 709 (i + 1), 710 rs->rs_rlt[i].rate, 711 &rs->rs_rlt[i].tag); 712 if (err) { 713 handle_err: 714 if (i == (rs->rs_rate_cnt - 1)) { 715 /* 716 * Huh - first rate and we can't get 717 * it? 718 */ 719 free(rs->rs_rlt, M_TCPPACE); 720 if (error) 721 *error = err; 722 goto bail; 723 } else { 724 if (error) 725 *error = err; 726 } 727 break; 728 } else { 729 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT; 730 rs->rs_lowest_valid = i; 731 } 732 } 733 } 734 /* Did we get at least 1 rate? */ 735 if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED) 736 rs->rs_highest_valid = rs->rs_rate_cnt - 1; 737 else { 738 free(rs->rs_rlt, M_TCPPACE); 739 goto bail; 740 } 741 rs_number_alive++; 742 sysctl_ctx_init(&rs->sysctl_ctx); 743 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 744 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 745 OID_AUTO, 746 rs->rs_ifp->if_xname, 747 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 748 ""); 749 rl_add_syctl_entries(rl_sysctl_root, rs); 750 mtx_lock(&rs_mtx); 751 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 752 mtx_unlock(&rs_mtx); 753 return (rs); 754 } 755 756 static const struct tcp_hwrate_limit_table * 757 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs, 758 uint64_t bytes_per_sec, uint32_t flags) 759 { 760 struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; 761 uint64_t mbits_per_sec, ind_calc; 762 int i; 763 764 mbits_per_sec = (bytes_per_sec * 8); 765 if (flags & RS_PACING_LT) { 766 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 767 (rs->rs_lowest_valid <= 2)){ 768 /* 769 * Smaller than 1Meg, only 770 * 3 entries can match it. 771 */ 772 for(i = rs->rs_lowest_valid; i < 3; i++) { 773 if (bytes_per_sec <= rs->rs_rlt[i].rate) { 774 rte = &rs->rs_rlt[i]; 775 break; 776 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { 777 arte = &rs->rs_rlt[i]; 778 } 779 } 780 goto done; 781 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && 782 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ 783 /* 784 * Larger than 1G (the majority of 785 * our table. 786 */ 787 if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC) 788 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 789 else 790 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 791 goto done; 792 } 793 /* 794 * If we reach here its in our table (between 1Meg - 1000Meg), 795 * just take the rounded down mbits per second, and add 796 * 1Megabit to it, from this we can calculate 797 * the index in the table. 798 */ 799 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 800 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec) 801 ind_calc++; 802 /* our table is offset by 3, we add 2 */ 803 ind_calc += 2; 804 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 805 /* This should not happen */ 806 ind_calc = ALL_HARDWARE_RATES-1; 807 } 808 if ((ind_calc >= rs->rs_lowest_valid) && 809 (ind_calc <= rs->rs_highest_valid)) 810 rte = &rs->rs_rlt[ind_calc]; 811 } else if (flags & RS_PACING_EXACT_MATCH) { 812 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 813 (rs->rs_lowest_valid <= 2)){ 814 for(i = rs->rs_lowest_valid; i < 3; i++) { 815 if (bytes_per_sec == rs->rs_rlt[i].rate) { 816 rte = &rs->rs_rlt[i]; 817 break; 818 } 819 } 820 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && 821 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { 822 /* > 1Gbps only one rate */ 823 if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) { 824 /* Its 10G wow */ 825 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 826 } 827 } else { 828 /* Ok it must be a exact meg (its between 1G and 1Meg) */ 829 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 830 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { 831 /* its an exact Mbps */ 832 ind_calc += 2; 833 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 834 /* This should not happen */ 835 ind_calc = ALL_HARDWARE_RATES-1; 836 } 837 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) 838 rte = &rs->rs_rlt[ind_calc]; 839 } 840 } 841 } else { 842 /* we want greater than the requested rate */ 843 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 844 (rs->rs_lowest_valid <= 2)){ 845 arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */ 846 for (i=2; i>=rs->rs_lowest_valid; i--) { 847 if (bytes_per_sec < rs->rs_rlt[i].rate) { 848 rte = &rs->rs_rlt[i]; 849 break; 850 } else if ((flags & RS_PACING_GEQ) && 851 (bytes_per_sec == rs->rs_rlt[i].rate)) { 852 rte = &rs->rs_rlt[i]; 853 break; 854 } else { 855 arte = &rs->rs_rlt[i]; /* new alternate */ 856 } 857 } 858 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) { 859 if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && 860 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ 861 /* Our top rate is larger than the request */ 862 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 863 } else if ((flags & RS_PACING_GEQ) && 864 (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && 865 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { 866 /* It matches our top rate */ 867 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 868 } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) { 869 /* The top rate is an alternative */ 870 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 871 } 872 } else { 873 /* Its in our range 1Meg - 1Gig */ 874 if (flags & RS_PACING_GEQ) { 875 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 876 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { 877 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 878 /* This should not happen */ 879 ind_calc = (ALL_HARDWARE_RATES-1); 880 } 881 rte = &rs->rs_rlt[ind_calc]; 882 } 883 goto done; 884 } 885 ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC; 886 ind_calc += 2; 887 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 888 /* This should not happen */ 889 ind_calc = ALL_HARDWARE_RATES-1; 890 } 891 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) 892 rte = &rs->rs_rlt[ind_calc]; 893 } 894 } 895 done: 896 if ((rte == NULL) && 897 (arte != NULL) && 898 (flags & RS_PACING_SUB_OK)) { 899 /* We can use the substitute */ 900 rte = arte; 901 } 902 return (rte); 903 } 904 905 static const struct tcp_hwrate_limit_table * 906 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) 907 { 908 /** 909 * Hunt the rate table with the restrictions in flags and find a 910 * suitable rate if possible. 911 * RS_PACING_EXACT_MATCH - look for an exact match to rate. 912 * RS_PACING_GT - must be greater than. 913 * RS_PACING_GEQ - must be greater than or equal. 914 * RS_PACING_LT - must be less than. 915 * RS_PACING_SUB_OK - If we don't meet criteria a 916 * substitute is ok. 917 */ 918 int i, matched; 919 struct tcp_hwrate_limit_table *rte = NULL; 920 921 922 if ((rs->rs_flags & RS_INT_TBL) && 923 (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) { 924 /* 925 * Here we don't want to paw thru 926 * a big table, we have everything 927 * from 1Meg - 1000Meg in 1Meg increments. 928 * Use an alternate method to "lookup". 929 */ 930 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags)); 931 } 932 if ((flags & RS_PACING_LT) || 933 (flags & RS_PACING_EXACT_MATCH)) { 934 /* 935 * For exact and less than we go forward through the table. 936 * This way when we find one larger we stop (exact was a 937 * toss up). 938 */ 939 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) { 940 if ((flags & RS_PACING_EXACT_MATCH) && 941 (bytes_per_sec == rs->rs_rlt[i].rate)) { 942 rte = &rs->rs_rlt[i]; 943 matched = 1; 944 break; 945 } else if ((flags & RS_PACING_LT) && 946 (bytes_per_sec <= rs->rs_rlt[i].rate)) { 947 rte = &rs->rs_rlt[i]; 948 matched = 1; 949 break; 950 } 951 if (bytes_per_sec > rs->rs_rlt[i].rate) 952 break; 953 } 954 if ((matched == 0) && 955 (flags & RS_PACING_LT) && 956 (flags & RS_PACING_SUB_OK)) { 957 /* Kick in a substitute (the lowest) */ 958 rte = &rs->rs_rlt[rs->rs_lowest_valid]; 959 } 960 } else { 961 /* 962 * Here we go backward through the table so that we can find 963 * the one greater in theory faster (but its probably a 964 * wash). 965 */ 966 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) { 967 if (rs->rs_rlt[i].rate > bytes_per_sec) { 968 /* A possible candidate */ 969 rte = &rs->rs_rlt[i]; 970 } 971 if ((flags & RS_PACING_GEQ) && 972 (bytes_per_sec == rs->rs_rlt[i].rate)) { 973 /* An exact match and we want equal */ 974 matched = 1; 975 rte = &rs->rs_rlt[i]; 976 break; 977 } else if (rte) { 978 /* 979 * Found one that is larger than but don't 980 * stop, there may be a more closer match. 981 */ 982 matched = 1; 983 } 984 if (rs->rs_rlt[i].rate < bytes_per_sec) { 985 /* 986 * We found a table entry that is smaller, 987 * stop there will be none greater or equal. 988 */ 989 break; 990 } 991 } 992 if ((matched == 0) && 993 (flags & RS_PACING_SUB_OK)) { 994 /* Kick in a substitute (the highest) */ 995 rte = &rs->rs_rlt[rs->rs_highest_valid]; 996 } 997 } 998 return (rte); 999 } 1000 1001 static struct ifnet * 1002 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) 1003 { 1004 struct ifnet *tifp; 1005 struct m_snd_tag *tag; 1006 union if_snd_tag_alloc_params params = { 1007 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, 1008 .rate_limit.hdr.flowid = 1, 1009 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 1010 .rate_limit.max_rate = COMMON_RATE, 1011 .rate_limit.flags = M_NOWAIT, 1012 }; 1013 int err; 1014 #ifdef RSS 1015 params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ? 1016 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4); 1017 #else 1018 params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH; 1019 #endif 1020 tag = NULL; 1021 if (ifp->if_snd_tag_alloc) { 1022 if (error) 1023 *error = ENODEV; 1024 return (NULL); 1025 } 1026 err = ifp->if_snd_tag_alloc(ifp, ¶ms, &tag); 1027 if (err) { 1028 /* Failed to setup a tag? */ 1029 if (error) 1030 *error = err; 1031 return (NULL); 1032 } 1033 tifp = tag->ifp; 1034 tifp->if_snd_tag_free(tag); 1035 return (tifp); 1036 } 1037 1038 static const struct tcp_hwrate_limit_table * 1039 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec, 1040 uint32_t flags, int *error) 1041 { 1042 /* First lets find the interface if it exists */ 1043 const struct tcp_hwrate_limit_table *rte; 1044 struct tcp_rate_set *rs; 1045 struct epoch_tracker et; 1046 int err; 1047 1048 NET_EPOCH_ENTER(et); 1049 use_real_interface: 1050 CK_LIST_FOREACH(rs, &int_rs, next) { 1051 /* 1052 * Note we don't look with the lock since we either see a 1053 * new entry or will get one when we try to add it. 1054 */ 1055 if (rs->rs_flags & RS_IS_DEAD) { 1056 /* The dead are not looked at */ 1057 continue; 1058 } 1059 if ((rs->rs_ifp == ifp) && 1060 (rs->rs_if_dunit == ifp->if_dunit)) { 1061 /* Ok we found it */ 1062 break; 1063 } 1064 } 1065 if ((rs == NULL) || 1066 (rs->rs_flags & RS_INTF_NO_SUP) || 1067 (rs->rs_flags & RS_IS_DEAD)) { 1068 /* 1069 * This means we got a packet *before* 1070 * the IF-UP was processed below, <or> 1071 * while or after we already received an interface 1072 * departed event. In either case we really don't 1073 * want to do anything with pacing, in 1074 * the departing case the packet is not 1075 * going to go very far. The new case 1076 * might be arguable, but its impossible 1077 * to tell from the departing case. 1078 */ 1079 if (rs->rs_disable && error) 1080 *error = ENODEV; 1081 NET_EPOCH_EXIT(et); 1082 return (NULL); 1083 } 1084 1085 if ((rs == NULL) || (rs->rs_disable != 0)) { 1086 if (rs->rs_disable && error) 1087 *error = ENOSPC; 1088 NET_EPOCH_EXIT(et); 1089 return (NULL); 1090 } 1091 if (rs->rs_flags & RS_IS_DEFF) { 1092 /* We need to find the real interface */ 1093 struct ifnet *tifp; 1094 1095 tifp = rt_find_real_interface(ifp, inp, error); 1096 if (tifp == NULL) { 1097 if (rs->rs_disable && error) 1098 *error = ENOTSUP; 1099 NET_EPOCH_EXIT(et); 1100 return (NULL); 1101 } 1102 goto use_real_interface; 1103 } 1104 if (rs->rs_flow_limit && 1105 ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) { 1106 if (error) 1107 *error = ENOSPC; 1108 NET_EPOCH_EXIT(et); 1109 return (NULL); 1110 } 1111 rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); 1112 if (rte) { 1113 err = in_pcbattach_txrtlmt(inp, rs->rs_ifp, 1114 inp->inp_flowtype, 1115 inp->inp_flowid, 1116 rte->rate, 1117 &inp->inp_snd_tag); 1118 if (err) { 1119 /* Failed to attach */ 1120 if (error) 1121 *error = err; 1122 rte = NULL; 1123 } 1124 } 1125 if (rte) { 1126 /* 1127 * We use an atomic here for accounting so we don't have to 1128 * use locks when freeing. 1129 */ 1130 atomic_add_64(&rs->rs_flows_using, 1); 1131 } 1132 NET_EPOCH_EXIT(et); 1133 return (rte); 1134 } 1135 1136 static void 1137 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state) 1138 { 1139 int error; 1140 struct tcp_rate_set *rs; 1141 1142 if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) || 1143 (link_state != LINK_STATE_UP)) { 1144 /* 1145 * We only care on an interface going up that is rate-limit 1146 * capable. 1147 */ 1148 return; 1149 } 1150 mtx_lock(&rs_mtx); 1151 CK_LIST_FOREACH(rs, &int_rs, next) { 1152 if ((rs->rs_ifp == ifp) && 1153 (rs->rs_if_dunit == ifp->if_dunit)) { 1154 /* We already have initialized this guy */ 1155 mtx_unlock(&rs_mtx); 1156 return; 1157 } 1158 } 1159 mtx_unlock(&rs_mtx); 1160 rt_setup_new_rs(ifp, &error); 1161 } 1162 1163 static void 1164 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) 1165 { 1166 struct tcp_rate_set *rs, *nrs; 1167 struct ifnet *tifp; 1168 int i; 1169 1170 mtx_lock(&rs_mtx); 1171 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { 1172 if ((rs->rs_ifp == ifp) && 1173 (rs->rs_if_dunit == ifp->if_dunit)) { 1174 CK_LIST_REMOVE(rs, next); 1175 rs_number_alive--; 1176 rs->rs_flags |= RS_IS_DEAD; 1177 for (i = 0; i < rs->rs_rate_cnt; i++) { 1178 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 1179 tifp = rs->rs_rlt[i].tag->ifp; 1180 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag); 1181 rs->rs_rlt[i].tag = NULL; 1182 } 1183 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 1184 } 1185 if (rs->rs_flows_using == 0) 1186 rs_defer_destroy(rs); 1187 break; 1188 } 1189 } 1190 mtx_unlock(&rs_mtx); 1191 } 1192 1193 static void 1194 tcp_rl_shutdown(void *arg __unused, int howto __unused) 1195 { 1196 struct tcp_rate_set *rs, *nrs; 1197 struct ifnet *tifp; 1198 int i; 1199 1200 mtx_lock(&rs_mtx); 1201 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { 1202 CK_LIST_REMOVE(rs, next); 1203 rs_number_alive--; 1204 rs->rs_flags |= RS_IS_DEAD; 1205 for (i = 0; i < rs->rs_rate_cnt; i++) { 1206 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 1207 tifp = rs->rs_rlt[i].tag->ifp; 1208 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag); 1209 rs->rs_rlt[i].tag = NULL; 1210 } 1211 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 1212 } 1213 if (rs->rs_flows_using == 0) 1214 rs_defer_destroy(rs); 1215 } 1216 mtx_unlock(&rs_mtx); 1217 } 1218 1219 const struct tcp_hwrate_limit_table * 1220 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, 1221 uint64_t bytes_per_sec, int flags, int *error) 1222 { 1223 const struct tcp_hwrate_limit_table *rte; 1224 1225 if (tp->t_inpcb->inp_snd_tag == NULL) { 1226 /* 1227 * We are setting up a rate for the first time. 1228 */ 1229 if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) { 1230 /* Not supported by the egress */ 1231 if (error) 1232 *error = ENODEV; 1233 return (NULL); 1234 } 1235 #ifdef KERN_TLS 1236 if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 1237 /* 1238 * We currently can't do both TLS and hardware 1239 * pacing 1240 */ 1241 if (error) 1242 *error = EINVAL; 1243 return (NULL); 1244 } 1245 #endif 1246 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error); 1247 } else { 1248 /* 1249 * We are modifying a rate, wrong interface? 1250 */ 1251 if (error) 1252 *error = EINVAL; 1253 rte = NULL; 1254 } 1255 *error = 0; 1256 return (rte); 1257 } 1258 1259 const struct tcp_hwrate_limit_table * 1260 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, 1261 struct tcpcb *tp, struct ifnet *ifp, 1262 uint64_t bytes_per_sec, int flags, int *error) 1263 { 1264 const struct tcp_hwrate_limit_table *nrte; 1265 const struct tcp_rate_set *rs; 1266 int is_indirect = 0; 1267 int err; 1268 1269 1270 if ((tp->t_inpcb->inp_snd_tag == NULL) || 1271 (crte == NULL)) { 1272 /* Wrong interface */ 1273 if (error) 1274 *error = EINVAL; 1275 return (NULL); 1276 } 1277 rs = crte->ptbl; 1278 if ((rs->rs_flags & RS_IS_DEAD) || 1279 (crte->flags & HDWRPACE_IFPDEPARTED)) { 1280 /* Release the rate, and try anew */ 1281 re_rate: 1282 tcp_rel_pacing_rate(crte, tp); 1283 nrte = tcp_set_pacing_rate(tp, ifp, 1284 bytes_per_sec, flags, error); 1285 return (nrte); 1286 } 1287 if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) 1288 is_indirect = 1; 1289 else 1290 is_indirect = 0; 1291 if ((is_indirect == 0) && 1292 ((ifp != rs->rs_ifp) || 1293 (ifp->if_dunit != rs->rs_if_dunit))) { 1294 /* 1295 * Something changed, the user is not pointing to the same 1296 * ifp? Maybe a route updated on this guy? 1297 */ 1298 goto re_rate; 1299 } else if (is_indirect) { 1300 /* 1301 * For indirect we have to dig in and find the real interface. 1302 */ 1303 struct ifnet *rifp; 1304 1305 rifp = rt_find_real_interface(ifp, tp->t_inpcb, error); 1306 if (rifp == NULL) { 1307 /* Can't find it? */ 1308 goto re_rate; 1309 } 1310 if ((rifp != rs->rs_ifp) || 1311 (ifp->if_dunit != rs->rs_if_dunit)) { 1312 goto re_rate; 1313 } 1314 } 1315 nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); 1316 if (nrte == crte) { 1317 /* No change */ 1318 if (error) 1319 *error = 0; 1320 return (crte); 1321 } 1322 if (nrte == NULL) { 1323 /* Release the old rate */ 1324 tcp_rel_pacing_rate(crte, tp); 1325 return (NULL); 1326 } 1327 /* Change rates to our new entry */ 1328 err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate); 1329 if (err) { 1330 if (error) 1331 *error = err; 1332 return (NULL); 1333 } 1334 if (error) 1335 *error = 0; 1336 return (nrte); 1337 } 1338 1339 void 1340 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp) 1341 { 1342 const struct tcp_rate_set *crs; 1343 struct tcp_rate_set *rs; 1344 uint64_t pre; 1345 1346 crs = crte->ptbl; 1347 /* 1348 * Now we must break the const 1349 * in order to release our refcount. 1350 */ 1351 rs = __DECONST(struct tcp_rate_set *, crs); 1352 pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); 1353 if (pre == 1) { 1354 mtx_lock(&rs_mtx); 1355 /* 1356 * Is it dead? 1357 */ 1358 if (rs->rs_flags & RS_IS_DEAD) 1359 rs_defer_destroy(rs); 1360 mtx_unlock(&rs_mtx); 1361 } 1362 in_pcbdetach_txrtlmt(tp->t_inpcb); 1363 } 1364 1365 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ 1366 #define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */ 1367 #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */ 1368 #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */ 1369 1370 1371 uint32_t 1372 tcp_get_pacing_burst_size (uint64_t bw, uint32_t segsiz, int can_use_1mss, 1373 const struct tcp_hwrate_limit_table *te, int *err) 1374 { 1375 /* 1376 * We use the google formula to calculate the 1377 * TSO size. I.E. 1378 * bw < 24Meg 1379 * tso = 2mss 1380 * else 1381 * tso = min(bw/1000, 64k) 1382 * 1383 * Note for these calculations we ignore the 1384 * packet overhead (enet hdr, ip hdr and tcp hdr). 1385 */ 1386 uint64_t lentim, res, bytes; 1387 uint32_t new_tso, min_tso_segs; 1388 1389 bytes = bw / 1000; 1390 if (bytes > (64 * 1000)) 1391 bytes = 64 * 1000; 1392 /* Round up */ 1393 new_tso = (bytes + segsiz - 1) / segsiz; 1394 if (can_use_1mss && (bw < ONE_POINT_TWO_MEG)) 1395 min_tso_segs = 1; 1396 else 1397 min_tso_segs = 2; 1398 if (new_tso < min_tso_segs) 1399 new_tso = min_tso_segs; 1400 if (new_tso > MAX_MSS_SENT) 1401 new_tso = MAX_MSS_SENT; 1402 new_tso *= segsiz; 1403 /* 1404 * If we are not doing hardware pacing 1405 * then we are done. 1406 */ 1407 if (te == NULL) { 1408 if (err) 1409 *err = 0; 1410 return(new_tso); 1411 } 1412 /* 1413 * For hardware pacing we look at the 1414 * rate you are sending at and compare 1415 * that to the rate you have in hardware. 1416 * 1417 * If the hardware rate is slower than your 1418 * software rate then you are in error and 1419 * we will build a queue in our hardware whic 1420 * is probably not desired, in such a case 1421 * just return the non-hardware TSO size. 1422 * 1423 * If the rate in hardware is faster (which 1424 * it should be) then look at how long it 1425 * takes to send one ethernet segment size at 1426 * your b/w and compare that to the time it 1427 * takes to send at the rate you had selected. 1428 * 1429 * If your time is greater (which we hope it is) 1430 * we get the delta between the two, and then 1431 * divide that into your pacing time. This tells 1432 * us how many MSS you can send down at once (rounded up). 1433 * 1434 * Note we also double this value if the b/w is over 1435 * 100Mbps. If its over 500meg we just set you to the 1436 * max (43 segments). 1437 */ 1438 if (te->rate > FIVE_HUNDRED_MBPS) 1439 return (segsiz * MAX_MSS_SENT); 1440 if (te->rate == bw) { 1441 /* We are pacing at exactly the hdwr rate */ 1442 return (segsiz * MAX_MSS_SENT); 1443 } 1444 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; 1445 res = lentim / bw; 1446 if (res > te->time_between) { 1447 uint32_t delta, segs; 1448 1449 delta = res - te->time_between; 1450 segs = (res + delta - 1)/delta; 1451 if (te->rate > ONE_HUNDRED_MBPS) 1452 segs *= 2; 1453 if (segs < min_tso_segs) 1454 segs = min_tso_segs; 1455 if (segs > MAX_MSS_SENT) 1456 segs = MAX_MSS_SENT; 1457 segs *= segsiz; 1458 if (err) 1459 *err = 0; 1460 if (segs < new_tso) { 1461 /* unexpected ? */ 1462 return(new_tso); 1463 } else { 1464 return (segs); 1465 } 1466 } else { 1467 /* 1468 * Your time is smaller which means 1469 * we will grow a queue on our 1470 * hardware. Send back the non-hardware 1471 * rate. 1472 */ 1473 if (err) 1474 *err = -1; 1475 return (new_tso); 1476 } 1477 } 1478 1479 static eventhandler_tag rl_ifnet_departs; 1480 static eventhandler_tag rl_ifnet_arrives; 1481 static eventhandler_tag rl_shutdown_start; 1482 1483 static void 1484 tcp_rs_init(void *st __unused) 1485 { 1486 CK_LIST_INIT(&int_rs); 1487 rs_number_alive = 0; 1488 rs_number_dead = 0; 1489 mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); 1490 rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, 1491 tcp_rl_ifnet_departure, 1492 NULL, EVENTHANDLER_PRI_ANY); 1493 rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event, 1494 tcp_rl_ifnet_link, 1495 NULL, EVENTHANDLER_PRI_ANY); 1496 rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync, 1497 tcp_rl_shutdown, NULL, 1498 SHUTDOWN_PRI_FIRST); 1499 printf("TCP_ratelimit: Is now initialized\n"); 1500 } 1501 1502 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL); 1503 #endif 1504