1 /*- 2 * 3 * SPDX-License-Identifier: BSD-3-Clause 4 * 5 * Copyright (c) 2018-2019 6 * Netflix Inc. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 */ 31 /** 32 * Author: Randall Stewart <rrs@netflix.com> 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 #include "opt_inet.h" 38 #include "opt_inet6.h" 39 #include "opt_ipsec.h" 40 #include "opt_tcpdebug.h" 41 #include "opt_ratelimit.h" 42 #include <sys/param.h> 43 #include <sys/kernel.h> 44 #include <sys/malloc.h> 45 #include <sys/mbuf.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/sysctl.h> 49 #include <sys/eventhandler.h> 50 #include <sys/mutex.h> 51 #include <sys/ck.h> 52 #include <net/if.h> 53 #include <net/if_var.h> 54 #include <netinet/in.h> 55 #include <netinet/in_pcb.h> 56 #define TCPSTATES /* for logging */ 57 #include <netinet/tcp_var.h> 58 #ifdef INET6 59 #include <netinet6/tcp6_var.h> 60 #endif 61 #include <netinet/tcp_ratelimit.h> 62 #ifndef USECS_IN_SECOND 63 #define USECS_IN_SECOND 1000000 64 #endif 65 /* 66 * For the purposes of each send, what is the size 67 * of an ethernet frame. 68 */ 69 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory"); 70 #ifdef RATELIMIT 71 72 /* 73 * The following preferred table will seem weird to 74 * the casual viewer. Why do we not have any rates below 75 * 1Mbps? Why do we have a rate at 1.44Mbps called common? 76 * Why do the rates cluster in the 1-100Mbps range more 77 * than others? Why does the table jump around at the beginnign 78 * and then be more consistently raising? 79 * 80 * Let me try to answer those questions. A lot of 81 * this is dependant on the hardware. We have three basic 82 * supporters of rate limiting 83 * 84 * Chelsio - Supporting 16 configurable rates. 85 * Mlx - c4 supporting 13 fixed rates. 86 * Mlx - c5 & c6 supporting 127 configurable rates. 87 * 88 * The c4 is why we have a common rate that is available 89 * in all rate tables. This is a selected rate from the 90 * c4 table and we assure its available in all ratelimit 91 * tables. This way the tcp_ratelimit code has an assured 92 * rate it should always be able to get. This answers a 93 * couple of the questions above. 94 * 95 * So what about the rest, well the table is built to 96 * try to get the most out of a joint hardware/software 97 * pacing system. The software pacer will always pick 98 * a rate higher than the b/w that it is estimating 99 * 100 * on the path. This is done for two reasons. 101 * a) So we can discover more b/w 102 * and 103 * b) So we can send a block of MSS's down and then 104 * have the software timer go off after the previous 105 * send is completely out of the hardware. 106 * 107 * But when we do <b> we don't want to have the delay 108 * between the last packet sent by the hardware be 109 * excessively long (to reach our desired rate). 110 * 111 * So let me give an example for clarity. 112 * 113 * Lets assume that the tcp stack sees that 29,110,000 bps is 114 * what the bw of the path is. The stack would select the 115 * rate 31Mbps. 31Mbps means that each send that is done 116 * by the hardware will cause a 387 micro-second gap between 117 * the pacets sent at that rate. For 29,110,000 bps we 118 * would need 412 micro-seconds gap between each send. 119 * 120 * Now we pick a MSS size based on the delta between the 121 * two rates (412 - 387) divided into the rate we really 122 * wish to send at rounded up. That results in a MSS 123 * send of 17 mss's at once. The hardware then will 124 * run out of data in a single 17MSS send in 6,579 micro-seconds. 125 * On the other hand the software pacer will send more data 126 * in 7,004 micro-seconds. This means that we will refill 127 * the hardware 25 microseconds after it would have sent 128 * next. This is a win since we no are only sending every 129 * 7ms or so and yet all the packets are spaced on 130 * the wire with 94% of what they should be and only 131 * the last packet is delayed extra to make up for the 132 * difference. Note that the above formula has two 133 * important caveat. If we are above (b/w wise) over 134 * 100Mbps we double the result of the MSS calculation. 135 * The second caveat is if we are 500Mbps or more 136 * we just send the maximum MSS at once i.e. 45MSS 137 * 138 */ 139 #define COMMON_RATE 180500 140 uint64_t desired_rates[] = { 141 122500, /* 1Mbps - rate 1 */ 142 180500, /* 1.44Mpbs - rate 2 common rate */ 143 375000, /* 3Mbps - rate 3 */ 144 625000, /* 5Mbps - rate 4 */ 145 875000, /* 7Mbps - rate 5 */ 146 1125000, /* 9Mbps - rate 6 */ 147 1375000, /* 11Mbps - rate 7 */ 148 1625000, /* 13Mbps - rate 8 */ 149 2625000, /* 21Mbps - rate 9 */ 150 3875000, /* 31Mbps - rate 10 */ 151 5125000, /* 41Meg - rate 11 */ 152 12500000, /* 100Mbps - rate 12 */ 153 25000000, /* 200Mbps - rate 13 */ 154 50000000, /* 400Mbps - rate 14 */ 155 63750000, /* 51Mbps - rate 15 */ 156 100000000, /* 800Mbps - rate 16 */ 157 1875000, /* 15Mbps - rate 17 */ 158 2125000, /* 17Mbps - rate 18 */ 159 2375000, /* 19Mbps - rate 19 */ 160 2875000, /* 23Mbps - rate 20 */ 161 3125000, /* 25Mbps - rate 21 */ 162 3375000, /* 27Mbps - rate 22 */ 163 3625000, /* 29Mbps - rate 23 */ 164 4125000, /* 33Mbps - rate 24 */ 165 4375000, /* 35Mbps - rate 25 */ 166 4625000, /* 37Mbps - rate 26 */ 167 4875000, /* 39Mbps - rate 27 */ 168 5375000, /* 43Mbps - rate 28 */ 169 5625000, /* 45Mbps - rate 29 */ 170 5875000, /* 47Mbps - rate 30 */ 171 6125000, /* 49Mbps - rate 31 */ 172 6625000, /* 53Mbps - rate 32 */ 173 6875000, /* 55Mbps - rate 33 */ 174 7125000, /* 57Mbps - rate 34 */ 175 7375000, /* 59Mbps - rate 35 */ 176 7625000, /* 61Mbps - rate 36 */ 177 7875000, /* 63Mbps - rate 37 */ 178 8125000, /* 65Mbps - rate 38 */ 179 8375000, /* 67Mbps - rate 39 */ 180 8625000, /* 69Mbps - rate 40 */ 181 8875000, /* 71Mbps - rate 41 */ 182 9125000, /* 73Mbps - rate 42 */ 183 9375000, /* 75Mbps - rate 43 */ 184 9625000, /* 77Mbps - rate 44 */ 185 9875000, /* 79Mbps - rate 45 */ 186 10125000, /* 81Mbps - rate 46 */ 187 10375000, /* 83Mbps - rate 47 */ 188 10625000, /* 85Mbps - rate 48 */ 189 10875000, /* 87Mbps - rate 49 */ 190 11125000, /* 89Mbps - rate 50 */ 191 11375000, /* 91Mbps - rate 51 */ 192 11625000, /* 93Mbps - rate 52 */ 193 11875000, /* 95Mbps - rate 53 */ 194 13125000, /* 105Mbps - rate 54 */ 195 13750000, /* 110Mbps - rate 55 */ 196 14375000, /* 115Mbps - rate 56 */ 197 15000000, /* 120Mbps - rate 57 */ 198 15625000, /* 125Mbps - rate 58 */ 199 16250000, /* 130Mbps - rate 59 */ 200 16875000, /* 135Mbps - rate 60 */ 201 17500000, /* 140Mbps - rate 61 */ 202 18125000, /* 145Mbps - rate 62 */ 203 18750000, /* 150Mbps - rate 64 */ 204 20000000, /* 160Mbps - rate 65 */ 205 21250000, /* 170Mbps - rate 66 */ 206 22500000, /* 180Mbps - rate 67 */ 207 23750000, /* 190Mbps - rate 68 */ 208 26250000, /* 210Mbps - rate 69 */ 209 27500000, /* 220Mbps - rate 70 */ 210 28750000, /* 230Mbps - rate 71 */ 211 30000000, /* 240Mbps - rate 72 */ 212 31250000, /* 250Mbps - rate 73 */ 213 34375000, /* 275Mbps - rate 74 */ 214 37500000, /* 300Mbps - rate 75 */ 215 40625000, /* 325Mbps - rate 76 */ 216 43750000, /* 350Mbps - rate 77 */ 217 46875000, /* 375Mbps - rate 78 */ 218 53125000, /* 425Mbps - rate 79 */ 219 56250000, /* 450Mbps - rate 80 */ 220 59375000, /* 475Mbps - rate 81 */ 221 62500000, /* 500Mbps - rate 82 */ 222 68750000, /* 550Mbps - rate 83 */ 223 75000000, /* 600Mbps - rate 84 */ 224 81250000, /* 650Mbps - rate 85 */ 225 87500000, /* 700Mbps - rate 86 */ 226 93750000, /* 750Mbps - rate 87 */ 227 106250000, /* 850Mbps - rate 88 */ 228 112500000, /* 900Mbps - rate 89 */ 229 125000000, /* 1Gbps - rate 90 */ 230 156250000, /* 1.25Gps - rate 91 */ 231 187500000, /* 1.5Gps - rate 92 */ 232 218750000, /* 1.75Gps - rate 93 */ 233 250000000, /* 2Gbps - rate 94 */ 234 281250000, /* 2.25Gps - rate 95 */ 235 312500000, /* 2.5Gbps - rate 96 */ 236 343750000, /* 2.75Gbps - rate 97 */ 237 375000000, /* 3Gbps - rate 98 */ 238 500000000, /* 4Gbps - rate 99 */ 239 625000000, /* 5Gbps - rate 100 */ 240 750000000, /* 6Gbps - rate 101 */ 241 875000000, /* 7Gbps - rate 102 */ 242 1000000000, /* 8Gbps - rate 103 */ 243 1125000000, /* 9Gbps - rate 104 */ 244 1250000000, /* 10Gbps - rate 105 */ 245 1875000000, /* 15Gbps - rate 106 */ 246 2500000000 /* 20Gbps - rate 107 */ 247 }; 248 249 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) 250 #define RS_ORDERED_COUNT 16 /* 251 * Number that are in order 252 * at the beginning of the table, 253 * over this a sort is required. 254 */ 255 #define RS_NEXT_ORDER_GROUP 16 /* 256 * The point in our table where 257 * we come fill in a second ordered 258 * group (index wise means -1). 259 */ 260 #define ALL_HARDWARE_RATES 1004 /* 261 * 1Meg - 1Gig in 1 Meg steps 262 * plus 100, 200k and 500k and 263 * 10Gig 264 */ 265 266 #define RS_ONE_MEGABIT_PERSEC 1000000 267 #define RS_ONE_GIGABIT_PERSEC 1000000000 268 #define RS_TEN_GIGABIT_PERSEC 10000000000 269 270 static struct head_tcp_rate_set int_rs; 271 static struct mtx rs_mtx; 272 uint32_t rs_number_alive; 273 uint32_t rs_number_dead; 274 275 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0, 276 "TCP Ratelimit stats"); 277 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW, 278 &rs_number_alive, 0, 279 "Number of interfaces initialized for ratelimiting"); 280 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, 281 &rs_number_dead, 0, 282 "Number of interfaces departing from ratelimiting"); 283 284 static void 285 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) 286 { 287 /* 288 * Add sysctl entries for thus interface. 289 */ 290 if (rs->rs_flags & RS_INTF_NO_SUP) { 291 SYSCTL_ADD_S32(&rs->sysctl_ctx, 292 SYSCTL_CHILDREN(rl_sysctl_root), 293 OID_AUTO, "disable", CTLFLAG_RD, 294 &rs->rs_disable, 0, 295 "Disable this interface from new hdwr limiting?"); 296 } else { 297 SYSCTL_ADD_S32(&rs->sysctl_ctx, 298 SYSCTL_CHILDREN(rl_sysctl_root), 299 OID_AUTO, "disable", CTLFLAG_RW, 300 &rs->rs_disable, 0, 301 "Disable this interface from new hdwr limiting?"); 302 } 303 SYSCTL_ADD_S32(&rs->sysctl_ctx, 304 SYSCTL_CHILDREN(rl_sysctl_root), 305 OID_AUTO, "minseg", CTLFLAG_RW, 306 &rs->rs_min_seg, 0, 307 "What is the minimum we need to send on this interface?"); 308 SYSCTL_ADD_U64(&rs->sysctl_ctx, 309 SYSCTL_CHILDREN(rl_sysctl_root), 310 OID_AUTO, "flow_limit", CTLFLAG_RW, 311 &rs->rs_flow_limit, 0, 312 "What is the limit for number of flows (0=unlimited)?"); 313 SYSCTL_ADD_S32(&rs->sysctl_ctx, 314 SYSCTL_CHILDREN(rl_sysctl_root), 315 OID_AUTO, "highest", CTLFLAG_RD, 316 &rs->rs_highest_valid, 0, 317 "Highest valid rate"); 318 SYSCTL_ADD_S32(&rs->sysctl_ctx, 319 SYSCTL_CHILDREN(rl_sysctl_root), 320 OID_AUTO, "lowest", CTLFLAG_RD, 321 &rs->rs_lowest_valid, 0, 322 "Lowest valid rate"); 323 SYSCTL_ADD_S32(&rs->sysctl_ctx, 324 SYSCTL_CHILDREN(rl_sysctl_root), 325 OID_AUTO, "flags", CTLFLAG_RD, 326 &rs->rs_flags, 0, 327 "What lags are on the entry?"); 328 SYSCTL_ADD_S32(&rs->sysctl_ctx, 329 SYSCTL_CHILDREN(rl_sysctl_root), 330 OID_AUTO, "numrates", CTLFLAG_RD, 331 &rs->rs_rate_cnt, 0, 332 "How many rates re there?"); 333 SYSCTL_ADD_U64(&rs->sysctl_ctx, 334 SYSCTL_CHILDREN(rl_sysctl_root), 335 OID_AUTO, "flows_using", CTLFLAG_RD, 336 &rs->rs_flows_using, 0, 337 "How many flows are using this interface now?"); 338 #ifdef DETAILED_RATELIMIT_SYSCTL 339 if (rs->rs_rlt && rs->rs_rate_cnt > 0) { 340 /* Lets display the rates */ 341 int i; 342 struct sysctl_oid *rl_rates; 343 struct sysctl_oid *rl_rate_num; 344 char rate_num[16]; 345 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 346 SYSCTL_CHILDREN(rl_sysctl_root), 347 OID_AUTO, 348 "rate", 349 CTLFLAG_RW, 0, 350 "Ratelist"); 351 for( i = 0; i < rs->rs_rate_cnt; i++) { 352 sprintf(rate_num, "%d", i); 353 rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 354 SYSCTL_CHILDREN(rl_rates), 355 OID_AUTO, 356 rate_num, 357 CTLFLAG_RW, 0, 358 "Individual Rate"); 359 SYSCTL_ADD_U32(&rs->sysctl_ctx, 360 SYSCTL_CHILDREN(rl_rate_num), 361 OID_AUTO, "flags", CTLFLAG_RD, 362 &rs->rs_rlt[i].flags, 0, 363 "Flags on this rate"); 364 SYSCTL_ADD_U32(&rs->sysctl_ctx, 365 SYSCTL_CHILDREN(rl_rate_num), 366 OID_AUTO, "pacetime", CTLFLAG_RD, 367 &rs->rs_rlt[i].time_between, 0, 368 "Time hardware inserts between 1500 byte sends"); 369 SYSCTL_ADD_U64(&rs->sysctl_ctx, 370 SYSCTL_CHILDREN(rl_rate_num), 371 OID_AUTO, "rate", CTLFLAG_RD, 372 &rs->rs_rlt[i].rate, 0, 373 "Rate in bytes per second"); 374 } 375 } 376 #endif 377 } 378 379 static void 380 rs_destroy(epoch_context_t ctx) 381 { 382 struct tcp_rate_set *rs; 383 bool do_free_rs; 384 385 rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx); 386 387 mtx_lock(&rs_mtx); 388 rs->rs_flags &= ~RS_FUNERAL_SCHD; 389 /* 390 * In theory its possible (but unlikely) 391 * that while the delete was occuring 392 * and we were applying the DEAD flag 393 * someone slipped in and found the 394 * interface in a lookup. While we 395 * decided rs_flows_using were 0 and 396 * scheduling the epoch_call, the other 397 * thread incremented rs_flow_using. This 398 * is because users have a pointer and 399 * we only use the rs_flows_using in an 400 * atomic fashion, i.e. the other entities 401 * are not protected. To assure this did 402 * not occur, we check rs_flows_using here 403 * before deleting. 404 */ 405 do_free_rs = (rs->rs_flows_using == 0); 406 rs_number_dead--; 407 mtx_unlock(&rs_mtx); 408 409 if (do_free_rs) { 410 sysctl_ctx_free(&rs->sysctl_ctx); 411 free(rs->rs_rlt, M_TCPPACE); 412 free(rs, M_TCPPACE); 413 } 414 } 415 416 static void 417 rs_defer_destroy(struct tcp_rate_set *rs) 418 { 419 420 mtx_assert(&rs_mtx, MA_OWNED); 421 422 /* Check if already pending. */ 423 if (rs->rs_flags & RS_FUNERAL_SCHD) 424 return; 425 426 rs_number_dead++; 427 428 /* Set flag to only defer once. */ 429 rs->rs_flags |= RS_FUNERAL_SCHD; 430 NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx); 431 } 432 433 #ifdef INET 434 extern counter_u64_t rate_limit_set_ok; 435 extern counter_u64_t rate_limit_active; 436 extern counter_u64_t rate_limit_alloc_fail; 437 #endif 438 439 static int 440 rl_attach_txrtlmt(struct ifnet *ifp, 441 uint32_t flowtype, 442 int flowid, 443 uint64_t cfg_rate, 444 struct m_snd_tag **tag) 445 { 446 int error; 447 union if_snd_tag_alloc_params params = { 448 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, 449 .rate_limit.hdr.flowid = flowid, 450 .rate_limit.hdr.flowtype = flowtype, 451 .rate_limit.max_rate = cfg_rate, 452 .rate_limit.flags = M_NOWAIT, 453 }; 454 455 if (ifp->if_snd_tag_alloc == NULL) { 456 error = EOPNOTSUPP; 457 } else { 458 error = ifp->if_snd_tag_alloc(ifp, ¶ms, tag); 459 #ifdef INET 460 if (error == 0) { 461 if_ref((*tag)->ifp); 462 counter_u64_add(rate_limit_set_ok, 1); 463 counter_u64_add(rate_limit_active, 1); 464 } else 465 counter_u64_add(rate_limit_alloc_fail, 1); 466 #endif 467 } 468 return (error); 469 } 470 471 static void 472 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act) 473 { 474 /* 475 * The internal table is "special", it 476 * is two seperate ordered tables that 477 * must be merged. We get here when the 478 * adapter specifies a number of rates that 479 * covers both ranges in the table in some 480 * form. 481 */ 482 int i, at_low, at_high; 483 uint8_t low_disabled = 0, high_disabled = 0; 484 485 for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) { 486 rs->rs_rlt[i].flags = 0; 487 rs->rs_rlt[i].time_between = 0; 488 if ((low_disabled == 0) && 489 (high_disabled || 490 (rate_table_act[at_low] < rate_table_act[at_high]))) { 491 rs->rs_rlt[i].rate = rate_table_act[at_low]; 492 at_low++; 493 if (at_low == RS_NEXT_ORDER_GROUP) 494 low_disabled = 1; 495 } else if (high_disabled == 0) { 496 rs->rs_rlt[i].rate = rate_table_act[at_high]; 497 at_high++; 498 if (at_high == MAX_HDWR_RATES) 499 high_disabled = 1; 500 } 501 } 502 } 503 504 static struct tcp_rate_set * 505 rt_setup_new_rs(struct ifnet *ifp, int *error) 506 { 507 struct tcp_rate_set *rs; 508 const uint64_t *rate_table_act; 509 uint64_t lentim, res; 510 size_t sz; 511 uint32_t hash_type; 512 int i; 513 struct if_ratelimit_query_results rl; 514 struct sysctl_oid *rl_sysctl_root; 515 /* 516 * We expect to enter with the 517 * mutex locked. 518 */ 519 520 if (ifp->if_ratelimit_query == NULL) { 521 /* 522 * We can do nothing if we cannot 523 * get a query back from the driver. 524 */ 525 printf("No query functions for %s:%d-- failed\n", 526 ifp->if_dname, ifp->if_dunit); 527 return (NULL); 528 } 529 rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO); 530 if (rs == NULL) { 531 if (error) 532 *error = ENOMEM; 533 printf("No memory for malloc\n"); 534 return (NULL); 535 } 536 memset(&rl, 0, sizeof(rl)); 537 rl.flags = RT_NOSUPPORT; 538 ifp->if_ratelimit_query(ifp, &rl); 539 printf("if:%s:%d responds with flags:0x%x rate count:%d\n", 540 ifp->if_dname, 541 ifp->if_dunit, 542 rl.flags, rl.number_of_rates); 543 if (rl.flags & RT_IS_UNUSABLE) { 544 /* 545 * The interface does not really support 546 * the rate-limiting. 547 */ 548 memset(rs, 0, sizeof(struct tcp_rate_set)); 549 rs->rs_ifp = ifp; 550 rs->rs_if_dunit = ifp->if_dunit; 551 rs->rs_flags = RS_INTF_NO_SUP; 552 rs->rs_disable = 1; 553 rs_number_alive++; 554 sysctl_ctx_init(&rs->sysctl_ctx); 555 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 556 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 557 OID_AUTO, 558 rs->rs_ifp->if_xname, 559 CTLFLAG_RW, 0, 560 ""); 561 rl_add_syctl_entries(rl_sysctl_root, rs); 562 mtx_lock(&rs_mtx); 563 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 564 mtx_unlock(&rs_mtx); 565 return (rs); 566 } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { 567 memset(rs, 0, sizeof(struct tcp_rate_set)); 568 rs->rs_ifp = ifp; 569 rs->rs_if_dunit = ifp->if_dunit; 570 rs->rs_flags = RS_IS_DEFF; 571 rs_number_alive++; 572 sysctl_ctx_init(&rs->sysctl_ctx); 573 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 574 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 575 OID_AUTO, 576 rs->rs_ifp->if_xname, 577 CTLFLAG_RW, 0, 578 ""); 579 rl_add_syctl_entries(rl_sysctl_root, rs); 580 mtx_lock(&rs_mtx); 581 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 582 mtx_unlock(&rs_mtx); 583 return (rs); 584 } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { 585 /* Mellanox C4 likely */ 586 rs->rs_ifp = ifp; 587 rs->rs_if_dunit = ifp->if_dunit; 588 rs->rs_rate_cnt = rl.number_of_rates; 589 rs->rs_min_seg = rl.min_segment_burst; 590 rs->rs_highest_valid = 0; 591 rs->rs_flow_limit = rl.max_flows; 592 rs->rs_flags = RS_IS_INTF | RS_NO_PRE; 593 rs->rs_disable = 0; 594 rate_table_act = rl.rate_table; 595 } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) { 596 /* Chelsio, C5 and C6 of Mellanox? */ 597 rs->rs_ifp = ifp; 598 rs->rs_if_dunit = ifp->if_dunit; 599 rs->rs_rate_cnt = rl.number_of_rates; 600 rs->rs_min_seg = rl.min_segment_burst; 601 rs->rs_disable = 0; 602 rs->rs_flow_limit = rl.max_flows; 603 rate_table_act = desired_rates; 604 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) && 605 (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) { 606 /* 607 * Our desired table is not big 608 * enough, do what we can. 609 */ 610 rs->rs_rate_cnt = MAX_HDWR_RATES; 611 } 612 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT) 613 rs->rs_flags = RS_IS_INTF; 614 else 615 rs->rs_flags = RS_IS_INTF | RS_INT_TBL; 616 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) 617 rs->rs_rate_cnt = ALL_HARDWARE_RATES; 618 } else { 619 printf("Interface:%s unit:%d not one known to have rate-limits\n", 620 ifp->if_dname, 621 ifp->if_dunit); 622 free(rs, M_TCPPACE); 623 return (NULL); 624 } 625 sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt; 626 rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT); 627 if (rs->rs_rlt == NULL) { 628 if (error) 629 *error = ENOMEM; 630 bail: 631 free(rs, M_TCPPACE); 632 return (NULL); 633 } 634 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) { 635 /* 636 * The interface supports all 637 * the rates we could possibly want. 638 */ 639 uint64_t rat; 640 641 rs->rs_rlt[0].rate = 12500; /* 100k */ 642 rs->rs_rlt[1].rate = 25000; /* 200k */ 643 rs->rs_rlt[2].rate = 62500; /* 500k */ 644 /* Note 125000 == 1Megabit 645 * populate 1Meg - 1000meg. 646 */ 647 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) { 648 rs->rs_rlt[i].rate = rat; 649 rat += 125000; 650 } 651 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000; 652 } else if (rs->rs_flags & RS_INT_TBL) { 653 /* We populate this in a special way */ 654 populate_canned_table(rs, rate_table_act); 655 } else { 656 /* 657 * Just copy in the rates from 658 * the table, it is in order. 659 */ 660 for (i=0; i<rs->rs_rate_cnt; i++) { 661 rs->rs_rlt[i].rate = rate_table_act[i]; 662 rs->rs_rlt[i].time_between = 0; 663 rs->rs_rlt[i].flags = 0; 664 } 665 } 666 for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) { 667 /* 668 * We go backwards through the list so that if we can't get 669 * a rate and fail to init one, we have at least a chance of 670 * getting the highest one. 671 */ 672 rs->rs_rlt[i].ptbl = rs; 673 rs->rs_rlt[i].tag = NULL; 674 /* 675 * Calculate the time between. 676 */ 677 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; 678 res = lentim / rs->rs_rlt[i].rate; 679 if (res > 0) 680 rs->rs_rlt[i].time_between = res; 681 else 682 rs->rs_rlt[i].time_between = 1; 683 if (rs->rs_flags & RS_NO_PRE) { 684 rs->rs_rlt[i].flags = HDWRPACE_INITED; 685 rs->rs_lowest_valid = i; 686 } else { 687 int err; 688 689 if ((rl.flags & RT_IS_SETUP_REQ) && 690 (ifp->if_ratelimit_query)) { 691 err = ifp->if_ratelimit_setup(ifp, 692 rs->rs_rlt[i].rate, i); 693 if (err) 694 goto handle_err; 695 } 696 #ifdef RSS 697 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 698 #else 699 hash_type = M_HASHTYPE_OPAQUE_HASH; 700 #endif 701 err = rl_attach_txrtlmt(ifp, 702 hash_type, 703 (i + 1), 704 rs->rs_rlt[i].rate, 705 &rs->rs_rlt[i].tag); 706 if (err) { 707 handle_err: 708 if (i == (rs->rs_rate_cnt - 1)) { 709 /* 710 * Huh - first rate and we can't get 711 * it? 712 */ 713 free(rs->rs_rlt, M_TCPPACE); 714 if (error) 715 *error = err; 716 goto bail; 717 } else { 718 if (error) 719 *error = err; 720 } 721 break; 722 } else { 723 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT; 724 rs->rs_lowest_valid = i; 725 } 726 } 727 } 728 /* Did we get at least 1 rate? */ 729 if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED) 730 rs->rs_highest_valid = rs->rs_rate_cnt - 1; 731 else { 732 free(rs->rs_rlt, M_TCPPACE); 733 goto bail; 734 } 735 rs_number_alive++; 736 sysctl_ctx_init(&rs->sysctl_ctx); 737 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, 738 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), 739 OID_AUTO, 740 rs->rs_ifp->if_xname, 741 CTLFLAG_RW, 0, 742 ""); 743 rl_add_syctl_entries(rl_sysctl_root, rs); 744 mtx_lock(&rs_mtx); 745 CK_LIST_INSERT_HEAD(&int_rs, rs, next); 746 mtx_unlock(&rs_mtx); 747 return (rs); 748 } 749 750 static const struct tcp_hwrate_limit_table * 751 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs, 752 uint64_t bytes_per_sec, uint32_t flags) 753 { 754 struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; 755 uint64_t mbits_per_sec, ind_calc; 756 int i; 757 758 mbits_per_sec = (bytes_per_sec * 8); 759 if (flags & RS_PACING_LT) { 760 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 761 (rs->rs_lowest_valid <= 2)){ 762 /* 763 * Smaller than 1Meg, only 764 * 3 entries can match it. 765 */ 766 for(i = rs->rs_lowest_valid; i < 3; i++) { 767 if (bytes_per_sec <= rs->rs_rlt[i].rate) { 768 rte = &rs->rs_rlt[i]; 769 break; 770 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { 771 arte = &rs->rs_rlt[i]; 772 } 773 } 774 goto done; 775 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && 776 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ 777 /* 778 * Larger than 1G (the majority of 779 * our table. 780 */ 781 if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC) 782 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 783 else 784 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 785 goto done; 786 } 787 /* 788 * If we reach here its in our table (between 1Meg - 1000Meg), 789 * just take the rounded down mbits per second, and add 790 * 1Megabit to it, from this we can calculate 791 * the index in the table. 792 */ 793 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 794 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec) 795 ind_calc++; 796 /* our table is offset by 3, we add 2 */ 797 ind_calc += 2; 798 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 799 /* This should not happen */ 800 ind_calc = ALL_HARDWARE_RATES-1; 801 } 802 if ((ind_calc >= rs->rs_lowest_valid) && 803 (ind_calc <= rs->rs_highest_valid)) 804 rte = &rs->rs_rlt[ind_calc]; 805 } else if (flags & RS_PACING_EXACT_MATCH) { 806 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 807 (rs->rs_lowest_valid <= 2)){ 808 for(i = rs->rs_lowest_valid; i < 3; i++) { 809 if (bytes_per_sec == rs->rs_rlt[i].rate) { 810 rte = &rs->rs_rlt[i]; 811 break; 812 } 813 } 814 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && 815 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { 816 /* > 1Gbps only one rate */ 817 if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) { 818 /* Its 10G wow */ 819 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 820 } 821 } else { 822 /* Ok it must be a exact meg (its between 1G and 1Meg) */ 823 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 824 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { 825 /* its an exact Mbps */ 826 ind_calc += 2; 827 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 828 /* This should not happen */ 829 ind_calc = ALL_HARDWARE_RATES-1; 830 } 831 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) 832 rte = &rs->rs_rlt[ind_calc]; 833 } 834 } 835 } else { 836 /* we want greater than the requested rate */ 837 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && 838 (rs->rs_lowest_valid <= 2)){ 839 arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */ 840 for (i=2; i>=rs->rs_lowest_valid; i--) { 841 if (bytes_per_sec < rs->rs_rlt[i].rate) { 842 rte = &rs->rs_rlt[i]; 843 break; 844 } else if ((flags & RS_PACING_GEQ) && 845 (bytes_per_sec == rs->rs_rlt[i].rate)) { 846 rte = &rs->rs_rlt[i]; 847 break; 848 } else { 849 arte = &rs->rs_rlt[i]; /* new alternate */ 850 } 851 } 852 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) { 853 if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && 854 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ 855 /* Our top rate is larger than the request */ 856 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 857 } else if ((flags & RS_PACING_GEQ) && 858 (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && 859 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { 860 /* It matches our top rate */ 861 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 862 } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) { 863 /* The top rate is an alternative */ 864 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; 865 } 866 } else { 867 /* Its in our range 1Meg - 1Gig */ 868 if (flags & RS_PACING_GEQ) { 869 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; 870 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { 871 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 872 /* This should not happen */ 873 ind_calc = (ALL_HARDWARE_RATES-1); 874 } 875 rte = &rs->rs_rlt[ind_calc]; 876 } 877 goto done; 878 } 879 ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC; 880 ind_calc += 2; 881 if (ind_calc > (ALL_HARDWARE_RATES-1)) { 882 /* This should not happen */ 883 ind_calc = ALL_HARDWARE_RATES-1; 884 } 885 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) 886 rte = &rs->rs_rlt[ind_calc]; 887 } 888 } 889 done: 890 if ((rte == NULL) && 891 (arte != NULL) && 892 (flags & RS_PACING_SUB_OK)) { 893 /* We can use the substitute */ 894 rte = arte; 895 } 896 return (rte); 897 } 898 899 static const struct tcp_hwrate_limit_table * 900 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) 901 { 902 /** 903 * Hunt the rate table with the restrictions in flags and find a 904 * suitable rate if possible. 905 * RS_PACING_EXACT_MATCH - look for an exact match to rate. 906 * RS_PACING_GT - must be greater than. 907 * RS_PACING_GEQ - must be greater than or equal. 908 * RS_PACING_LT - must be less than. 909 * RS_PACING_SUB_OK - If we don't meet criteria a 910 * substitute is ok. 911 */ 912 int i, matched; 913 struct tcp_hwrate_limit_table *rte = NULL; 914 915 916 if ((rs->rs_flags & RS_INT_TBL) && 917 (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) { 918 /* 919 * Here we don't want to paw thru 920 * a big table, we have everything 921 * from 1Meg - 1000Meg in 1Meg increments. 922 * Use an alternate method to "lookup". 923 */ 924 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags)); 925 } 926 if ((flags & RS_PACING_LT) || 927 (flags & RS_PACING_EXACT_MATCH)) { 928 /* 929 * For exact and less than we go forward through the table. 930 * This way when we find one larger we stop (exact was a 931 * toss up). 932 */ 933 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) { 934 if ((flags & RS_PACING_EXACT_MATCH) && 935 (bytes_per_sec == rs->rs_rlt[i].rate)) { 936 rte = &rs->rs_rlt[i]; 937 matched = 1; 938 break; 939 } else if ((flags & RS_PACING_LT) && 940 (bytes_per_sec <= rs->rs_rlt[i].rate)) { 941 rte = &rs->rs_rlt[i]; 942 matched = 1; 943 break; 944 } 945 if (bytes_per_sec > rs->rs_rlt[i].rate) 946 break; 947 } 948 if ((matched == 0) && 949 (flags & RS_PACING_LT) && 950 (flags & RS_PACING_SUB_OK)) { 951 /* Kick in a substitute (the lowest) */ 952 rte = &rs->rs_rlt[rs->rs_lowest_valid]; 953 } 954 } else { 955 /* 956 * Here we go backward through the table so that we can find 957 * the one greater in theory faster (but its probably a 958 * wash). 959 */ 960 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) { 961 if (rs->rs_rlt[i].rate > bytes_per_sec) { 962 /* A possible candidate */ 963 rte = &rs->rs_rlt[i]; 964 } 965 if ((flags & RS_PACING_GEQ) && 966 (bytes_per_sec == rs->rs_rlt[i].rate)) { 967 /* An exact match and we want equal */ 968 matched = 1; 969 rte = &rs->rs_rlt[i]; 970 break; 971 } else if (rte) { 972 /* 973 * Found one that is larger than but don't 974 * stop, there may be a more closer match. 975 */ 976 matched = 1; 977 } 978 if (rs->rs_rlt[i].rate < bytes_per_sec) { 979 /* 980 * We found a table entry that is smaller, 981 * stop there will be none greater or equal. 982 */ 983 break; 984 } 985 } 986 if ((matched == 0) && 987 (flags & RS_PACING_SUB_OK)) { 988 /* Kick in a substitute (the highest) */ 989 rte = &rs->rs_rlt[rs->rs_highest_valid]; 990 } 991 } 992 return (rte); 993 } 994 995 static struct ifnet * 996 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) 997 { 998 struct ifnet *tifp; 999 struct m_snd_tag *tag; 1000 union if_snd_tag_alloc_params params = { 1001 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, 1002 .rate_limit.hdr.flowid = 1, 1003 .rate_limit.max_rate = COMMON_RATE, 1004 .rate_limit.flags = M_NOWAIT, 1005 }; 1006 int err; 1007 #ifdef RSS 1008 params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ? 1009 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4); 1010 #else 1011 params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH; 1012 #endif 1013 tag = NULL; 1014 if (ifp->if_snd_tag_alloc) { 1015 if (error) 1016 *error = ENODEV; 1017 return (NULL); 1018 } 1019 err = ifp->if_snd_tag_alloc(ifp, ¶ms, &tag); 1020 if (err) { 1021 /* Failed to setup a tag? */ 1022 if (error) 1023 *error = err; 1024 return (NULL); 1025 } 1026 tifp = tag->ifp; 1027 tifp->if_snd_tag_free(tag); 1028 return (tifp); 1029 } 1030 1031 static const struct tcp_hwrate_limit_table * 1032 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec, 1033 uint32_t flags, int *error) 1034 { 1035 /* First lets find the interface if it exists */ 1036 const struct tcp_hwrate_limit_table *rte; 1037 struct tcp_rate_set *rs; 1038 struct epoch_tracker et; 1039 int err; 1040 1041 NET_EPOCH_ENTER(et); 1042 use_real_interface: 1043 CK_LIST_FOREACH(rs, &int_rs, next) { 1044 /* 1045 * Note we don't look with the lock since we either see a 1046 * new entry or will get one when we try to add it. 1047 */ 1048 if (rs->rs_flags & RS_IS_DEAD) { 1049 /* The dead are not looked at */ 1050 continue; 1051 } 1052 if ((rs->rs_ifp == ifp) && 1053 (rs->rs_if_dunit == ifp->if_dunit)) { 1054 /* Ok we found it */ 1055 break; 1056 } 1057 } 1058 if ((rs == NULL) || 1059 (rs->rs_flags & RS_INTF_NO_SUP) || 1060 (rs->rs_flags & RS_IS_DEAD)) { 1061 /* 1062 * This means we got a packet *before* 1063 * the IF-UP was processed below, <or> 1064 * while or after we already received an interface 1065 * departed event. In either case we really don't 1066 * want to do anything with pacing, in 1067 * the departing case the packet is not 1068 * going to go very far. The new case 1069 * might be arguable, but its impossible 1070 * to tell from the departing case. 1071 */ 1072 if (rs->rs_disable && error) 1073 *error = ENODEV; 1074 NET_EPOCH_EXIT(et); 1075 return (NULL); 1076 } 1077 1078 if ((rs == NULL) || (rs->rs_disable != 0)) { 1079 if (rs->rs_disable && error) 1080 *error = ENOSPC; 1081 NET_EPOCH_EXIT(et); 1082 return (NULL); 1083 } 1084 if (rs->rs_flags & RS_IS_DEFF) { 1085 /* We need to find the real interface */ 1086 struct ifnet *tifp; 1087 1088 tifp = rt_find_real_interface(ifp, inp, error); 1089 if (tifp == NULL) { 1090 if (rs->rs_disable && error) 1091 *error = ENOTSUP; 1092 NET_EPOCH_EXIT(et); 1093 return (NULL); 1094 } 1095 goto use_real_interface; 1096 } 1097 if (rs->rs_flow_limit && 1098 ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) { 1099 if (error) 1100 *error = ENOSPC; 1101 NET_EPOCH_EXIT(et); 1102 return (NULL); 1103 } 1104 rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); 1105 if (rte) { 1106 err = in_pcbattach_txrtlmt(inp, rs->rs_ifp, 1107 inp->inp_flowtype, 1108 inp->inp_flowid, 1109 rte->rate, 1110 &inp->inp_snd_tag); 1111 if (err) { 1112 /* Failed to attach */ 1113 if (error) 1114 *error = err; 1115 rte = NULL; 1116 } 1117 } 1118 if (rte) { 1119 /* 1120 * We use an atomic here for accounting so we don't have to 1121 * use locks when freeing. 1122 */ 1123 atomic_add_64(&rs->rs_flows_using, 1); 1124 } 1125 NET_EPOCH_EXIT(et); 1126 return (rte); 1127 } 1128 1129 static void 1130 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state) 1131 { 1132 int error; 1133 struct tcp_rate_set *rs; 1134 1135 if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) || 1136 (link_state != LINK_STATE_UP)) { 1137 /* 1138 * We only care on an interface going up that is rate-limit 1139 * capable. 1140 */ 1141 printf("ifp:%s.%d does not support rate-limit(0x%x) or link_state is not UP(state:%d)\n", 1142 ifp->if_dname, 1143 ifp->if_dunit, 1144 ifp->if_capabilities, 1145 link_state); 1146 return; 1147 } 1148 mtx_lock(&rs_mtx); 1149 printf("Link UP on interface %s.%d\n", 1150 ifp->if_dname, 1151 ifp->if_dunit); 1152 CK_LIST_FOREACH(rs, &int_rs, next) { 1153 if ((rs->rs_ifp == ifp) && 1154 (rs->rs_if_dunit == ifp->if_dunit)) { 1155 /* We already have initialized this guy */ 1156 printf("Interface already initialized\n"); 1157 mtx_unlock(&rs_mtx); 1158 return; 1159 } 1160 } 1161 mtx_unlock(&rs_mtx); 1162 rt_setup_new_rs(ifp, &error); 1163 } 1164 1165 static void 1166 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) 1167 { 1168 struct tcp_rate_set *rs, *nrs; 1169 struct ifnet *tifp; 1170 int i; 1171 1172 mtx_lock(&rs_mtx); 1173 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { 1174 if ((rs->rs_ifp == ifp) && 1175 (rs->rs_if_dunit == ifp->if_dunit)) { 1176 CK_LIST_REMOVE(rs, next); 1177 rs_number_alive--; 1178 rs->rs_flags |= RS_IS_DEAD; 1179 for (i = 0; i < rs->rs_rate_cnt; i++) { 1180 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 1181 tifp = rs->rs_rlt[i].tag->ifp; 1182 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag); 1183 rs->rs_rlt[i].tag = NULL; 1184 } 1185 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 1186 } 1187 if (rs->rs_flows_using == 0) 1188 rs_defer_destroy(rs); 1189 break; 1190 } 1191 } 1192 mtx_unlock(&rs_mtx); 1193 } 1194 1195 static void 1196 tcp_rl_shutdown(void *arg __unused, int howto __unused) 1197 { 1198 struct tcp_rate_set *rs, *nrs; 1199 struct ifnet *tifp; 1200 int i; 1201 1202 mtx_lock(&rs_mtx); 1203 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { 1204 CK_LIST_REMOVE(rs, next); 1205 rs_number_alive--; 1206 rs->rs_flags |= RS_IS_DEAD; 1207 for (i = 0; i < rs->rs_rate_cnt; i++) { 1208 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 1209 tifp = rs->rs_rlt[i].tag->ifp; 1210 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag); 1211 rs->rs_rlt[i].tag = NULL; 1212 } 1213 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 1214 } 1215 if (rs->rs_flows_using == 0) 1216 rs_defer_destroy(rs); 1217 } 1218 mtx_unlock(&rs_mtx); 1219 } 1220 1221 const struct tcp_hwrate_limit_table * 1222 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, 1223 uint64_t bytes_per_sec, int flags, int *error) 1224 { 1225 const struct tcp_hwrate_limit_table *rte; 1226 1227 if (tp->t_inpcb->inp_snd_tag == NULL) { 1228 /* 1229 * We are setting up a rate for the first time. 1230 */ 1231 if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) { 1232 /* Not supported by the egress */ 1233 if (error) 1234 *error = ENODEV; 1235 return (NULL); 1236 } 1237 #ifdef KERN_TLS 1238 if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 1239 /* 1240 * We currently can't do both TLS and hardware 1241 * pacing 1242 */ 1243 if (error) 1244 *error = EINVAL; 1245 return (NULL); 1246 } 1247 #endif 1248 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error); 1249 } else { 1250 /* 1251 * We are modifying a rate, wrong interface? 1252 */ 1253 if (error) 1254 *error = EINVAL; 1255 rte = NULL; 1256 } 1257 *error = 0; 1258 return (rte); 1259 } 1260 1261 const struct tcp_hwrate_limit_table * 1262 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, 1263 struct tcpcb *tp, struct ifnet *ifp, 1264 uint64_t bytes_per_sec, int flags, int *error) 1265 { 1266 const struct tcp_hwrate_limit_table *nrte; 1267 const struct tcp_rate_set *rs; 1268 int is_indirect = 0; 1269 int err; 1270 1271 1272 if ((tp->t_inpcb->inp_snd_tag == NULL) || 1273 (crte == NULL)) { 1274 /* Wrong interface */ 1275 if (error) 1276 *error = EINVAL; 1277 return (NULL); 1278 } 1279 rs = crte->ptbl; 1280 if ((rs->rs_flags & RS_IS_DEAD) || 1281 (crte->flags & HDWRPACE_IFPDEPARTED)) { 1282 /* Release the rate, and try anew */ 1283 re_rate: 1284 tcp_rel_pacing_rate(crte, tp); 1285 nrte = tcp_set_pacing_rate(tp, ifp, 1286 bytes_per_sec, flags, error); 1287 return (nrte); 1288 } 1289 if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) 1290 is_indirect = 1; 1291 else 1292 is_indirect = 0; 1293 if ((is_indirect == 0) && 1294 ((ifp != rs->rs_ifp) || 1295 (ifp->if_dunit != rs->rs_if_dunit))) { 1296 /* 1297 * Something changed, the user is not pointing to the same 1298 * ifp? Maybe a route updated on this guy? 1299 */ 1300 goto re_rate; 1301 } else if (is_indirect) { 1302 /* 1303 * For indirect we have to dig in and find the real interface. 1304 */ 1305 struct ifnet *rifp; 1306 1307 rifp = rt_find_real_interface(ifp, tp->t_inpcb, error); 1308 if (rifp == NULL) { 1309 /* Can't find it? */ 1310 goto re_rate; 1311 } 1312 if ((rifp != rs->rs_ifp) || 1313 (ifp->if_dunit != rs->rs_if_dunit)) { 1314 goto re_rate; 1315 } 1316 } 1317 nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); 1318 if (nrte == crte) { 1319 /* No change */ 1320 if (error) 1321 *error = 0; 1322 return (crte); 1323 } 1324 if (nrte == NULL) { 1325 /* Release the old rate */ 1326 tcp_rel_pacing_rate(crte, tp); 1327 return (NULL); 1328 } 1329 /* Change rates to our new entry */ 1330 err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate); 1331 if (err) { 1332 if (error) 1333 *error = err; 1334 return (NULL); 1335 } 1336 if (error) 1337 *error = 0; 1338 return (nrte); 1339 } 1340 1341 void 1342 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp) 1343 { 1344 const struct tcp_rate_set *crs; 1345 struct tcp_rate_set *rs; 1346 uint64_t pre; 1347 1348 crs = crte->ptbl; 1349 /* 1350 * Now we must break the const 1351 * in order to release our refcount. 1352 */ 1353 rs = __DECONST(struct tcp_rate_set *, crs); 1354 pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); 1355 if (pre == 1) { 1356 mtx_lock(&rs_mtx); 1357 /* 1358 * Is it dead? 1359 */ 1360 if (rs->rs_flags & RS_IS_DEAD) 1361 rs_defer_destroy(rs); 1362 mtx_unlock(&rs_mtx); 1363 } 1364 in_pcbdetach_txrtlmt(tp->t_inpcb); 1365 } 1366 1367 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ 1368 #define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */ 1369 #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */ 1370 #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */ 1371 1372 uint32_t 1373 tcp_get_pacing_mss(uint64_t bw, uint32_t segsiz, int can_use_1mss, 1374 const struct tcp_hwrate_limit_table *te) 1375 { 1376 /* 1377 * We use the google formula to calculate the 1378 * TSO size. I.E. 1379 * bw < 24Meg 1380 * tso = 2mss 1381 * else 1382 * tso = min(bw/1000, 64k) 1383 * 1384 * Note for these calculations we ignore the 1385 * packet overhead (enet hdr, ip hdr and tcp hdr). 1386 */ 1387 uint64_t lentim, res, bytes; 1388 uint32_t new_tso, min_tso_segs; 1389 1390 bytes = bw / 1000; 1391 if (bytes > (64 * 1000)) 1392 bytes = 64 * 1000; 1393 /* Round up */ 1394 new_tso = (bytes + segsiz - 1) / segsiz; 1395 if (can_use_1mss && (bw < ONE_POINT_TWO_MEG)) 1396 min_tso_segs = 1; 1397 else 1398 min_tso_segs = 2; 1399 if (new_tso < min_tso_segs) 1400 new_tso = min_tso_segs; 1401 if (new_tso > MAX_MSS_SENT) 1402 new_tso = MAX_MSS_SENT; 1403 new_tso *= segsiz; 1404 /* 1405 * If we are not doing hardware pacing 1406 * then we are done. 1407 */ 1408 if (te == NULL) 1409 return(new_tso); 1410 /* 1411 * For hardware pacing we look at the 1412 * rate you are sending at and compare 1413 * that to the rate you have in hardware. 1414 * 1415 * If the hardware rate is slower than your 1416 * software rate then you are in error and 1417 * we will build a queue in our hardware whic 1418 * is probably not desired, in such a case 1419 * just return the non-hardware TSO size. 1420 * 1421 * If the rate in hardware is faster (which 1422 * it should be) then look at how long it 1423 * takes to send one ethernet segment size at 1424 * your b/w and compare that to the time it 1425 * takes to send at the rate you had selected. 1426 * 1427 * If your time is greater (which we hope it is) 1428 * we get the delta between the two, and then 1429 * divide that into your pacing time. This tells 1430 * us how many MSS you can send down at once (rounded up). 1431 * 1432 * Note we also double this value if the b/w is over 1433 * 100Mbps. If its over 500meg we just set you to the 1434 * max (43 segments). 1435 */ 1436 if (te->rate > FIVE_HUNDRED_MBPS) 1437 return (segsiz * MAX_MSS_SENT); 1438 if (te->rate == bw) { 1439 /* We are pacing at exactly the hdwr rate */ 1440 return (segsiz * MAX_MSS_SENT); 1441 } 1442 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; 1443 res = lentim / bw; 1444 if (res > te->time_between) { 1445 uint32_t delta, segs; 1446 1447 delta = res - te->time_between; 1448 segs = (res + delta - 1)/delta; 1449 if (te->rate > ONE_HUNDRED_MBPS) 1450 segs *= 2; 1451 if (segs < min_tso_segs) 1452 segs = min_tso_segs; 1453 if (segs > MAX_MSS_SENT) 1454 segs = MAX_MSS_SENT; 1455 segs *= segsiz; 1456 if (segs < new_tso) { 1457 /* unexpected ? */ 1458 return(new_tso); 1459 } else { 1460 return (segs); 1461 } 1462 } else { 1463 /* 1464 * Your time is smaller which means 1465 * we will grow a queue on our 1466 * hardware. Send back the non-hardware 1467 * rate. 1468 */ 1469 return (new_tso); 1470 } 1471 } 1472 1473 static eventhandler_tag rl_ifnet_departs; 1474 static eventhandler_tag rl_ifnet_arrives; 1475 static eventhandler_tag rl_shutdown_start; 1476 1477 static void 1478 tcp_rs_init(void *st __unused) 1479 { 1480 CK_LIST_INIT(&int_rs); 1481 rs_number_alive = 0; 1482 rs_number_dead = 0;; 1483 mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); 1484 rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, 1485 tcp_rl_ifnet_departure, 1486 NULL, EVENTHANDLER_PRI_ANY); 1487 rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event, 1488 tcp_rl_ifnet_link, 1489 NULL, EVENTHANDLER_PRI_ANY); 1490 rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync, 1491 tcp_rl_shutdown, NULL, 1492 SHUTDOWN_PRI_FIRST); 1493 printf("TCP_ratelimit: Is now initialized\n"); 1494 } 1495 1496 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL); 1497 #endif 1498