/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2020
 * Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#define TCPSTATES		/* for logging */
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

/*
 * The following preferred table will seem weird to
 * the casual viewer. Why do we not have any rates below
 * 1Mbps? Why do we have a rate at 1.44Mbps called common?
 * Why do the rates cluster in the 1-100Mbps range more
 * than others? Why does the table jump around at the beginning
 * and then rise more consistently afterwards?
 *
 * Let me try to answer those questions. A lot of
 * this is dependent on the hardware. We have three basic
 * supporters of rate limiting
 *
 * Chelsio - Supporting 16 configurable rates.
 * Mlx  - c4 supporting 13 fixed rates.
 * Mlx  - c5 & c6 supporting 127 configurable rates.
 *
 * The c4 is why we have a common rate that is available
 * in all rate tables. This is a selected rate from the
 * c4 table and we assure it is available in all ratelimit
 * tables. This way the tcp_ratelimit code has an assured
 * rate it should always be able to get. This answers a
 * couple of the questions above.
 *
 * So what about the rest? Well the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system. The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons.
 * a) So we can discover more b/w
 * and
 * b) So we can send a block of MSS's down and then
 *    have the software timer go off after the previous
 *    send is completely out of the hardware.
 *
 * But when we do <b> we don't want the delay
 * between the last packet sent by the hardware
 * and the next software send to be excessively
 * long (to reach our desired rate).
 *
 * So let me give an example for clarity.
 *
 * Lets assume that the tcp stack sees that 29,110,000 bps is
 * what the bw of the path is. The stack would select the
 * rate 31Mbps. 31Mbps means that each send that is done
 * by the hardware will cause a 390 micro-second gap between
 * the packets sent at that rate. For 29,110,000 bps we
 * would need a 416 micro-second gap between each send.
 *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
 * My testing has shown that both cards are also using this
 * as their basis i.e. full payload size of the ethernet frame.
 * The TCP stack caller needs to be aware of this and make the
 * appropriate overhead calculations be included in its choices.
 *
 * Now, continuing our example, we pick a burst size based on the
 * delta between the two gaps (416 - 390) divided into the gap
 * we really wish to send at, rounded up. That results in a
 * send of 17 mss's at once. The hardware then will
 * run out of data in a single 17MSS send in 6,630 micro-seconds.
 *
 * On the other hand the software pacer will send more data
 * in 7,072 micro-seconds. This means that we will refill
 * the hardware 52 microseconds after it would have sent
 * next if it had not run out of data. This is a win since we are
 * only sending every 7ms or so and yet all the packets are spaced on
 * the wire with 94% of what they should be and only
 * the last packet is delayed extra to make up for the
 * difference.
 *
 * Note that the above formula has two important caveats.
 * If we are above (b/w wise) 100Mbps we double the result
 * of the MSS calculation. The second caveat is if we are 500Mbps
 * or more we just send the maximum MSS at once i.e. 43MSS. At
 * the higher b/w's even the cards have limits to what times (timer granularity)
 * they can insert between packets and start to send more than one
 * packet at a time on the wire.
 *
 */
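/*
 * To make the arithmetic above concrete, here is a minimal userland
 * sketch (not compiled into the kernel; all names are illustrative)
 * that reproduces the example numbers using the same full-frame
 * basis of 1514 bytes:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t frame_bits = 1514 * 8;		/* full ethernet frame */
	uint64_t sw_bps = 29110000;		/* estimated path b/w */
	uint64_t hw_bps = 31000000;		/* selected hardware rate */
	/* Per-frame gaps in nanoseconds at each rate. */
	uint64_t hw_gap = (frame_bits * 1000000000ULL) / hw_bps; /* ~390.7us */
	uint64_t sw_gap = (frame_bits * 1000000000ULL) / sw_bps; /* ~416.1us */
	/* Desired gap divided by the per-frame slack, rounded up. */
	uint64_t burst = (sw_gap + (sw_gap - hw_gap) - 1) / (sw_gap - hw_gap);

	printf("hw gap %juns sw gap %juns burst %ju MSS\n",
	    (uintmax_t)hw_gap, (uintmax_t)sw_gap, (uintmax_t)burst);
	/*
	 * Prints a burst of 17 MSS: the hardware drains 17 sends in
	 * 17 * ~390 = ~6,630us, while the software pacer returns
	 * after 17 * ~416 = ~7,072us.
	 */
	return (0);
}
#endif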
#define COMMON_RATE 180500
const uint64_t desired_rates[] = {
	122500,			/* 1Mbps - rate 1 */
	180500,			/* 1.44Mbps - rate 2 common rate */
	375000,			/* 3Mbps - rate 3 */
	625000,			/* 5Mbps - rate 4 */
	1250000,		/* 10Mbps - rate 5 */
	1875000,		/* 15Mbps - rate 6 */
	2500000,		/* 20Mbps - rate 7 */
	3125000,		/* 25Mbps - rate 8 */
	3750000,		/* 30Mbps - rate 9 */
	4375000,		/* 35Mbps - rate 10 */
	5000000,		/* 40Mbps - rate 11 */
	6250000,		/* 50Mbps - rate 12 */
	12500000,		/* 100Mbps - rate 13 */
	25000000,		/* 200Mbps - rate 14 */
	50000000,		/* 400Mbps - rate 15 */
	100000000,		/* 800Mbps - rate 16 */
	5625000,		/* 45Mbps - rate 17 */
	6875000,		/* 55Mbps - rate 18 */
	7500000,		/* 60Mbps - rate 19 */
	8125000,		/* 65Mbps - rate 20 */
	8750000,		/* 70Mbps - rate 21 */
	9375000,		/* 75Mbps - rate 22 */
	10000000,		/* 80Mbps - rate 23 */
	10625000,		/* 85Mbps - rate 24 */
	11250000,		/* 90Mbps - rate 25 */
	11875000,		/* 95Mbps - rate 26 */
	12500000,		/* 100Mbps - rate 27 */
	13750000,		/* 110Mbps - rate 28 */
	15000000,		/* 120Mbps - rate 29 */
	16250000,		/* 130Mbps - rate 30 */
	17500000,		/* 140Mbps - rate 31 */
	18750000,		/* 150Mbps - rate 32 */
	20000000,		/* 160Mbps - rate 33 */
	21250000,		/* 170Mbps - rate 34 */
	22500000,		/* 180Mbps - rate 35 */
	23750000,		/* 190Mbps - rate 36 */
	26250000,		/* 210Mbps - rate 37 */
	27500000,		/* 220Mbps - rate 38 */
	28750000,		/* 230Mbps - rate 39 */
	30000000,		/* 240Mbps - rate 40 */
	31250000,		/* 250Mbps - rate 41 */
	34375000,		/* 275Mbps - rate 42 */
	37500000,		/* 300Mbps - rate 43 */
	40625000,		/* 325Mbps - rate 44 */
	43750000,		/* 350Mbps - rate 45 */
	46875000,		/* 375Mbps - rate 46 */
	53125000,		/* 425Mbps - rate 47 */
	56250000,		/* 450Mbps - rate 48 */
	59375000,		/* 475Mbps - rate 49 */
	62500000,		/* 500Mbps - rate 50 */
	68750000,		/* 550Mbps - rate 51 */
	75000000,		/* 600Mbps - rate 52 */
	81250000,		/* 650Mbps - rate 53 */
	87500000,		/* 700Mbps - rate 54 */
	93750000,		/* 750Mbps - rate 55 */
	106250000,		/* 850Mbps - rate 56 */
	112500000,		/* 900Mbps - rate 57 */
	125000000,		/* 1Gbps - rate 58 */
	156250000,		/* 1.25Gbps - rate 59 */
	187500000,		/* 1.5Gbps - rate 60 */
	218750000,		/* 1.75Gbps - rate 61 */
	250000000,		/* 2Gbps - rate 62 */
	281250000,		/* 2.25Gbps - rate 63 */
	312500000,		/* 2.5Gbps - rate 64 */
	343750000,		/* 2.75Gbps - rate 65 */
	375000000,		/* 3Gbps - rate 66 */
	500000000,		/* 4Gbps - rate 67 */
	625000000,		/* 5Gbps - rate 68 */
	750000000,		/* 6Gbps - rate 69 */
	875000000,		/* 7Gbps - rate 70 */
	1000000000,		/* 8Gbps - rate 71 */
	1125000000,		/* 9Gbps - rate 72 */
	1250000000,		/* 10Gbps - rate 73 */
	1875000000,		/* 15Gbps - rate 74 */
	2500000000		/* 20Gbps - rate 75 */
};

#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number that are in order
				 * at the beginning of the table,
				 * over this a sort is required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The point in our table where
				 * we fill in a second ordered
				 * group (index wise means -1).
				 */
#define ALL_HARDWARE_RATES 1004	/*
				 * 1Meg - 1Gig in 1 Meg steps
				 * plus 100k, 200k and 500k and
				 * 10Gig
				 */
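/*
 * A hedged sketch (illustrative only, not part of the build) of the
 * index <-> rate mapping the ALL_HARDWARE_RATES table uses; compare
 * rt_setup_new_rs() and tcp_int_find_suitable_rate() below.
 */
#if 0
static uint64_t
all_rates_index_to_bytes_per_sec(int idx)
{
	if (idx == 0)
		return (12500);		/* 100kbps */
	if (idx == 1)
		return (25000);		/* 200kbps */
	if (idx == 2)
		return (62500);		/* 500kbps */
	if (idx <= 1002)		/* 1Meg..1000Meg in 1Meg steps */
		return ((uint64_t)(idx - 2) * 125000);
	return (1250000000);		/* index 1003: 10Gbps */
}
#endif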

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000

static struct head_tcp_rate_set int_rs = CK_LIST_HEAD_INITIALIZER();
static struct mtx rs_mtx;
uint32_t rs_number_alive = 0;
uint32_t rs_number_dead = 0;
static uint32_t rs_floor_mss = 0;
static uint32_t wait_time_floor = 8000;	/* 8 ms */
static uint32_t rs_hw_floor_mss = 16;
static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */

static uint32_t mss_divisor = RL_DEFAULT_DIVISOR;
static uint32_t even_num_segs = 1;
static uint32_t even_threshold = 4;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
    &rs_floor_mss, 0,
    "Number of MSS that will override the normal minimums (0 means don't enforce)");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
    &wait_time_floor, 2000,
    "As b/w increases, what is the wait floor we are willing to wait at the end?");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
    &num_of_waits_allowed, 1,
    "How many time blocks on the end should software pacing be willing to wait?");

SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
    &rs_hw_floor_mss, 16,
    "Number of mss that is the minimum for hardware pacing?");

SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, divisor, CTLFLAG_RW,
    &mss_divisor, RL_DEFAULT_DIVISOR,
    "The value divided into bytes per second to help establish mss size");
SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, even, CTLFLAG_RW,
    &even_num_segs, 1,
    "Do we round mss size up to an even number of segments for delayed ack");
SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, eventhresh, CTLFLAG_RW,
    &even_threshold, 4,
    "At what number of mss do we start rounding up to an even number of mss?");

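/*
 * These knobs surface under net.inet.tcp.rl; e.g. (illustrative
 * output only, the values depend on the running system):
 *
 *	# sysctl net.inet.tcp.rl.alive
 *	net.inet.tcp.rl.alive: 2
 */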
static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
	/*
	 * Add sysctl entries for this interface.
	 */
	if (rs->rs_flags & RS_INTF_NO_SUP) {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RD,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	} else {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RW,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	}
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "minseg", CTLFLAG_RW,
	    &rs->rs_min_seg, 0,
	    "What is the minimum we need to send on this interface?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flow_limit", CTLFLAG_RW,
	    &rs->rs_flow_limit, 0,
	    "What is the limit for number of flows (0=unlimited)?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "highest", CTLFLAG_RD,
	    &rs->rs_highest_valid, 0,
	    "Highest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "lowest", CTLFLAG_RD,
	    &rs->rs_lowest_valid, 0,
	    "Lowest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flags", CTLFLAG_RD,
	    &rs->rs_flags, 0,
	    "What flags are on the entry?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "numrates", CTLFLAG_RD,
	    &rs->rs_rate_cnt, 0,
	    "How many rates are there?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flows_using", CTLFLAG_RD,
	    &rs->rs_flows_using, 0,
	    "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
		/* Lets display the rates */
		int i;
		struct sysctl_oid *rl_rates;
		struct sysctl_oid *rl_rate_num;
		char rate_num[16];

		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO,
		    "rate",
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "Ratelist");
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			sprintf(rate_num, "%d", i);
			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rates),
			    OID_AUTO,
			    rate_num,
			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
			    "Individual Rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "flags", CTLFLAG_RD,
			    &rs->rs_rlt[i].flags, 0,
			    "Flags on this rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "pacetime", CTLFLAG_RD,
			    &rs->rs_rlt[i].time_between, 0,
			    "Time hardware inserts between 1500 byte sends");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "rate", CTLFLAG_RD,
			    &rs->rs_rlt[i].rate,
			    "Rate in bytes per second");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "using", CTLFLAG_RD,
			    &rs->rs_rlt[i].using,
			    "Number of flows using");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "enobufs", CTLFLAG_RD,
			    &rs->rs_rlt[i].rs_num_enobufs,
			    "Number of enobufs logged on this rate");
		}
	}
#endif
}

static void
rs_destroy(epoch_context_t ctx)
{
	struct tcp_rate_set *rs;
	bool do_free_rs;

	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);

	mtx_lock(&rs_mtx);
	rs->rs_flags &= ~RS_FUNERAL_SCHD;
	/*
	 * In theory it is possible (but unlikely)
	 * that while the delete was occurring
	 * and we were applying the DEAD flag
	 * someone slipped in and found the
	 * interface in a lookup. While we
	 * decided rs_flows_using was 0 and
	 * were scheduling the epoch_call, the other
	 * thread incremented rs_flows_using. This
	 * is because users have a pointer and
	 * we only use the rs_flows_using in an
	 * atomic fashion, i.e. the other entities
	 * are not protected. To assure this did
	 * not occur, we check rs_flows_using here
	 * before deleting.
	 */
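	/*
	 * Illustrative interleaving of the race described above
	 * (T1 = a lookup thread, T2 = this epoch callback):
	 *
	 *	T1: finds rs in the CK list, atomically bumps
	 *	    rs_flows_using from 0 to 1.
	 *	T2: runs here; rs_flows_using != 0, so rs must
	 *	    not be freed yet (T1 still holds the pointer).
	 */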
	do_free_rs = (rs->rs_flows_using == 0);
	rs_number_dead--;
	mtx_unlock(&rs_mtx);

	if (do_free_rs) {
		sysctl_ctx_free(&rs->sysctl_ctx);
		free(rs->rs_rlt, M_TCPPACE);
		free(rs, M_TCPPACE);
	}
}

static void
rs_defer_destroy(struct tcp_rate_set *rs)
{

	mtx_assert(&rs_mtx, MA_OWNED);

	/* Check if already pending. */
	if (rs->rs_flags & RS_FUNERAL_SCHD)
		return;

	rs_number_dead++;

	/* Set flag to only defer once. */
	rs->rs_flags |= RS_FUNERAL_SCHD;
	NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
}

#ifdef INET
extern counter_u64_t rate_limit_new;
extern counter_u64_t rate_limit_chg;
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
	int error;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = cfg_rate,
		.rate_limit.flags = M_NOWAIT,
	};

	error = m_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
	/*
	 * The internal table is "special", it
	 * is two separate ordered tables that
	 * must be merged. We get here when the
	 * adapter specifies a number of rates that
	 * covers both ranges in the table in some
	 * form.
	 */
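	/*
	 * For example, with desired_rates[] above, at_low walks the
	 * first ordered group (122500..100000000, i.e. 1Mbps-800Mbps)
	 * while at_high walks the second (5625000 up to 2500000000,
	 * i.e. 45Mbps-20Gbps); each pass below copies whichever head
	 * is smaller, yielding one ascending table.
	 */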
	int i, at_low, at_high;
	uint8_t low_disabled = 0, high_disabled = 0;

	for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
		rs->rs_rlt[i].flags = 0;
		rs->rs_rlt[i].time_between = 0;
		if ((low_disabled == 0) &&
		    (high_disabled ||
		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
			rs->rs_rlt[i].rate = rate_table_act[at_low];
			at_low++;
			if (at_low == RS_NEXT_ORDER_GROUP)
				low_disabled = 1;
		} else if (high_disabled == 0) {
			rs->rs_rlt[i].rate = rate_table_act[at_high];
			at_high++;
			if (at_high == MAX_HDWR_RATES)
				high_disabled = 1;
		}
	}
}

static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
	struct tcp_rate_set *rs;
	const uint64_t *rate_table_act;
	uint64_t lentim, res;
	size_t sz;
	uint32_t hash_type;
	int i;
	struct if_ratelimit_query_results rl;
	struct sysctl_oid *rl_sysctl_root;
	struct epoch_tracker et;
	/*
	 * We expect to enter with the
	 * mutex locked.
	 */

	if (ifp->if_ratelimit_query == NULL) {
		/*
		 * We can do nothing if we cannot
		 * get a query back from the driver.
		 */
		printf("Warning: No query functions for %s:%d -- failed\n",
		    ifp->if_dname, ifp->if_dunit);
		return (NULL);
	}
	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
	if (rs == NULL) {
		if (error)
			*error = ENOMEM;
		printf("Warning: No memory for malloc of tcp_rate_set\n");
		return (NULL);
	}
	memset(&rl, 0, sizeof(rl));
	rl.flags = RT_NOSUPPORT;
	ifp->if_ratelimit_query(ifp, &rl);
	if (rl.flags & RT_IS_UNUSABLE) {
		/*
		 * The interface does not really support
		 * the rate-limiting.
		 */
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_INTF_NO_SUP;
		rs->rs_disable = 1;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_IS_DEFF;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
		/* Mellanox C4 likely */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_highest_valid = 0;
		rs->rs_flow_limit = rl.max_flows;
		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
		rs->rs_disable = 0;
		rate_table_act = rl.rate_table;
	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
		/* Chelsio, C5 and C6 of Mellanox? */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_disable = 0;
		rs->rs_flow_limit = rl.max_flows;
		rate_table_act = desired_rates;
		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
			/*
			 * Our desired table is not big
			 * enough, do what we can.
			 */
			rs->rs_rate_cnt = MAX_HDWR_RATES;
		}
		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
			rs->rs_flags = RS_IS_INTF;
		else
			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
	} else {
		free(rs, M_TCPPACE);
		return (NULL);
	}
	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
	if (rs->rs_rlt == NULL) {
		if (error)
			*error = ENOMEM;
bail:
		free(rs, M_TCPPACE);
		return (NULL);
	}
	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
		/*
		 * The interface supports all
		 * the rates we could possibly want.
		 */
		uint64_t rat;

		rs->rs_rlt[0].rate = 12500;	/* 100k */
		rs->rs_rlt[1].rate = 25000;	/* 200k */
		rs->rs_rlt[2].rate = 62500;	/* 500k */
		/*
		 * Note 125000 == 1Megabit;
		 * populate 1Meg - 1000Meg.
		 */
		for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
			rs->rs_rlt[i].rate = rat;
			rat += 125000;
		}
		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
	} else if (rs->rs_flags & RS_INT_TBL) {
		/* We populate this in a special way */
		populate_canned_table(rs, rate_table_act);
	} else {
		/*
		 * Just copy in the rates from
		 * the table, it is in order.
		 */
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			rs->rs_rlt[i].rate = rate_table_act[i];
			rs->rs_rlt[i].time_between = 0;
			rs->rs_rlt[i].flags = 0;
		}
	}
	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
		/*
		 * We go backwards through the list so that if we can't get
		 * a rate and fail to init one, we have at least a chance of
		 * getting the highest one.
		 */
		rs->rs_rlt[i].ptbl = rs;
		rs->rs_rlt[i].tag = NULL;
		rs->rs_rlt[i].using = 0;
		rs->rs_rlt[i].rs_num_enobufs = 0;
		/*
		 * Calculate the time between.
		 */
		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
		res = lentim / rs->rs_rlt[i].rate;
		if (res > 0)
			rs->rs_rlt[i].time_between = res;
		else
			rs->rs_rlt[i].time_between = 1;
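		/*
		 * E.g. (assuming ETHERNET_SEGMENT_SIZE is the full
		 * 1514-byte frame): at 12,500,000 bytes/sec (100Mbps)
		 * res = (1514 * 1000000) / 12500000 = 121 microseconds
		 * between full-sized frames.
		 */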
		if (rs->rs_flags & RS_NO_PRE) {
			rs->rs_rlt[i].flags = HDWRPACE_INITED;
			rs->rs_lowest_valid = i;
		} else {
			int err;

			if ((rl.flags & RT_IS_SETUP_REQ) &&
			    (ifp->if_ratelimit_setup)) {
				err = ifp->if_ratelimit_setup(ifp,
				    rs->rs_rlt[i].rate, i);
				if (err)
					goto handle_err;
			}
#ifdef RSS
			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
			hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
			err = rl_attach_txrtlmt(ifp,
			    hash_type,
			    (i + 1),
			    rs->rs_rlt[i].rate,
			    &rs->rs_rlt[i].tag);
			if (err) {
handle_err:
				if (i == (rs->rs_rate_cnt - 1)) {
					/*
					 * Huh - first rate and we can't get
					 * it?
					 */
					free(rs->rs_rlt, M_TCPPACE);
					if (error)
						*error = err;
					goto bail;
				} else {
					if (error)
						*error = err;
				}
				break;
			} else {
				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
				rs->rs_lowest_valid = i;
			}
		}
	}
	/* Did we get at least 1 rate? */
	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
	else {
		free(rs->rs_rlt, M_TCPPACE);
		goto bail;
	}
	rs_number_alive++;
	sysctl_ctx_init(&rs->sysctl_ctx);
	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
	    OID_AUTO,
	    rs->rs_ifp->if_xname,
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "");
	rl_add_syctl_entries(rl_sysctl_root, rs);
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	return (rs);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
	int i;

	mbits_per_sec = (bytes_per_sec * 8);
	if (flags & RS_PACING_LT) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			/*
			 * Smaller than 1Meg, only
			 * 3 entries can match it.
			 */
			previous_rate = 0;
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
					arte = &rs->rs_rlt[i];
				}
				previous_rate = rs->rs_rlt[i].rate;
			}
			goto done;
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/*
			 * Larger than 1G (the majority of
			 * our table).
			 */
			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			else
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
			goto done;
		}
		/*
		 * If we reach here it is in our table (between 1Meg - 1000Meg):
		 * just take the rounded-down mbits per second and, if that is
		 * inexact, add 1Megabit to it; from this we can calculate
		 * the index in the table.
		 */
		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
			ind_calc++;
		/* our table is offset by 3, we add 2 */
		ind_calc += 2;
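		/*
		 * E.g. a request of 3,638,750 bytes/sec (29.11Mbps)
		 * gives ind_calc = 29 + 1 = 30, then 30 + 2 = 32, and
		 * rs_rlt[32].rate is 3,750,000 bytes/sec (30Mbps).
		 */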
		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
			/* This should not happen */
			ind_calc = ALL_HARDWARE_RATES-1;
		}
		if ((ind_calc >= rs->rs_lowest_valid) &&
		    (ind_calc <= rs->rs_highest_valid)) {
			rte = &rs->rs_rlt[ind_calc];
			if (ind_calc >= 1)
				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
		}
	} else if (flags & RS_PACING_EXACT_MATCH) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec == rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				}
			}
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/* > 1Gbps only one rate */
			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
				/* It's 10G, wow */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* Ok it must be an exact meg (it's between 1Meg and 1Gig) */
			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
				/* its an exact Mbps */
				ind_calc += 2;
				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
					/* This should not happen */
					ind_calc = ALL_HARDWARE_RATES-1;
				}
				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
					rte = &rs->rs_rlt[ind_calc];
			}
		}
	} else {
		/* we want greater than the requested rate */
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			arte = &rs->rs_rlt[3];	/* set alternate to 1Meg */
			for (i = 2; i >= rs->rs_lowest_valid; i--) {
				if (bytes_per_sec < rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else if ((flags & RS_PACING_GEQ) &&
				    (bytes_per_sec == rs->rs_rlt[i].rate)) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else {
					arte = &rs->rs_rlt[i]; /* new alternate */
				}
			}
		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* Our top rate is larger than the request */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* It matches our top rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
				/* The top rate is an alternative */
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
		} else {
			/* It's in our range 1Meg - 1Gig */
			if (flags & RS_PACING_GEQ) {
				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
						/* This should not happen */
						ind_calc = (ALL_HARDWARE_RATES-1);
					}
					rte = &rs->rs_rlt[ind_calc];
					if (ind_calc >= 1)
						previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
				}
				goto done;
			}
			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
			ind_calc += 2;
			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
				/* This should not happen */
				ind_calc = ALL_HARDWARE_RATES-1;
			}
			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
				rte = &rs->rs_rlt[ind_calc];
				if (ind_calc >= 1)
					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
			}
		}
	}
done:
	if ((rte == NULL) &&
	    (arte != NULL) &&
	    (flags & RS_PACING_SUB_OK)) {
		/* We can use the substitute */
		rte = arte;
	}
	if (lower_rate)
		*lower_rate = previous_rate;
	return (rte);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	/**
	 * Hunt the rate table with the restrictions in flags and find a
	 * suitable rate if possible.
	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
	 * RS_PACING_GT     - must be greater than.
	 * RS_PACING_GEQ    - must be greater than or equal.
	 * RS_PACING_LT     - must be less than.
	 * RS_PACING_SUB_OK - If we don't meet criteria a
	 *                    substitute is ok.
	 */
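	/*
	 * For example (hypothetical values), a caller asking for
	 * tcp_find_suitable_rate(rs, 3750000, RS_PACING_GEQ |
	 * RS_PACING_SUB_OK, NULL) gets the 30Mbps entry when it is
	 * initialized, and otherwise falls back to a substitute.
	 */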
	int i, matched;
	struct tcp_hwrate_limit_table *rte = NULL;
	uint64_t previous_rate = 0;

	if ((rs->rs_flags & RS_INT_TBL) &&
	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
		/*
		 * Here we don't want to paw through
		 * a big table, we have everything
		 * from 1Meg - 1000Meg in 1Meg increments.
		 * Use an alternate method to "lookup".
		 */
		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
	}
	if ((flags & RS_PACING_LT) ||
	    (flags & RS_PACING_EXACT_MATCH)) {
		/*
		 * For exact and less than we go forward through the table.
		 * This way when we find one larger we stop (exact was a
		 * toss up).
		 */
		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
			if ((flags & RS_PACING_EXACT_MATCH) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			} else if ((flags & RS_PACING_LT) &&
			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			}
			previous_rate = rs->rs_rlt[i].rate;
			if (bytes_per_sec > rs->rs_rlt[i].rate)
				break;
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_LT) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the lowest) */
			rte = &rs->rs_rlt[rs->rs_lowest_valid];
		}
	} else {
		/*
		 * Here we go backward through the table so that we can find
		 * the one greater in theory faster (but it's probably a
		 * wash).
		 */
		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
			if (rs->rs_rlt[i].rate > bytes_per_sec) {
				/* A possible candidate */
				rte = &rs->rs_rlt[i];
			}
			if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				/* An exact match and we want equal */
				matched = 1;
				rte = &rs->rs_rlt[i];
				break;
			} else if (rte) {
				/*
				 * Found one that is larger than but don't
				 * stop, there may be a closer match.
				 */
				matched = 1;
			}
			if (rs->rs_rlt[i].rate < bytes_per_sec) {
				/*
				 * We found a table entry that is smaller,
				 * stop; there will be none greater or equal.
				 */
				if (lower_rate != NULL)
					*lower_rate = rs->rs_rlt[i].rate;
				break;
			}
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the highest) */
			rte = &rs->rs_rlt[rs->rs_highest_valid];
		}
	}
	return (rte);
}

static struct ifnet *
rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
{
	struct ifnet *tifp;
	struct m_snd_tag *tag, *ntag;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = inp->inp_flowid,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = COMMON_RATE,
		.rate_limit.flags = M_NOWAIT,
	};
	int err;
#ifdef RSS
	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
#else
	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
#endif
	err = m_snd_tag_alloc(ifp, &params, &tag);
	if (err) {
		/* Failed to setup a tag? */
		if (error)
			*error = err;
		return (NULL);
	}
	ntag = tag;
	while (ntag->sw->next_snd_tag != NULL) {
		ntag = ntag->sw->next_snd_tag(ntag);
	}
	tifp = ntag->ifp;
	m_snd_tag_rele(tag);
	return (tifp);
}

static void
rl_increment_using(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_add_long(&decon_rte->using, 1);
}

static void
rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_subtract_long(&decon_rte->using, 1);
}

void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_add_long(&decon_rte->rs_num_enobufs, 1);
}

/*
 * Do NOT take the __noinline out of the
 * find_rs_for_ifp() function. If you inline
 * it into rt_setup_rate() you will trip a
 * compiler bug. For some reason the compiler thinks
 * the list can never be empty. The consequence of
 * this will be a crash when we dereference NULL
 * if an ifp is removed just as a hw rate limit
 * is attempted. If you are working on the compiler
 * and want to "test" this go ahead and take the noinline
 * out; otherwise let sleeping dogs lie until such time
 * as we get a compiler fix 10/2/20 -- RRS
 */
static __noinline struct tcp_rate_set *
find_rs_for_ifp(struct ifnet *ifp)
{
	struct tcp_rate_set *rs;

	CK_LIST_FOREACH(rs, &int_rs, next) {
		if ((rs->rs_ifp == ifp) &&
		    (rs->rs_if_dunit == ifp->if_dunit)) {
			/* Ok we found it */
			return (rs);
		}
	}
	return (NULL);
}

static const struct tcp_hwrate_limit_table *
rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
    uint32_t flags, int *error, uint64_t *lower_rate)
{
	/* First lets find the interface if it exists */
	const struct tcp_hwrate_limit_table *rte;
	/*
	 * So why is rs volatile? This is to defeat a
	 * compiler bug where the compiler is convinced
	 * that rs can never be NULL (which is not true). Because
	 * of its conviction it nicely optimizes out the
	 * if ((rs == NULL) ...) check below, which means if you
	 * get a NULL back you dereference it.
	 */
	volatile struct tcp_rate_set *rs;
	struct epoch_tracker et;
	struct ifnet *oifp = ifp;
	int err;

	NET_EPOCH_ENTER(et);
use_real_interface:
	rs = find_rs_for_ifp(ifp);
	if ((rs == NULL) ||
	    (rs->rs_flags & RS_INTF_NO_SUP) ||
	    (rs->rs_flags & RS_IS_DEAD)) {
		/*
		 * This means we got a packet *before*
		 * the IF-UP was processed below, <or>
		 * while or after we already received an interface
		 * departed event. In either case we really don't
		 * want to do anything with pacing, in
		 * the departing case the packet is not
		 * going to go very far. The new case
		 * might be arguable, but it's impossible
		 * to tell from the departing case.
		 */
		if (error)
			*error = ENODEV;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}

	if ((rs == NULL) || (rs->rs_disable != 0)) {
		if (error)
			*error = ENOSPC;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}
	if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, error);
		if (tifp == NULL) {
			if (rs->rs_disable && error)
				*error = ENOTSUP;
			NET_EPOCH_EXIT(et);
			return (NULL);
		}
		KASSERT((tifp != ifp),
		    ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
		     ifp, inp, tifp));
		ifp = tifp;
		goto use_real_interface;
	}
	if (rs->rs_flow_limit &&
	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
		if (error)
			*error = ENOSPC;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}
	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
	if (rte) {
		err = in_pcbattach_txrtlmt(inp, oifp,
		    inp->inp_flowtype,
		    inp->inp_flowid,
		    rte->rate,
		    &inp->inp_snd_tag);
		if (err) {
			/* Failed to attach */
			if (error)
				*error = err;
			rte = NULL;
		} else {
			KASSERT((inp->inp_snd_tag != NULL),
			    ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
			     inp, rte, (unsigned long long)rte->rate, rs));
#ifdef INET
			counter_u64_add(rate_limit_new, 1);
#endif
		}
	}
	if (rte) {
		/*
		 * We use an atomic here for accounting so we don't have to
		 * use locks when freeing.
		 */
		atomic_add_64(&rs->rs_flows_using, 1);
	}
	NET_EPOCH_EXIT(et);
	return (rte);
}

static void
tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
{
	int error;
	struct tcp_rate_set *rs;
	struct epoch_tracker et;

	if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
	    (link_state != LINK_STATE_UP)) {
		/*
		 * We only care on an interface going up that is rate-limit
		 * capable.
		 */
		return;
	}
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	rs = find_rs_for_ifp(ifp);
	if (rs) {
		/* We already have initialized this guy */
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return;
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	rt_setup_new_rs(ifp, &error);
}

static void
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
	struct tcp_rate_set *rs;
	struct epoch_tracker et;
	int i;

	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	rs = find_rs_for_ifp(ifp);
	if (rs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				in_pcbdetach_tag(rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0)
			rs_defer_destroy(rs);
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
}

void
tcp_rl_release_ifnet(struct ifnet *ifp)
{
	tcp_rl_ifnet_departure(NULL, ifp);
}

static void
tcp_rl_shutdown(void *arg __unused, int howto __unused)
{
	struct tcp_rate_set *rs, *nrs;
	struct epoch_tracker et;
	int i;

	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				in_pcbdetach_tag(rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0)
			rs_defer_destroy(rs);
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
}

const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	struct inpcb *inp = tptoinpcb(tp);
	const struct tcp_hwrate_limit_table *rte;
#ifdef KERN_TLS
	struct ktls_session *tls;
#endif

	INP_WLOCK_ASSERT(inp);

	if (inp->inp_snd_tag == NULL) {
		/*
		 * We are setting up a rate for the first time.
		 */
		if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
			/* Not supported by the egress */
			if (error)
				*error = ENODEV;
			return (NULL);
		}
#ifdef KERN_TLS
		tls = NULL;
		if (tp->t_nic_ktls_xmit != 0) {
			tls = tptosocket(tp)->so_snd.sb_tls_info;

			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
			    tls->mode != TCP_TLS_MODE_IFNET) {
				if (error)
					*error = ENODEV;
				return (NULL);
			}
		}
#endif
		rte = rt_setup_rate(inp, ifp, bytes_per_sec, flags, error, lower_rate);
		if (rte)
			rl_increment_using(rte);
#ifdef KERN_TLS
		if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
			/*
			 * Fake a route change error to reset the TLS
			 * send tag. This will convert the existing
			 * tag to a TLS ratelimit tag.
			 */
			MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
			ktls_output_eagain(inp, tls);
		}
#endif
	} else {
		/*
		 * We are modifying a rate, wrong interface?
		 */
		if (error)
			*error = EINVAL;
		rte = NULL;
	}
	if (rte != NULL) {
		tp->t_pacing_rate = rte->rate;
		if (error)
			*error = 0;
	}
	return (rte);
}

const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	struct inpcb *inp = tptoinpcb(tp);
	const struct tcp_hwrate_limit_table *nrte;
	const struct tcp_rate_set *rs;
#ifdef KERN_TLS
	struct ktls_session *tls = NULL;
#endif
	int err;

	INP_WLOCK_ASSERT(inp);

	if (crte == NULL) {
		/* Wrong interface */
		if (error)
			*error = EINVAL;
		return (NULL);
	}

#ifdef KERN_TLS
	if (tp->t_nic_ktls_xmit) {
		tls = tptosocket(tp)->so_snd.sb_tls_info;
		if (tls->mode != TCP_TLS_MODE_IFNET)
			tls = NULL;
		else if (tls->snd_tag != NULL &&
		    tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
			if (!tls->reset_pending) {
				/*
				 * NIC probably doesn't support
				 * ratelimit TLS tags if it didn't
				 * allocate one when an existing rate
				 * was present, so ignore.
				 */
				tcp_rel_pacing_rate(crte, tp);
				if (error)
					*error = EOPNOTSUPP;
				return (NULL);
			}

			/*
			 * The send tag is being converted, so set the
			 * rate limit on the inpcb tag. There is a
			 * race that the new NIC send tag might use
			 * the current rate instead of this one.
			 */
			tls = NULL;
		}
	}
#endif
	if (inp->inp_snd_tag == NULL) {
		/* Wrong interface */
		tcp_rel_pacing_rate(crte, tp);
		if (error)
			*error = EINVAL;
		return (NULL);
	}
	rs = crte->ptbl;
	if ((rs->rs_flags & RS_IS_DEAD) ||
	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
		/* Release the rate, and try anew */

		tcp_rel_pacing_rate(crte, tp);
		nrte = tcp_set_pacing_rate(tp, ifp,
		    bytes_per_sec, flags, error, lower_rate);
		return (nrte);
	}
	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
	if (nrte == crte) {
		/* No change */
		if (error)
			*error = 0;
		return (crte);
	}
	if (nrte == NULL) {
		/* Release the old rate */
		if (error)
			*error = ENOENT;
		tcp_rel_pacing_rate(crte, tp);
		return (NULL);
	}
	rl_decrement_using(crte);
	rl_increment_using(nrte);
	/* Change rates to our new entry */
#ifdef KERN_TLS
	if (tls != NULL)
		err = ktls_modify_txrtlmt(tls, nrte->rate);
	else
#endif
		err = in_pcbmodify_txrtlmt(inp, nrte->rate);
	if (err) {
		struct tcp_rate_set *lrs;
		uint64_t pre;

		rl_decrement_using(nrte);
		lrs = __DECONST(struct tcp_rate_set *, rs);
		pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
		/* Do we still have a snd-tag attached? */
		if (inp->inp_snd_tag)
			in_pcbdetach_txrtlmt(inp);

		if (pre == 1) {
			struct epoch_tracker et;

			NET_EPOCH_ENTER(et);
			mtx_lock(&rs_mtx);
			/*
			 * Is it dead?
			 */
			if (lrs->rs_flags & RS_IS_DEAD)
				rs_defer_destroy(lrs);
			mtx_unlock(&rs_mtx);
			NET_EPOCH_EXIT(et);
		}
		if (error)
			*error = err;
		return (NULL);
	} else {
#ifdef INET
		counter_u64_add(rate_limit_chg, 1);
#endif
	}
	if (error)
		*error = 0;
	tp->t_pacing_rate = nrte->rate;
	return (nrte);
}

void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	const struct tcp_rate_set *crs;
	struct tcp_rate_set *rs;
	uint64_t pre;

	INP_WLOCK_ASSERT(inp);

	tp->t_pacing_rate = -1;
	crs = crte->ptbl;
	/*
	 * Now we must break the const
	 * in order to release our refcount.
	 */
	rs = __DECONST(struct tcp_rate_set *, crs);
	rl_decrement_using(crte);
	pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
	if (pre == 1) {
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		/*
		 * Is it dead?
		 */
		if (rs->rs_flags & RS_IS_DEAD)
			rs_defer_destroy(rs);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
	}

	/*
	 * XXX: If this connection is using ifnet TLS, should we
	 * switch it to using an unlimited rate, or perhaps use
	 * ktls_output_eagain() to reset the send tag to a plain
	 * TLS tag?
	 */
	in_pcbdetach_txrtlmt(inp);
}

#define ONE_POINT_TWO_MEG 150000	/* 1.2Mbps in bytes per second */
#define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
#define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
#define MAX_MSS_SENT 43			/* 43 mss = 43 x 1500 = 64,500 bytes */

static void
tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
    uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
    uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
{
	if (tcp_bblogging_on(tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = segsiz;
		log.u_bbr.flex2 = new_tso;
		log.u_bbr.flex3 = time_between;
		log.u_bbr.flex4 = calc_time_between;
		log.u_bbr.flex5 = segs;
		log.u_bbr.flex6 = res_div;
		log.u_bbr.flex7 = mult;
		log.u_bbr.flex8 = mod;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.cur_del_rate = bw;
		log.u_bbr.delRate = hw_rate;
		TCP_LOG_EVENTP(tp, NULL,
		    &tptosocket(tp)->so_rcv,
		    &tptosocket(tp)->so_snd,
		    TCP_HDWR_PACE_SIZE, 0,
		    0, &log, false, &tv);
	}
}

uint32_t
tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err, int divisor)
{
	/*
	 * We use the google formula to calculate the
	 * TSO size. I.E.
	 * bw < 24Meg
	 *   tso = 2mss
	 * else
	 *   tso = min(bw/(div=1000), 64k)
	 *
	 * Note for these calculations we ignore the
	 * packet overhead (enet hdr, ip hdr and tcp hdr).
	 * We only get the google formula when we have
	 * divisor = 1000, which is the default for now.
	 */
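	/*
	 * Worked example (hypothetical values): bw = 12,500,000
	 * bytes/sec (100Mbps) with divisor 1000 gives bytes = 12500;
	 * with segsiz = 1448 that is ceil(12500/1448) = 9 segments,
	 * bumped to 10 when even_num_segs is set (9 is odd and above
	 * even_threshold), i.e. a 14,480-byte TSO burst.
	 */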
	uint64_t lentim, res, bytes;
	uint32_t new_tso, min_tso_segs;

	/* It can't be zero */
	if ((divisor == 0) ||
	    (divisor < RL_MIN_DIVISOR)) {
		if (mss_divisor)
			bytes = bw / mss_divisor;
		else
			bytes = bw / 1000;
	} else
		bytes = bw / divisor;
	/* We can't ever send more than 65k in a TSO */
	if (bytes > 0xffff) {
		bytes = 0xffff;
	}
	/* Round up */
	new_tso = (bytes + segsiz - 1) / segsiz;
	/* Are we enforcing even boundaries? */
	if (even_num_segs && (new_tso & 1) && (new_tso > even_threshold))
		new_tso++;
	if (can_use_1mss)
		min_tso_segs = 1;
	else
		min_tso_segs = 2;
	if (rs_floor_mss && (new_tso < rs_floor_mss))
		new_tso = rs_floor_mss;
	else if (new_tso < min_tso_segs)
		new_tso = min_tso_segs;
	if (new_tso > MAX_MSS_SENT)
		new_tso = MAX_MSS_SENT;
	new_tso *= segsiz;
	tcp_log_pacing_size(tp, bw, segsiz, new_tso,
	    0, 0, 0, 0, 0, 0, 1);
	/*
	 * If we are not doing hardware pacing
	 * then we are done.
	 */
	if (te == NULL) {
		if (err)
			*err = 0;
		return (new_tso);
	}
	/*
	 * For hardware pacing we look at the
	 * rate you are sending at and compare
	 * that to the rate you have in hardware.
	 *
	 * If the hardware rate is slower than your
	 * software rate then you are in error and
	 * we will build a queue in our hardware which
	 * is probably not desired, in such a case
	 * just return the non-hardware TSO size.
	 *
	 * If the rate in hardware is faster (which
	 * it should be) then look at how long it
	 * takes to send one ethernet segment size at
	 * your b/w and compare that to the time it
	 * takes to send at the rate you had selected.
	 *
	 * If your time is greater (which we hope it is)
	 * we get the delta between the two, and then
	 * divide that into your pacing time. This tells
	 * us how many MSS you can send down at once (rounded up).
	 *
	 * Note we also double this value if the b/w is over
	 * 100Mbps. If it's over 500Mbps we just set you to the
	 * max (43 segments).
	 */
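	/*
	 * Worked example (hypothetical values): bw = 6,250,000
	 * bytes/sec (50Mbps) gives res = (1514 * 1000000) / 6250000
	 * = 242us per frame; a 62.5Mbps hardware rate has
	 * time_between = 193us, so delta = 49us. With the default
	 * num_of_waits_allowed (1) and wait_time_floor (8000us),
	 * res_div = 242 + 8000 = 8242 and segs = ceil(8242/49) = 169,
	 * which is then clamped to MAX_MSS_SENT (43) below.
	 */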
	if (te->rate > FIVE_HUNDRED_MBPS)
		goto max;
	if (te->rate == bw) {
		/* We are pacing at exactly the hdwr rate */
max:
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)0,
		    (segsiz * MAX_MSS_SENT), 0, 0, 3);
		return (segsiz * MAX_MSS_SENT);
	}
	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
	res = lentim / bw;
	if (res > te->time_between) {
		uint32_t delta, segs, res_div;

		res_div = ((res * num_of_waits_allowed) + wait_time_floor);
		delta = res - te->time_between;
		segs = (res_div + delta - 1)/delta;
		if (segs < min_tso_segs)
			segs = min_tso_segs;
		if (segs < rs_hw_floor_mss)
			segs = rs_hw_floor_mss;
		if (segs > MAX_MSS_SENT)
			segs = MAX_MSS_SENT;
		segs *= segsiz;
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)res,
		    segs, res_div, 1, 3);
		if (err)
			*err = 0;
		if (segs < new_tso) {
			/* unexpected ? */
			return (new_tso);
		} else {
			return (segs);
		}
	} else {
		/*
		 * Your time is smaller which means
		 * we will grow a queue on our
		 * hardware. Send back the non-hardware
		 * rate.
		 */
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
		    te->rate, te->time_between, (uint32_t)res,
		    0, 0, 0, 4);
		if (err)
			*err = -1;
		return (new_tso);
	}
}

uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
{
	struct epoch_tracker et;
	struct tcp_rate_set *rs;
	uint64_t rate_ret;

	NET_EPOCH_ENTER(et);
use_next_interface:
	rs = find_rs_for_ifp(ifp);
	if (rs == NULL) {
		/* This interface does not do ratelimiting */
		rate_ret = 0;
	} else if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, NULL);
		if (tifp == NULL) {
			NET_EPOCH_EXIT(et);
			return (0);
		}
		ifp = tifp;
		goto use_next_interface;
	} else {
		/* Lets return the highest rate this guy has */
		rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
	}
	NET_EPOCH_EXIT(et);
	return (rate_ret);
}

static eventhandler_tag rl_ifnet_departs;
static eventhandler_tag rl_ifnet_arrives;
static eventhandler_tag rl_shutdown_start;

static void
tcp_rs_init(void *st __unused)
{
	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    tcp_rl_ifnet_departure,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
	    tcp_rl_ifnet_link,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    tcp_rl_shutdown, NULL,
	    SHUTDOWN_PRI_FIRST);
	printf("TCP_ratelimit: Is now initialized\n");
}

SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
#endif