/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2020
 * Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#define TCPSTATES /* for logging */
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

/*
 * The following preferred table will seem weird to
 * the casual viewer. Why do we not have any rates below
 * 1Mbps? Why do we have a rate at 1.44Mbps called common?
 * Why do the rates cluster in the 1-100Mbps range more
 * than others? Why does the table jump around at the beginning
 * and then rise more consistently?
 *
 * Let me try to answer those questions. A lot of
 * this is dependent on the hardware. We have three basic
 * supporters of rate limiting:
 *
 * Chelsio - Supporting 16 configurable rates.
 * Mlx  - c4 supporting 13 fixed rates.
 * Mlx  - c5 & c6 supporting 127 configurable rates.
 *
 * The c4 is why we have a common rate that is available
 * in all rate tables. This is a selected rate from the
 * c4 table and we ensure it is available in all ratelimit
 * tables. This way the tcp_ratelimit code has an assured
 * rate it should always be able to get. This answers a
 * couple of the questions above.
 *
 * So what about the rest? Well, the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system. The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons:
 * a) So we can discover more b/w
 * and
 * b) So we can send a block of MSS's down and then
 * have the software timer go off after the previous
 * send is completely out of the hardware.
 *
 * But when we do (b) we don't want the delay after
 * the last packet sent by the hardware to be
 * excessively long (to reach our desired rate).
 *
 * So let me give an example for clarity.
 *
 * Let's assume that the tcp stack sees that 29,110,000 bps is
 * what the b/w of the path is. The stack would select the
 * rate 31Mbps. 31Mbps means that each send that is done
 * by the hardware will cause a 390 micro-second gap between
 * the packets sent at that rate. For 29,110,000 bps we
 * would need a 416 micro-second gap between each send.
 *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
 * My testing has shown that both cards are also using this
 * as their basis i.e. full payload size of the ethernet frame.
 * The TCP stack caller needs to be aware of this and make sure
 * the appropriate overhead calculations are included in its choices.
 *
 * Now, continuing our example, we pick a MSS burst size based on
 * the delta between the two gaps (416 - 390) divided into the gap
 * we really wish to send at, rounded up. That results in a
 * send of 17 MSS's at once. The hardware then will
 * run out of data in a single 17MSS send in 6,630 micro-seconds.
 *
 * On the other hand the software pacer will send more data
 * in 7,072 micro-seconds. This means that we will refill
 * the hardware 52 microseconds after it would have sent
 * next if it had not run out of data. This is a win since we are
 * only sending every 7ms or so and yet all the packets are spaced on
 * the wire with 94% of what they should be and only
 * the last packet is delayed extra to make up for the
 * difference.
 *
 * Note that the above formula has two important caveats.
 * If we are (b/w wise) above 100Mbps we double the result
 * of the MSS calculation. The second caveat is if we are 500Mbps
 * or more we just send the maximum MSS at once i.e. 45MSS. At
 * the higher b/w's even the cards have limits to what times (timer granularity)
 * they can insert between packets and start to send more than one
 * packet at a time on the wire.
 *
 */
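/*
 * To make the arithmetic of the example above concrete (purely an
 * illustration, using the full 1514 byte frame assumed throughout
 * this file):
 *
 *   gap at 31Mbps:     (1514 * 8) / 31,000,000  ~= 390.7 usec
 *   gap at 29.11Mbps:  (1514 * 8) / 29,110,000  ~= 416.1 usec
 *   burst size:        416.1 / (416.1 - 390.7)  ~= 16.4, round up to 17 MSS
 *   hardware drains 17 MSS in roughly 17 * 390 = 6,630 usec
 *   software refills after roughly  17 * 416 = 7,072 usec
 *
 * The hardware would have sent its 18th frame at about 6,630 + 390 =
 * 7,020 usec, so the refill arrives roughly 52 usec "late", which is
 * the small end-of-burst delay described above.
 */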
#define COMMON_RATE 180500
const uint64_t desired_rates[] = {
	122500,		/* 1Mbps - rate 1 */
	180500,		/* 1.44Mbps - rate 2 common rate */
	375000,		/* 3Mbps - rate 3 */
	625000,		/* 5Mbps - rate 4 */
	1250000,	/* 10Mbps - rate 5 */
	1875000,	/* 15Mbps - rate 6 */
	2500000,	/* 20Mbps - rate 7 */
	3125000,	/* 25Mbps - rate 8 */
	3750000,	/* 30Mbps - rate 9 */
	4375000,	/* 35Mbps - rate 10 */
	5000000,	/* 40Mbps - rate 11 */
	6250000,	/* 50Mbps - rate 12 */
	12500000,	/* 100Mbps - rate 13 */
	25000000,	/* 200Mbps - rate 14 */
	50000000,	/* 400Mbps - rate 15 */
	100000000,	/* 800Mbps - rate 16 */
	5625000,	/* 45Mbps - rate 17 */
	6875000,	/* 55Mbps - rate 19 */
	7500000,	/* 60Mbps - rate 20 */
	8125000,	/* 65Mbps - rate 21 */
	8750000,	/* 70Mbps - rate 22 */
	9375000,	/* 75Mbps - rate 23 */
	10000000,	/* 80Mbps - rate 24 */
	10625000,	/* 85Mbps - rate 25 */
	11250000,	/* 90Mbps - rate 26 */
	11875000,	/* 95Mbps - rate 27 */
	12500000,	/* 100Mbps - rate 28 */
	13750000,	/* 110Mbps - rate 29 */
	15000000,	/* 120Mbps - rate 30 */
	16250000,	/* 130Mbps - rate 31 */
	17500000,	/* 140Mbps - rate 32 */
	18750000,	/* 150Mbps - rate 33 */
	20000000,	/* 160Mbps - rate 34 */
	21250000,	/* 170Mbps - rate 35 */
	22500000,	/* 180Mbps - rate 36 */
	23750000,	/* 190Mbps - rate 37 */
	26250000,	/* 210Mbps - rate 38 */
	27500000,	/* 220Mbps - rate 39 */
	28750000,	/* 230Mbps - rate 40 */
	30000000,	/* 240Mbps - rate 41 */
	31250000,	/* 250Mbps - rate 42 */
	34375000,	/* 275Mbps - rate 43 */
	37500000,	/* 300Mbps - rate 44 */
	40625000,	/* 325Mbps - rate 45 */
	43750000,	/* 350Mbps - rate 46 */
	46875000,	/* 375Mbps - rate 47 */
	53125000,	/* 425Mbps - rate 48 */
	56250000,	/* 450Mbps - rate 49 */
	59375000,	/* 475Mbps - rate 50 */
	62500000,	/* 500Mbps - rate 51 */
	68750000,	/* 550Mbps - rate 52 */
	75000000,	/* 600Mbps - rate 53 */
	81250000,	/* 650Mbps - rate 54 */
	87500000,	/* 700Mbps - rate 55 */
	93750000,	/* 750Mbps - rate 56 */
	106250000,	/* 850Mbps - rate 57 */
	112500000,	/* 900Mbps - rate 58 */
	125000000,	/* 1Gbps - rate 59 */
	156250000,	/* 1.25Gbps - rate 60 */
	187500000,	/* 1.5Gbps - rate 61 */
	218750000,	/* 1.75Gbps - rate 62 */
	250000000,	/* 2Gbps - rate 63 */
	281250000,	/* 2.25Gbps - rate 64 */
	312500000,	/* 2.5Gbps - rate 65 */
	343750000,	/* 2.75Gbps - rate 66 */
	375000000,	/* 3Gbps - rate 67 */
	500000000,	/* 4Gbps - rate 68 */
	625000000,	/* 5Gbps - rate 69 */
	750000000,	/* 6Gbps - rate 70 */
	875000000,	/* 7Gbps - rate 71 */
	1000000000,	/* 8Gbps - rate 72 */
	1125000000,	/* 9Gbps - rate 73 */
	1250000000,	/* 10Gbps - rate 74 */
	1875000000,	/* 15Gbps - rate 75 */
	2500000000	/* 20Gbps - rate 76 */
};

#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number that are in order
				 * at the beginning of the table,
				 * over this a sort is required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The point in our table where
				 * we fill in a second ordered
				 * group (index wise means -1).
				 */
#define ALL_HARDWARE_RATES 1004 /*
				 * 1Meg - 1Gig in 1 Meg steps
				 * plus 100k, 200k and 500k and
				 * 10Gig
				 */
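/*
 * The 1004 count breaks down as follows (see the table fill in
 * rt_setup_new_rs()): indices 0-2 hold 100k, 200k and 500k, indices
 * 3-1002 hold 1Meg through 1000Meg in 1Meg steps, and index 1003
 * holds the single 10Gig entry.
 */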

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000

static struct head_tcp_rate_set int_rs;
static struct mtx rs_mtx;
uint32_t rs_number_alive;
uint32_t rs_number_dead;
static uint32_t rs_floor_mss = 0;
static uint32_t wait_time_floor = 8000;	/* 8 ms */
static uint32_t rs_hw_floor_mss = 16;
static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */

static uint32_t mss_divisor = RL_DEFAULT_DIVISOR;
static uint32_t even_num_segs = 1;
static uint32_t even_threshold = 4;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
    &rs_floor_mss, 0,
    "Number of MSS that will override the normal minimums (0 means don't enforce)");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
    &wait_time_floor, 2000,
    "As b/w increases, what is the wait floor we are willing to wait at the end?");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
    &num_of_waits_allowed, 1,
    "How many time blocks on the end should software pacing be willing to wait?");

SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
    &rs_hw_floor_mss, 16,
    "Number of MSS that is a minimum for hardware pacing");

SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, divisor, CTLFLAG_RW,
    &mss_divisor, RL_DEFAULT_DIVISOR,
    "The value divided into bytes per second to help establish mss size");
SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, even, CTLFLAG_RW,
    &even_num_segs, 1,
    "Do we round mss size up to an even number of segments for delayed ack");
SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, eventhresh, CTLFLAG_RW,
    &even_threshold, 4,
    "At what number of mss do we start rounding up to an even number of mss?");
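/*
 * The knobs above all live under net.inet.tcp.rl and can be inspected
 * or tuned with sysctl(8); for example (illustrative invocations only,
 * the value shown is just a placeholder):
 *
 *   sysctl net.inet.tcp.rl.alive
 *   sysctl net.inet.tcp.rl.divisor=20
 *
 * Per-interface nodes (added by rl_add_syctl_entries() below) appear
 * as net.inet.tcp.rl.<ifname>.*.
 */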

static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
	/*
	 * Add sysctl entries for this interface.
	 */
	if (rs->rs_flags & RS_INTF_NO_SUP) {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RD,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	} else {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RW,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	}
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "minseg", CTLFLAG_RW,
	    &rs->rs_min_seg, 0,
	    "What is the minimum we need to send on this interface?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flow_limit", CTLFLAG_RW,
	    &rs->rs_flow_limit, 0,
	    "What is the limit for number of flows (0=unlimited)?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "highest", CTLFLAG_RD,
	    &rs->rs_highest_valid, 0,
	    "Highest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "lowest", CTLFLAG_RD,
	    &rs->rs_lowest_valid, 0,
	    "Lowest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flags", CTLFLAG_RD,
	    &rs->rs_flags, 0,
	    "What flags are on the entry?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "numrates", CTLFLAG_RD,
	    &rs->rs_rate_cnt, 0,
	    "How many rates are there?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flows_using", CTLFLAG_RD,
	    &rs->rs_flows_using, 0,
	    "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
		/* Let's display the rates */
		int i;
		struct sysctl_oid *rl_rates;
		struct sysctl_oid *rl_rate_num;
		char rate_num[16];
		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO,
		    "rate",
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "Ratelist");
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			sprintf(rate_num, "%d", i);
			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rates),
			    OID_AUTO,
			    rate_num,
			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
			    "Individual Rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "flags", CTLFLAG_RD,
			    &rs->rs_rlt[i].flags, 0,
			    "Flags on this rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "pacetime", CTLFLAG_RD,
			    &rs->rs_rlt[i].time_between, 0,
			    "Time hardware inserts between 1500 byte sends");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "rate", CTLFLAG_RD,
			    &rs->rs_rlt[i].rate,
			    "Rate in bytes per second");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "using", CTLFLAG_RD,
			    &rs->rs_rlt[i].using,
			    "Number of flows using");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "enobufs", CTLFLAG_RD,
			    &rs->rs_rlt[i].rs_num_enobufs,
			    "Number of enobufs logged on this rate");

		}
	}
#endif
}

static void
rs_destroy(epoch_context_t ctx)
{
	struct tcp_rate_set *rs;
	bool do_free_rs;

	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);

	mtx_lock(&rs_mtx);
	rs->rs_flags &= ~RS_FUNERAL_SCHD;
	/*
	 * In theory it is possible (but unlikely)
	 * that while the delete was occurring
	 * and we were applying the DEAD flag
	 * someone slipped in and found the
	 * interface in a lookup. While we
	 * decided rs_flows_using was 0 and
	 * were scheduling the epoch_call, the other
	 * thread incremented rs_flows_using. This
	 * is because users have a pointer and
	 * we only use the rs_flows_using in an
	 * atomic fashion, i.e. the other entities
	 * are not protected. To ensure this did
	 * not occur, we check rs_flows_using here
	 * before deleting.
	 */
	do_free_rs = (rs->rs_flows_using == 0);
	rs_number_dead--;
	mtx_unlock(&rs_mtx);

	if (do_free_rs) {
		sysctl_ctx_free(&rs->sysctl_ctx);
		free(rs->rs_rlt, M_TCPPACE);
		free(rs, M_TCPPACE);
	}
}

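/*
 * Schedule the rate set for destruction once the current network epoch
 * has drained. Callers must hold rs_mtx; the RS_FUNERAL_SCHD flag keeps
 * a set from being queued twice, and the final free happens in
 * rs_destroy() above, which re-checks rs_flows_using before freeing.
 */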
static void
rs_defer_destroy(struct tcp_rate_set *rs)
{

	mtx_assert(&rs_mtx, MA_OWNED);

	/* Check if already pending. */
	if (rs->rs_flags & RS_FUNERAL_SCHD)
		return;

	rs_number_dead++;

	/* Set flag to only defer once. */
	rs->rs_flags |= RS_FUNERAL_SCHD;
	NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
}

#ifdef INET
extern counter_u64_t rate_limit_new;
extern counter_u64_t rate_limit_chg;
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

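/*
 * Allocate a hardware rate-limit send tag on ifp for cfg_rate (in bytes
 * per second), storing it in *tag. On success the set_ok/active counters
 * are bumped (when INET is compiled in); allocation failures other than
 * EOPNOTSUPP are counted in rate_limit_alloc_fail.
 */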
static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
	int error;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = cfg_rate,
		.rate_limit.flags = M_NOWAIT,
	};

	error = m_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
	/*
	 * The internal table is "special", it
	 * is two separate ordered tables that
	 * must be merged. We get here when the
	 * adapter specifies a number of rates that
	 * covers both ranges in the table in some
	 * form.
	 */
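	/*
	 * Concretely: with RS_NEXT_ORDER_GROUP == 16 and the 76 entry
	 * desired_rates[] above, at_low walks entries 0-15 (1Mbps up to
	 * 800Mbps) while at_high walks entries 16 and up (45Mbps, 55Mbps,
	 * ...), and the loop below emits whichever of the two heads is
	 * smaller, i.e. a classic two-way merge of the ordered halves.
	 */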
	int i, at_low, at_high;
	uint8_t low_disabled = 0, high_disabled = 0;

	for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
		rs->rs_rlt[i].flags = 0;
		rs->rs_rlt[i].time_between = 0;
		if ((low_disabled == 0) &&
		    (high_disabled ||
		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
			rs->rs_rlt[i].rate = rate_table_act[at_low];
			at_low++;
			if (at_low == RS_NEXT_ORDER_GROUP)
				low_disabled = 1;
		} else if (high_disabled == 0) {
			rs->rs_rlt[i].rate = rate_table_act[at_high];
			at_high++;
			if (at_high == MAX_HDWR_RATES)
				high_disabled = 1;
		}
	}
}

static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
	struct tcp_rate_set *rs;
	const uint64_t *rate_table_act;
	uint64_t lentim, res;
	size_t sz;
	uint32_t hash_type;
	int i;
	struct if_ratelimit_query_results rl;
	struct sysctl_oid *rl_sysctl_root;
	struct epoch_tracker et;
	/*
	 * We expect to enter with the
	 * mutex locked.
	 */

	if (ifp->if_ratelimit_query == NULL) {
		/*
		 * We can do nothing if we cannot
		 * get a query back from the driver.
		 */
		printf("Warning:No query functions for %s:%d-- failed\n",
		    ifp->if_dname, ifp->if_dunit);
		return (NULL);
	}
	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
	if (rs == NULL) {
		if (error)
			*error = ENOMEM;
		printf("Warning:No memory for malloc of tcp_rate_set\n");
		return (NULL);
	}
	memset(&rl, 0, sizeof(rl));
	rl.flags = RT_NOSUPPORT;
	ifp->if_ratelimit_query(ifp, &rl);
	if (rl.flags & RT_IS_UNUSABLE) {
		/*
		 * The interface does not really support
		 * the rate-limiting.
		 */
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_INTF_NO_SUP;
		rs->rs_disable = 1;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_IS_DEFF;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
		/* Mellanox C4 likely */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_highest_valid = 0;
		rs->rs_flow_limit = rl.max_flows;
		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
		rs->rs_disable = 0;
		rate_table_act = rl.rate_table;
	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
		/* Chelsio, C5 and C6 of Mellanox? */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_disable = 0;
		rs->rs_flow_limit = rl.max_flows;
		rate_table_act = desired_rates;
		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
			/*
			 * Our desired table is not big
			 * enough, do what we can.
			 */
			rs->rs_rate_cnt = MAX_HDWR_RATES;
		}
		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
			rs->rs_flags = RS_IS_INTF;
		else
			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
	} else {
		free(rs, M_TCPPACE);
		return (NULL);
	}
	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
	if (rs->rs_rlt == NULL) {
		if (error)
			*error = ENOMEM;
bail:
		free(rs, M_TCPPACE);
		return (NULL);
	}
	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
		/*
		 * The interface supports all
		 * the rates we could possibly want.
		 */
		uint64_t rat;

		rs->rs_rlt[0].rate = 12500;	/* 100k */
		rs->rs_rlt[1].rate = 25000;	/* 200k */
		rs->rs_rlt[2].rate = 62500;	/* 500k */
		/* Note 125000 == 1Megabit
		 * populate 1Meg - 1000meg.
		 */
		for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
			rs->rs_rlt[i].rate = rat;
			rat += 125000;
		}
		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
	} else if (rs->rs_flags & RS_INT_TBL) {
		/* We populate this in a special way */
		populate_canned_table(rs, rate_table_act);
	} else {
		/*
		 * Just copy in the rates from
		 * the table, it is in order.
		 */
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			rs->rs_rlt[i].rate = rate_table_act[i];
			rs->rs_rlt[i].time_between = 0;
			rs->rs_rlt[i].flags = 0;
		}
	}
	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
		/*
		 * We go backwards through the list so that if we can't get
		 * a rate and fail to init one, we have at least a chance of
		 * getting the highest one.
		 */
		rs->rs_rlt[i].ptbl = rs;
		rs->rs_rlt[i].tag = NULL;
		rs->rs_rlt[i].using = 0;
		rs->rs_rlt[i].rs_num_enobufs = 0;
		/*
		 * Calculate the time between.
		 */
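		/*
		 * For example, assuming the usual ETHERNET_SEGMENT_SIZE of
		 * 1514 bytes (the full frame size the comments above use):
		 * at the 1Mbps entry (125,000 bytes/sec) the division below
		 * yields 1514 * 1,000,000 / 125,000 ~= 12,112 usec between
		 * frames, while at the 1Gbps entry (125,000,000 bytes/sec)
		 * it is only ~12 usec.
		 */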
		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
		res = lentim / rs->rs_rlt[i].rate;
		if (res > 0)
			rs->rs_rlt[i].time_between = res;
		else
			rs->rs_rlt[i].time_between = 1;
		if (rs->rs_flags & RS_NO_PRE) {
			rs->rs_rlt[i].flags = HDWRPACE_INITED;
			rs->rs_lowest_valid = i;
		} else {
			int err;

			if ((rl.flags & RT_IS_SETUP_REQ) &&
			    (ifp->if_ratelimit_query)) {
				err = ifp->if_ratelimit_setup(ifp,
				    rs->rs_rlt[i].rate, i);
				if (err)
					goto handle_err;
			}
#ifdef RSS
			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
			hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
			err = rl_attach_txrtlmt(ifp,
			    hash_type,
			    (i + 1),
			    rs->rs_rlt[i].rate,
			    &rs->rs_rlt[i].tag);
			if (err) {
handle_err:
				if (i == (rs->rs_rate_cnt - 1)) {
					/*
					 * Huh - first rate and we can't get
					 * it?
					 */
					free(rs->rs_rlt, M_TCPPACE);
					if (error)
						*error = err;
					goto bail;
				} else {
					if (error)
						*error = err;
				}
				break;
			} else {
				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
				rs->rs_lowest_valid = i;
			}
		}
	}
	/* Did we get at least 1 rate? */
	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
	else {
		free(rs->rs_rlt, M_TCPPACE);
		goto bail;
	}
	rs_number_alive++;
	sysctl_ctx_init(&rs->sysctl_ctx);
	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
	    OID_AUTO,
	    rs->rs_ifp->if_xname,
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "");
	rl_add_syctl_entries(rl_sysctl_root, rs);
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	return (rs);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
	int i;

	mbits_per_sec = (bytes_per_sec * 8);
	if (flags & RS_PACING_LT) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			/*
			 * Smaller than 1Meg, only
			 * 3 entries can match it.
			 */
			previous_rate = 0;
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
					arte = &rs->rs_rlt[i];
				}
				previous_rate = rs->rs_rlt[i].rate;
			}
			goto done;
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/*
			 * Larger than 1G (the majority of
			 * our table).
			 */
			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			else
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
			goto done;
		}
		/*
		 * If we reach here it's in our table (between 1Meg - 1000Meg),
		 * just take the rounded down mbits per second, and add
		 * 1Megabit to it, from this we can calculate
		 * the index in the table.
		 */
		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
			ind_calc++;
		/* our table is offset by 3, we add 2 */
		ind_calc += 2;
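		/*
		 * Worked example: a request of 5,000,000 bytes/sec is
		 * 40,000,000 bits/sec, so ind_calc becomes 40 (exact, no
		 * round up) plus 2 = 42, and with the ALL_HARDWARE_RATES
		 * table built above rs_rlt[42].rate == 125000 * 40 ==
		 * 5,000,000 bytes/sec, i.e. index (Mbps + 2) holds the
		 * matching 1Meg-step rate.
		 */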
		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
			/* This should not happen */
			ind_calc = ALL_HARDWARE_RATES-1;
		}
		if ((ind_calc >= rs->rs_lowest_valid) &&
		    (ind_calc <= rs->rs_highest_valid)) {
			rte = &rs->rs_rlt[ind_calc];
			if (ind_calc >= 1)
				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
		}
	} else if (flags & RS_PACING_EXACT_MATCH) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec == rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				}
			}
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/* > 1Gbps only one rate */
			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
				/* It's 10G, wow */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* Ok it must be an exact meg (it's between 1G and 1Meg) */
			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
				/* it's an exact Mbps */
				ind_calc += 2;
				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
					/* This should not happen */
					ind_calc = ALL_HARDWARE_RATES-1;
				}
				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
					rte = &rs->rs_rlt[ind_calc];
			}
		}
	} else {
		/* we want greater than the requested rate */
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)) {
			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
			for (i = 2; i >= rs->rs_lowest_valid; i--) {
				if (bytes_per_sec < rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else if ((flags & RS_PACING_GEQ) &&
				    (bytes_per_sec == rs->rs_rlt[i].rate)) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else {
					arte = &rs->rs_rlt[i]; /* new alternate */
				}
			}
		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* Our top rate is larger than the request */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* It matches our top rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
				/* The top rate is an alternative */
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
		} else {
			/* It's in our range 1Meg - 1Gig */
			if (flags & RS_PACING_GEQ) {
				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
						/* This should not happen */
						ind_calc = (ALL_HARDWARE_RATES-1);
					}
					rte = &rs->rs_rlt[ind_calc];
					if (ind_calc >= 1)
						previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
				}
				goto done;
			}
			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
			ind_calc += 2;
			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
				/* This should not happen */
				ind_calc = ALL_HARDWARE_RATES-1;
			}
			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
				rte = &rs->rs_rlt[ind_calc];
				if (ind_calc >= 1)
					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
			}
		}
	}
done:
	if ((rte == NULL) &&
	    (arte != NULL) &&
	    (flags & RS_PACING_SUB_OK)) {
		/* We can use the substitute */
		rte = arte;
	}
	if (lower_rate)
		*lower_rate = previous_rate;
	return (rte);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	/**
	 * Hunt the rate table with the restrictions in flags and find a
	 * suitable rate if possible.
	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
	 * RS_PACING_GT - must be greater than.
	 * RS_PACING_GEQ - must be greater than or equal.
	 * RS_PACING_LT - must be less than.
	 * RS_PACING_SUB_OK - If we don't meet criteria a
	 *     substitute is ok.
	 */
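	/*
	 * For example (purely illustrative), a caller wanting at least
	 * 10Mbps with a fallback allowed would pass a bytes_per_sec of
	 * 1250000 together with RS_PACING_GEQ | RS_PACING_SUB_OK.
	 */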
96020abea66SRandall Stewart int i, matched;
96120abea66SRandall Stewart struct tcp_hwrate_limit_table *rte = NULL;
9621a714ff2SRandall Stewart uint64_t previous_rate = 0;
96320abea66SRandall Stewart
96420abea66SRandall Stewart if ((rs->rs_flags & RS_INT_TBL) &&
96520abea66SRandall Stewart (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
96620abea66SRandall Stewart /*
96720abea66SRandall Stewart * Here we don't want to paw thru
96820abea66SRandall Stewart * a big table, we have everything
96920abea66SRandall Stewart * from 1Meg - 1000Meg in 1Meg increments.
97020abea66SRandall Stewart * Use an alternate method to "lookup".
97120abea66SRandall Stewart */
9721a714ff2SRandall Stewart return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
97320abea66SRandall Stewart }
97420abea66SRandall Stewart if ((flags & RS_PACING_LT) ||
97520abea66SRandall Stewart (flags & RS_PACING_EXACT_MATCH)) {
97620abea66SRandall Stewart /*
97720abea66SRandall Stewart * For exact and less than we go forward through the table.
97820abea66SRandall Stewart * This way when we find one larger we stop (exact was a
97920abea66SRandall Stewart * toss up).
98020abea66SRandall Stewart */
98120abea66SRandall Stewart for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
98220abea66SRandall Stewart if ((flags & RS_PACING_EXACT_MATCH) &&
98320abea66SRandall Stewart (bytes_per_sec == rs->rs_rlt[i].rate)) {
98420abea66SRandall Stewart rte = &rs->rs_rlt[i];
98520abea66SRandall Stewart matched = 1;
9861a714ff2SRandall Stewart if (lower_rate != NULL)
9871a714ff2SRandall Stewart *lower_rate = previous_rate;
98820abea66SRandall Stewart break;
98920abea66SRandall Stewart } else if ((flags & RS_PACING_LT) &&
99020abea66SRandall Stewart (bytes_per_sec <= rs->rs_rlt[i].rate)) {
99120abea66SRandall Stewart rte = &rs->rs_rlt[i];
99220abea66SRandall Stewart matched = 1;
9931a714ff2SRandall Stewart if (lower_rate != NULL)
9941a714ff2SRandall Stewart *lower_rate = previous_rate;
99520abea66SRandall Stewart break;
99620abea66SRandall Stewart }
9971a714ff2SRandall Stewart previous_rate = rs->rs_rlt[i].rate;
99820abea66SRandall Stewart if (bytes_per_sec > rs->rs_rlt[i].rate)
99920abea66SRandall Stewart break;
100020abea66SRandall Stewart }
100120abea66SRandall Stewart if ((matched == 0) &&
100220abea66SRandall Stewart (flags & RS_PACING_LT) &&
100320abea66SRandall Stewart (flags & RS_PACING_SUB_OK)) {
100420abea66SRandall Stewart /* Kick in a substitute (the lowest) */
100520abea66SRandall Stewart rte = &rs->rs_rlt[rs->rs_lowest_valid];
100620abea66SRandall Stewart }
100720abea66SRandall Stewart } else {
100820abea66SRandall Stewart /*
100920abea66SRandall Stewart * Here we go backward through the table so that we can find
101020abea66SRandall Stewart * the one greater, in theory, faster (but it's probably a
101120abea66SRandall Stewart * wash).
101220abea66SRandall Stewart */
101320abea66SRandall Stewart for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
101420abea66SRandall Stewart if (rs->rs_rlt[i].rate > bytes_per_sec) {
101520abea66SRandall Stewart /* A possible candidate */
101620abea66SRandall Stewart rte = &rs->rs_rlt[i];
101720abea66SRandall Stewart }
101820abea66SRandall Stewart if ((flags & RS_PACING_GEQ) &&
101920abea66SRandall Stewart (bytes_per_sec == rs->rs_rlt[i].rate)) {
102020abea66SRandall Stewart /* An exact match and we want equal */
102120abea66SRandall Stewart matched = 1;
102220abea66SRandall Stewart rte = &rs->rs_rlt[i];
102320abea66SRandall Stewart break;
102420abea66SRandall Stewart } else if (rte) {
102520abea66SRandall Stewart /*
102620abea66SRandall Stewart * Found one that is larger, but don't
102720abea66SRandall Stewart * stop; there may be a closer match.
102820abea66SRandall Stewart */
102920abea66SRandall Stewart matched = 1;
103020abea66SRandall Stewart }
103120abea66SRandall Stewart if (rs->rs_rlt[i].rate < bytes_per_sec) {
103220abea66SRandall Stewart /*
103320abea66SRandall Stewart * We found a table entry that is smaller,
103420abea66SRandall Stewart * stop, there will be none greater or equal.
103520abea66SRandall Stewart */
10361a714ff2SRandall Stewart if (lower_rate != NULL)
10371a714ff2SRandall Stewart *lower_rate = rs->rs_rlt[i].rate;
103820abea66SRandall Stewart break;
103920abea66SRandall Stewart }
104020abea66SRandall Stewart }
104120abea66SRandall Stewart if ((matched == 0) &&
104220abea66SRandall Stewart (flags & RS_PACING_SUB_OK)) {
104320abea66SRandall Stewart /* Kick in a substitute (the highest) */
104420abea66SRandall Stewart rte = &rs->rs_rlt[rs->rs_highest_valid];
104520abea66SRandall Stewart }
104620abea66SRandall Stewart }
104720abea66SRandall Stewart return (rte);
104820abea66SRandall Stewart }
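/*
 * Example (hypothetical table, for illustration only): suppose the
 * table holds entries equivalent to 10, 50, 100 and 500Mbps and a
 * caller asks for 150Mbps worth of bytes_per_sec with RS_PACING_GEQ.
 * Walking backward from the highest entry, the 500Mbps entry is
 * larger than the request and is remembered as a candidate, the
 * 100Mbps entry is smaller so the loop stops, *lower_rate is set to
 * the 100Mbps entry's rate, and the 500Mbps entry is returned.
 */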
104920abea66SRandall Stewart
105020abea66SRandall Stewart static struct ifnet *
105120abea66SRandall Stewart rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
105220abea66SRandall Stewart {
105320abea66SRandall Stewart struct ifnet *tifp;
10541a714ff2SRandall Stewart struct m_snd_tag *tag, *ntag;
105520abea66SRandall Stewart union if_snd_tag_alloc_params params = {
105620abea66SRandall Stewart .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
10571a714ff2SRandall Stewart .rate_limit.hdr.flowid = inp->inp_flowid,
105898085baeSAndrew Gallatin .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
105920abea66SRandall Stewart .rate_limit.max_rate = COMMON_RATE,
106020abea66SRandall Stewart .rate_limit.flags = M_NOWAIT,
106120abea66SRandall Stewart };
106220abea66SRandall Stewart int err;
106320abea66SRandall Stewart #ifdef RSS
106420abea66SRandall Stewart params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
106520abea66SRandall Stewart M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
106620abea66SRandall Stewart #else
106720abea66SRandall Stewart params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
106820abea66SRandall Stewart #endif
106936e0a362SJohn Baldwin err = m_snd_tag_alloc(ifp, &params, &tag);
107020abea66SRandall Stewart if (err) {
107120abea66SRandall Stewart /* Failed to setup a tag? */
107220abea66SRandall Stewart if (error)
107320abea66SRandall Stewart *error = err;
107420abea66SRandall Stewart return (NULL);
107520abea66SRandall Stewart }
10761a714ff2SRandall Stewart ntag = tag;
1077c782ea8bSJohn Baldwin while (ntag->sw->next_snd_tag != NULL) {
1078c782ea8bSJohn Baldwin ntag = ntag->sw->next_snd_tag(ntag);
10791a714ff2SRandall Stewart }
10801a714ff2SRandall Stewart tifp = ntag->ifp;
108198d7a8d9SJohn Baldwin m_snd_tag_rele(tag);
108220abea66SRandall Stewart return (tifp);
108320abea66SRandall Stewart }
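/*
 * Note: the above matters when "ifp" is a pseudo interface such as
 * lagg(4) or vlan(4).  The throw-away RATE_LIMIT tag allocated above
 * is typically forwarded to the underlying port, so following
 * sw->next_snd_tag until it runs out lands on the tag owned by the
 * physical NIC, and ntag->ifp is the interface that will actually do
 * the pacing.
 */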
108420abea66SRandall Stewart
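/*
 * The rate table entries are handed to callers as const pointers, but
 * their usage and ENOBUFS counters still need to be updated.  The
 * helpers below use __DECONST to strip the qualifier just long enough
 * to do a lockless atomic update of the counter.
 */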
10851a714ff2SRandall Stewart static void
10861a714ff2SRandall Stewart rl_increment_using(const struct tcp_hwrate_limit_table *rte)
10871a714ff2SRandall Stewart {
10885d8fd932SRandall Stewart struct tcp_hwrate_limit_table *decon_rte;
10895d8fd932SRandall Stewart
10905d8fd932SRandall Stewart decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
10915d8fd932SRandall Stewart atomic_add_long(&decon_rte->using, 1);
10921a714ff2SRandall Stewart }
10931a714ff2SRandall Stewart
10941a714ff2SRandall Stewart static void
10951a714ff2SRandall Stewart rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
10961a714ff2SRandall Stewart {
10975d8fd932SRandall Stewart struct tcp_hwrate_limit_table *decon_rte;
10985d8fd932SRandall Stewart
10995d8fd932SRandall Stewart decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
11005d8fd932SRandall Stewart atomic_subtract_long(&decon_rte->using, 1);
11011a714ff2SRandall Stewart }
11021a714ff2SRandall Stewart
11031a714ff2SRandall Stewart void
11041a714ff2SRandall Stewart tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
11051a714ff2SRandall Stewart {
11065d8fd932SRandall Stewart struct tcp_hwrate_limit_table *decon_rte;
11075d8fd932SRandall Stewart
11085d8fd932SRandall Stewart decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
11095d8fd932SRandall Stewart atomic_add_long(&decon_rte->rs_num_enobufs, 1);
11101a714ff2SRandall Stewart }
11111a714ff2SRandall Stewart
11121a714ff2SRandall Stewart /*
11131a714ff2SRandall Stewart * Do NOT take the __noinline out of the
11141a714ff2SRandall Stewart * find_rs_for_ifp() function. If you do, inlining
11151a714ff2SRandall Stewart * it into rt_setup_rate() will show you a
11161a714ff2SRandall Stewart * compiler bug. For some reason the compiler thinks
11171a714ff2SRandall Stewart * the list can never be empty. The consequence of
11181a714ff2SRandall Stewart * this will be a crash when we dereference NULL
11191a714ff2SRandall Stewart * if an ifp is removed just as a hw rate limit
11201a714ff2SRandall Stewart * is attempted. If you are working on the compiler
11211a714ff2SRandall Stewart * and want to "test" this, go ahead and take the noinline
11221a714ff2SRandall Stewart * out; otherwise let sleeping dogs lie until such time
11231a714ff2SRandall Stewart * as we get a compiler fix. 10/2/20 -- RRS
11241a714ff2SRandall Stewart */
11251a714ff2SRandall Stewart static __noinline struct tcp_rate_set *
11261a714ff2SRandall Stewart find_rs_for_ifp(struct ifnet *ifp)
11271a714ff2SRandall Stewart {
11281a714ff2SRandall Stewart struct tcp_rate_set *rs;
11291a714ff2SRandall Stewart
11301a714ff2SRandall Stewart CK_LIST_FOREACH(rs, &int_rs, next) {
11311a714ff2SRandall Stewart if ((rs->rs_ifp == ifp) &&
11321a714ff2SRandall Stewart (rs->rs_if_dunit == ifp->if_dunit)) {
11331a714ff2SRandall Stewart /* Ok we found it */
11341a714ff2SRandall Stewart return (rs);
11351a714ff2SRandall Stewart }
11361a714ff2SRandall Stewart }
11371a714ff2SRandall Stewart return (NULL);
11381a714ff2SRandall Stewart }
11391a714ff2SRandall Stewart
11401a714ff2SRandall Stewart
114120abea66SRandall Stewart static const struct tcp_hwrate_limit_table *
114220abea66SRandall Stewart rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
11431a714ff2SRandall Stewart uint32_t flags, int *error, uint64_t *lower_rate)
114420abea66SRandall Stewart {
114520abea66SRandall Stewart /* First lets find the interface if it exists */
114620abea66SRandall Stewart const struct tcp_hwrate_limit_table *rte;
11471a714ff2SRandall Stewart /*
11481a714ff2SRandall Stewart * So why is rs volatile? This is to defeat a
11491a714ff2SRandall Stewart * compiler bug wherein the compiler is convinced
11501a714ff2SRandall Stewart * that rs can never be NULL (which is not true). Because
11511a714ff2SRandall Stewart * of its conviction it nicely optimizes out the "if (rs == NULL)"
11521a714ff2SRandall Stewart * check below, which means if you get a NULL back you dereference it.
11531a714ff2SRandall Stewart */
11541a714ff2SRandall Stewart volatile struct tcp_rate_set *rs;
115520abea66SRandall Stewart struct epoch_tracker et;
11561a714ff2SRandall Stewart struct ifnet *oifp = ifp;
115720abea66SRandall Stewart int err;
115820abea66SRandall Stewart
1159348404bcSRandall Stewart NET_EPOCH_ENTER(et);
116020abea66SRandall Stewart use_real_interface:
11611a714ff2SRandall Stewart rs = find_rs_for_ifp(ifp);
116220abea66SRandall Stewart if ((rs == NULL) ||
116320abea66SRandall Stewart (rs->rs_flags & RS_INTF_NO_SUP) ||
116420abea66SRandall Stewart (rs->rs_flags & RS_IS_DEAD)) {
116520abea66SRandall Stewart /*
116620abea66SRandall Stewart * This means we got a packet *before*
116720abea66SRandall Stewart * the IF-UP was processed below, <or>
116820abea66SRandall Stewart * while or after we already received an interface
116920abea66SRandall Stewart * departed event. In either case we really don't
117020abea66SRandall Stewart * want to do anything with pacing, in
117120abea66SRandall Stewart * the departing case the packet is not
117220abea66SRandall Stewart * going to go very far. The new case
117320abea66SRandall Stewart * might be arguable, but it's impossible
117420abea66SRandall Stewart * to tell from the departing case.
117520abea66SRandall Stewart */
11761a714ff2SRandall Stewart if (error)
117720abea66SRandall Stewart *error = ENODEV;
1178348404bcSRandall Stewart NET_EPOCH_EXIT(et);
117920abea66SRandall Stewart return (NULL);
118020abea66SRandall Stewart }
118120abea66SRandall Stewart
118220abea66SRandall Stewart if ((rs == NULL) || (rs->rs_disable != 0)) {
11831a714ff2SRandall Stewart if (error)
118420abea66SRandall Stewart *error = ENOSPC;
1185348404bcSRandall Stewart NET_EPOCH_EXIT(et);
118620abea66SRandall Stewart return (NULL);
118720abea66SRandall Stewart }
118820abea66SRandall Stewart if (rs->rs_flags & RS_IS_DEFF) {
118920abea66SRandall Stewart /* We need to find the real interface */
119020abea66SRandall Stewart struct ifnet *tifp;
119120abea66SRandall Stewart
119220abea66SRandall Stewart tifp = rt_find_real_interface(ifp, inp, error);
119320abea66SRandall Stewart if (tifp == NULL) {
119420abea66SRandall Stewart if (rs->rs_disable && error)
119520abea66SRandall Stewart *error = ENOTSUP;
1196348404bcSRandall Stewart NET_EPOCH_EXIT(et);
119720abea66SRandall Stewart return (NULL);
119820abea66SRandall Stewart }
11991a714ff2SRandall Stewart KASSERT((tifp != ifp),
12001a714ff2SRandall Stewart ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
12011a714ff2SRandall Stewart ifp, inp, tifp));
12021a714ff2SRandall Stewart ifp = tifp;
120320abea66SRandall Stewart goto use_real_interface;
120420abea66SRandall Stewart }
120520abea66SRandall Stewart if (rs->rs_flow_limit &&
120620abea66SRandall Stewart ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
120720abea66SRandall Stewart if (error)
120820abea66SRandall Stewart *error = ENOSPC;
1209348404bcSRandall Stewart NET_EPOCH_EXIT(et);
121020abea66SRandall Stewart return (NULL);
121120abea66SRandall Stewart }
12121a714ff2SRandall Stewart rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
121320abea66SRandall Stewart if (rte) {
12141a714ff2SRandall Stewart err = in_pcbattach_txrtlmt(inp, oifp,
121520abea66SRandall Stewart inp->inp_flowtype,
121620abea66SRandall Stewart inp->inp_flowid,
121720abea66SRandall Stewart rte->rate,
121820abea66SRandall Stewart &inp->inp_snd_tag);
121920abea66SRandall Stewart if (err) {
122020abea66SRandall Stewart /* Failed to attach */
122120abea66SRandall Stewart if (error)
122220abea66SRandall Stewart *error = err;
122320abea66SRandall Stewart rte = NULL;
12241a714ff2SRandall Stewart } else {
12251a714ff2SRandall Stewart KASSERT((inp->inp_snd_tag != NULL) ,
1226db46c0d0SHans Petter Selasky ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
1227db46c0d0SHans Petter Selasky inp, rte, (unsigned long long)rte->rate, rs));
1228db46c0d0SHans Petter Selasky #ifdef INET
12291a714ff2SRandall Stewart counter_u64_add(rate_limit_new, 1);
1230db46c0d0SHans Petter Selasky #endif
123120abea66SRandall Stewart }
123220abea66SRandall Stewart }
123320abea66SRandall Stewart if (rte) {
123420abea66SRandall Stewart /*
123520abea66SRandall Stewart * We use an atomic here for accounting so we don't have to
123620abea66SRandall Stewart * use locks when freeing.
123720abea66SRandall Stewart */
123899c311c4SRandall Stewart atomic_add_64(&rs->rs_flows_using, 1);
123920abea66SRandall Stewart }
1240348404bcSRandall Stewart NET_EPOCH_EXIT(et);
124120abea66SRandall Stewart return (rte);
124220abea66SRandall Stewart }
124320abea66SRandall Stewart
124420abea66SRandall Stewart static void
124520abea66SRandall Stewart tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
124620abea66SRandall Stewart {
124720abea66SRandall Stewart int error;
124820abea66SRandall Stewart struct tcp_rate_set *rs;
12491a714ff2SRandall Stewart struct epoch_tracker et;
125020abea66SRandall Stewart
12519aed26b9SJohn Baldwin if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
125220abea66SRandall Stewart (link_state != LINK_STATE_UP)) {
125320abea66SRandall Stewart /*
125420abea66SRandall Stewart * We only care about an interface going up that is rate-limit
125520abea66SRandall Stewart * capable.
125620abea66SRandall Stewart */
125720abea66SRandall Stewart return;
125820abea66SRandall Stewart }
12591a714ff2SRandall Stewart NET_EPOCH_ENTER(et);
126020abea66SRandall Stewart mtx_lock(&rs_mtx);
12611a714ff2SRandall Stewart rs = find_rs_for_ifp(ifp);
12621a714ff2SRandall Stewart if (rs) {
126320abea66SRandall Stewart /* We already have initialized this guy */
126420abea66SRandall Stewart mtx_unlock(&rs_mtx);
12651a714ff2SRandall Stewart NET_EPOCH_EXIT(et);
126620abea66SRandall Stewart return;
126720abea66SRandall Stewart }
126820abea66SRandall Stewart mtx_unlock(&rs_mtx);
12691a714ff2SRandall Stewart NET_EPOCH_EXIT(et);
127015ddc5e4SMichael Tuexen rt_setup_new_rs(ifp, &error);
127120abea66SRandall Stewart }
127220abea66SRandall Stewart
127320abea66SRandall Stewart static void
127420abea66SRandall Stewart tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
127520abea66SRandall Stewart {
12761a714ff2SRandall Stewart struct tcp_rate_set *rs;
12771a714ff2SRandall Stewart struct epoch_tracker et;
127820abea66SRandall Stewart int i;
127920abea66SRandall Stewart
12801a714ff2SRandall Stewart NET_EPOCH_ENTER(et);
128120abea66SRandall Stewart mtx_lock(&rs_mtx);
12821a714ff2SRandall Stewart rs = find_rs_for_ifp(ifp);
12831a714ff2SRandall Stewart if (rs) {
128420abea66SRandall Stewart CK_LIST_REMOVE(rs, next);
128520abea66SRandall Stewart rs_number_alive--;
128620abea66SRandall Stewart rs->rs_flags |= RS_IS_DEAD;
128720abea66SRandall Stewart for (i = 0; i < rs->rs_rate_cnt; i++) {
128820abea66SRandall Stewart if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
128998d7a8d9SJohn Baldwin in_pcbdetach_tag(rs->rs_rlt[i].tag);
129020abea66SRandall Stewart rs->rs_rlt[i].tag = NULL;
129120abea66SRandall Stewart }
129220abea66SRandall Stewart rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
129320abea66SRandall Stewart }
1294eabddb25SHans Petter Selasky if (rs->rs_flows_using == 0)
1295eabddb25SHans Petter Selasky rs_defer_destroy(rs);
129620abea66SRandall Stewart }
129720abea66SRandall Stewart mtx_unlock(&rs_mtx);
12981a714ff2SRandall Stewart NET_EPOCH_EXIT(et);
129920abea66SRandall Stewart }
130020abea66SRandall Stewart
1301*1f628be8SAndrew Gallatin void
1302*1f628be8SAndrew Gallatin tcp_rl_release_ifnet(struct ifnet *ifp)
1303*1f628be8SAndrew Gallatin {
1304*1f628be8SAndrew Gallatin tcp_rl_ifnet_departure(NULL, ifp);
1305*1f628be8SAndrew Gallatin }
1306*1f628be8SAndrew Gallatin
130720abea66SRandall Stewart static void
130820abea66SRandall Stewart tcp_rl_shutdown(void *arg __unused, int howto __unused)
130920abea66SRandall Stewart {
131020abea66SRandall Stewart struct tcp_rate_set *rs, *nrs;
13111a714ff2SRandall Stewart struct epoch_tracker et;
131220abea66SRandall Stewart int i;
131320abea66SRandall Stewart
13141a714ff2SRandall Stewart NET_EPOCH_ENTER(et);
131520abea66SRandall Stewart mtx_lock(&rs_mtx);
131620abea66SRandall Stewart CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
131720abea66SRandall Stewart CK_LIST_REMOVE(rs, next);
131820abea66SRandall Stewart rs_number_alive--;
131920abea66SRandall Stewart rs->rs_flags |= RS_IS_DEAD;
132020abea66SRandall Stewart for (i = 0; i < rs->rs_rate_cnt; i++) {
132120abea66SRandall Stewart if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
132298d7a8d9SJohn Baldwin in_pcbdetach_tag(rs->rs_rlt[i].tag);
132320abea66SRandall Stewart rs->rs_rlt[i].tag = NULL;
132420abea66SRandall Stewart }
132520abea66SRandall Stewart rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
132620abea66SRandall Stewart }
1327eabddb25SHans Petter Selasky if (rs->rs_flows_using == 0)
1328eabddb25SHans Petter Selasky rs_defer_destroy(rs);
132920abea66SRandall Stewart }
133020abea66SRandall Stewart mtx_unlock(&rs_mtx);
13311a714ff2SRandall Stewart NET_EPOCH_EXIT(et);
133220abea66SRandall Stewart }
133320abea66SRandall Stewart
133420abea66SRandall Stewart const struct tcp_hwrate_limit_table *
133520abea66SRandall Stewart tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
13361a714ff2SRandall Stewart uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
133720abea66SRandall Stewart {
13389eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp);
133920abea66SRandall Stewart const struct tcp_hwrate_limit_table *rte;
1340521eac97SJohn Baldwin #ifdef KERN_TLS
1341521eac97SJohn Baldwin struct ktls_session *tls;
1342521eac97SJohn Baldwin #endif
134320abea66SRandall Stewart
13449eb0e832SGleb Smirnoff INP_WLOCK_ASSERT(inp);
1345ce398115SJohn Baldwin
13469eb0e832SGleb Smirnoff if (inp->inp_snd_tag == NULL) {
134720abea66SRandall Stewart /*
134820abea66SRandall Stewart * We are setting up a rate for the first time.
134920abea66SRandall Stewart */
13509aed26b9SJohn Baldwin if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
135120abea66SRandall Stewart /* Not supported by the egress */
135220abea66SRandall Stewart if (error)
135320abea66SRandall Stewart *error = ENODEV;
135420abea66SRandall Stewart return (NULL);
135520abea66SRandall Stewart }
135620abea66SRandall Stewart #ifdef KERN_TLS
1357521eac97SJohn Baldwin tls = NULL;
1358c0e4090eSAndrew Gallatin if (tp->t_nic_ktls_xmit != 0) {
13599eb0e832SGleb Smirnoff tls = tptosocket(tp)->so_snd.sb_tls_info;
1360521eac97SJohn Baldwin
1361521eac97SJohn Baldwin if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
1362521eac97SJohn Baldwin tls->mode != TCP_TLS_MODE_IFNET) {
136320abea66SRandall Stewart if (error)
1364521eac97SJohn Baldwin *error = ENODEV;
136520abea66SRandall Stewart return (NULL);
136620abea66SRandall Stewart }
1367521eac97SJohn Baldwin }
136820abea66SRandall Stewart #endif
13699eb0e832SGleb Smirnoff rte = rt_setup_rate(inp, ifp, bytes_per_sec, flags, error, lower_rate);
13701a714ff2SRandall Stewart if (rte)
13711a714ff2SRandall Stewart rl_increment_using(rte);
1372521eac97SJohn Baldwin #ifdef KERN_TLS
1373521eac97SJohn Baldwin if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
1374521eac97SJohn Baldwin /*
1375521eac97SJohn Baldwin * Fake a route change error to reset the TLS
1376521eac97SJohn Baldwin * send tag. This will convert the existing
1377521eac97SJohn Baldwin * tag to a TLS ratelimit tag.
1378521eac97SJohn Baldwin */
1379c782ea8bSJohn Baldwin MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
13809eb0e832SGleb Smirnoff ktls_output_eagain(inp, tls);
1381521eac97SJohn Baldwin }
1382521eac97SJohn Baldwin #endif
138320abea66SRandall Stewart } else {
138420abea66SRandall Stewart /*
138520abea66SRandall Stewart * We are modifying a rate, wrong interface?
138620abea66SRandall Stewart */
138720abea66SRandall Stewart if (error)
138820abea66SRandall Stewart *error = EINVAL;
138920abea66SRandall Stewart rte = NULL;
139020abea66SRandall Stewart }
13911a714ff2SRandall Stewart if (rte != NULL) {
1392ce398115SJohn Baldwin tp->t_pacing_rate = rte->rate;
1393d7313dc6SRandall Stewart *error = 0;
13941a714ff2SRandall Stewart }
139520abea66SRandall Stewart return (rte);
139620abea66SRandall Stewart }
139720abea66SRandall Stewart
139820abea66SRandall Stewart const struct tcp_hwrate_limit_table *
139920abea66SRandall Stewart tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
140020abea66SRandall Stewart struct tcpcb *tp, struct ifnet *ifp,
14011a714ff2SRandall Stewart uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
140220abea66SRandall Stewart {
14039eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp);
140420abea66SRandall Stewart const struct tcp_hwrate_limit_table *nrte;
140520abea66SRandall Stewart const struct tcp_rate_set *rs;
1406521eac97SJohn Baldwin #ifdef KERN_TLS
1407521eac97SJohn Baldwin struct ktls_session *tls = NULL;
1408521eac97SJohn Baldwin #endif
140920abea66SRandall Stewart int err;
141020abea66SRandall Stewart
14119eb0e832SGleb Smirnoff INP_WLOCK_ASSERT(inp);
1412ce398115SJohn Baldwin
1413521eac97SJohn Baldwin if (crte == NULL) {
1414521eac97SJohn Baldwin /* Wrong interface */
1415521eac97SJohn Baldwin if (error)
1416521eac97SJohn Baldwin *error = EINVAL;
1417521eac97SJohn Baldwin return (NULL);
1418521eac97SJohn Baldwin }
1419521eac97SJohn Baldwin
1420521eac97SJohn Baldwin #ifdef KERN_TLS
1421c0e4090eSAndrew Gallatin if (tp->t_nic_ktls_xmit) {
14229eb0e832SGleb Smirnoff tls = tptosocket(tp)->so_snd.sb_tls_info;
1423d782385eSJohn Baldwin if (tls->mode != TCP_TLS_MODE_IFNET)
1424d782385eSJohn Baldwin tls = NULL;
1425d782385eSJohn Baldwin else if (tls->snd_tag != NULL &&
1426c782ea8bSJohn Baldwin tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
1427d782385eSJohn Baldwin if (!tls->reset_pending) {
1428521eac97SJohn Baldwin /*
1429d782385eSJohn Baldwin * NIC probably doesn't support
1430d782385eSJohn Baldwin * ratelimit TLS tags if it didn't
1431d782385eSJohn Baldwin * allocate one when an existing rate
1432d782385eSJohn Baldwin * was present, so ignore.
1433521eac97SJohn Baldwin */
14348a7404b2SAndrew Gallatin tcp_rel_pacing_rate(crte, tp);
1435521eac97SJohn Baldwin if (error)
1436521eac97SJohn Baldwin *error = EOPNOTSUPP;
1437521eac97SJohn Baldwin return (NULL);
1438521eac97SJohn Baldwin }
1439d782385eSJohn Baldwin
1440d782385eSJohn Baldwin /*
1441d782385eSJohn Baldwin * The send tag is being converted, so set the
1442d782385eSJohn Baldwin * rate limit on the inpcb tag. There is a
1443d782385eSJohn Baldwin * race that the new NIC send tag might use
1444d782385eSJohn Baldwin * the current rate instead of this one.
1445d782385eSJohn Baldwin */
1446d782385eSJohn Baldwin tls = NULL;
1447d782385eSJohn Baldwin }
1448521eac97SJohn Baldwin }
1449521eac97SJohn Baldwin #endif
14509eb0e832SGleb Smirnoff if (inp->inp_snd_tag == NULL) {
145120abea66SRandall Stewart /* Wrong interface */
14528a7404b2SAndrew Gallatin tcp_rel_pacing_rate(crte, tp);
145320abea66SRandall Stewart if (error)
145420abea66SRandall Stewart *error = EINVAL;
145520abea66SRandall Stewart return (NULL);
145620abea66SRandall Stewart }
145720abea66SRandall Stewart rs = crte->ptbl;
145820abea66SRandall Stewart if ((rs->rs_flags & RS_IS_DEAD) ||
145920abea66SRandall Stewart (crte->flags & HDWRPACE_IFPDEPARTED)) {
146020abea66SRandall Stewart /* Release the rate, and try anew */
14611a714ff2SRandall Stewart
146220abea66SRandall Stewart tcp_rel_pacing_rate(crte, tp);
146320abea66SRandall Stewart nrte = tcp_set_pacing_rate(tp, ifp,
14641a714ff2SRandall Stewart bytes_per_sec, flags, error, lower_rate);
146520abea66SRandall Stewart return (nrte);
146620abea66SRandall Stewart }
14671a714ff2SRandall Stewart nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
146820abea66SRandall Stewart if (nrte == crte) {
146920abea66SRandall Stewart /* No change */
147020abea66SRandall Stewart if (error)
147120abea66SRandall Stewart *error = 0;
147220abea66SRandall Stewart return (crte);
147320abea66SRandall Stewart }
147420abea66SRandall Stewart if (nrte == NULL) {
147520abea66SRandall Stewart /* Release the old rate */
14761a714ff2SRandall Stewart if (error)
14771a714ff2SRandall Stewart *error = ENOENT;
147820abea66SRandall Stewart tcp_rel_pacing_rate(crte, tp);
147920abea66SRandall Stewart return (NULL);
148020abea66SRandall Stewart }
14811a714ff2SRandall Stewart rl_decrement_using(crte);
14821a714ff2SRandall Stewart rl_increment_using(nrte);
148320abea66SRandall Stewart /* Change rates to our new entry */
1484521eac97SJohn Baldwin #ifdef KERN_TLS
1485521eac97SJohn Baldwin if (tls != NULL)
1486521eac97SJohn Baldwin err = ktls_modify_txrtlmt(tls, nrte->rate);
1487521eac97SJohn Baldwin else
1488521eac97SJohn Baldwin #endif
14899eb0e832SGleb Smirnoff err = in_pcbmodify_txrtlmt(inp, nrte->rate);
149020abea66SRandall Stewart if (err) {
14918a7404b2SAndrew Gallatin struct tcp_rate_set *lrs;
14928a7404b2SAndrew Gallatin uint64_t pre;
14938a7404b2SAndrew Gallatin
14941a714ff2SRandall Stewart rl_decrement_using(nrte);
14958a7404b2SAndrew Gallatin lrs = __DECONST(struct tcp_rate_set *, rs);
14968a7404b2SAndrew Gallatin pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
14971a714ff2SRandall Stewart /* Do we still have a snd-tag attached? */
14989eb0e832SGleb Smirnoff if (inp->inp_snd_tag)
14999eb0e832SGleb Smirnoff in_pcbdetach_txrtlmt(inp);
15008a7404b2SAndrew Gallatin
15018a7404b2SAndrew Gallatin if (pre == 1) {
15028a7404b2SAndrew Gallatin struct epoch_tracker et;
15038a7404b2SAndrew Gallatin
15048a7404b2SAndrew Gallatin NET_EPOCH_ENTER(et);
15058a7404b2SAndrew Gallatin mtx_lock(&rs_mtx);
15068a7404b2SAndrew Gallatin /*
15078a7404b2SAndrew Gallatin * Is it dead?
15088a7404b2SAndrew Gallatin */
15098a7404b2SAndrew Gallatin if (lrs->rs_flags & RS_IS_DEAD)
15108a7404b2SAndrew Gallatin rs_defer_destroy(lrs);
15118a7404b2SAndrew Gallatin mtx_unlock(&rs_mtx);
15128a7404b2SAndrew Gallatin NET_EPOCH_EXIT(et);
15138a7404b2SAndrew Gallatin }
151420abea66SRandall Stewart if (error)
151520abea66SRandall Stewart *error = err;
151620abea66SRandall Stewart return (NULL);
1517db46c0d0SHans Petter Selasky } else {
1518db46c0d0SHans Petter Selasky #ifdef INET
15191a714ff2SRandall Stewart counter_u64_add(rate_limit_chg, 1);
1520db46c0d0SHans Petter Selasky #endif
1521db46c0d0SHans Petter Selasky }
152220abea66SRandall Stewart if (error)
152320abea66SRandall Stewart *error = 0;
1524ce398115SJohn Baldwin tp->t_pacing_rate = nrte->rate;
152520abea66SRandall Stewart return (nrte);
152620abea66SRandall Stewart }
152720abea66SRandall Stewart
152820abea66SRandall Stewart void
152920abea66SRandall Stewart tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
153020abea66SRandall Stewart {
15319eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp);
153220abea66SRandall Stewart const struct tcp_rate_set *crs;
153320abea66SRandall Stewart struct tcp_rate_set *rs;
153420abea66SRandall Stewart uint64_t pre;
153520abea66SRandall Stewart
15369eb0e832SGleb Smirnoff INP_WLOCK_ASSERT(inp);
1537ce398115SJohn Baldwin
1538ce398115SJohn Baldwin tp->t_pacing_rate = -1;
153920abea66SRandall Stewart crs = crte->ptbl;
154020abea66SRandall Stewart /*
154120abea66SRandall Stewart * Now we must break the const
154220abea66SRandall Stewart * in order to release our refcount.
154320abea66SRandall Stewart */
154420abea66SRandall Stewart rs = __DECONST(struct tcp_rate_set *, crs);
15451a714ff2SRandall Stewart rl_decrement_using(crte);
1546a1589eb8SRandall Stewart pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
154720abea66SRandall Stewart if (pre == 1) {
15481a714ff2SRandall Stewart struct epoch_tracker et;
15491a714ff2SRandall Stewart
15501a714ff2SRandall Stewart NET_EPOCH_ENTER(et);
155120abea66SRandall Stewart mtx_lock(&rs_mtx);
155220abea66SRandall Stewart /*
155320abea66SRandall Stewart * Is it dead?
155420abea66SRandall Stewart */
1555eabddb25SHans Petter Selasky if (rs->rs_flags & RS_IS_DEAD)
1556eabddb25SHans Petter Selasky rs_defer_destroy(rs);
155720abea66SRandall Stewart mtx_unlock(&rs_mtx);
15581a714ff2SRandall Stewart NET_EPOCH_EXIT(et);
155920abea66SRandall Stewart }
1560521eac97SJohn Baldwin
1561521eac97SJohn Baldwin /*
1562521eac97SJohn Baldwin * XXX: If this connection is using ifnet TLS, should we
1563521eac97SJohn Baldwin * switch it to using an unlimited rate, or perhaps use
1564521eac97SJohn Baldwin * ktls_output_eagain() to reset the send tag to a plain
1565521eac97SJohn Baldwin * TLS tag?
1566521eac97SJohn Baldwin */
15679eb0e832SGleb Smirnoff in_pcbdetach_txrtlmt(inp);
156820abea66SRandall Stewart }
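/*
 * Sketch of the expected calling sequence for a stack that wants
 * hardware pacing (illustrative only; the rates and flags are
 * arbitrary and the inpcb write lock must be held as asserted above):
 *
 *	const struct tcp_hwrate_limit_table *crte;
 *	int err;
 *
 *	crte = tcp_set_pacing_rate(tp, ifp, 12500000, RS_PACING_GEQ,
 *	    &err, NULL);
 *	if (crte != NULL)
 *		crte = tcp_chg_pacing_rate(crte, tp, ifp, 25000000,
 *		    RS_PACING_GEQ, &err, NULL);
 *	if (crte != NULL)
 *		tcp_rel_pacing_rate(crte, tp);
 */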
156920abea66SRandall Stewart
1570d7313dc6SRandall Stewart #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits per second in bytes per second */
1571d7313dc6SRandall Stewart #define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */
1572d7313dc6SRandall Stewart #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */
1573d7313dc6SRandall Stewart #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */
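/*
 * The conversions above are simply bits per second divided by 8:
 * 1,200,000 / 8 = 150,000, 100,000,000 / 8 = 12,500,000 and
 * 500,000,000 / 8 = 62,500,000 bytes per second.
 */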
1574d7313dc6SRandall Stewart
15751a714ff2SRandall Stewart static void
15761a714ff2SRandall Stewart tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
15771a714ff2SRandall Stewart uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
15781a714ff2SRandall Stewart uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
15791a714ff2SRandall Stewart {
158069c7c811SRandall Stewart if (tcp_bblogging_on(tp)) {
15811a714ff2SRandall Stewart union tcp_log_stackspecific log;
15821a714ff2SRandall Stewart struct timeval tv;
15831a714ff2SRandall Stewart
15841a714ff2SRandall Stewart memset(&log, 0, sizeof(log));
15851a714ff2SRandall Stewart log.u_bbr.flex1 = segsiz;
15861a714ff2SRandall Stewart log.u_bbr.flex2 = new_tso;
15871a714ff2SRandall Stewart log.u_bbr.flex3 = time_between;
15881a714ff2SRandall Stewart log.u_bbr.flex4 = calc_time_between;
15891a714ff2SRandall Stewart log.u_bbr.flex5 = segs;
15901a714ff2SRandall Stewart log.u_bbr.flex6 = res_div;
15911a714ff2SRandall Stewart log.u_bbr.flex7 = mult;
15921a714ff2SRandall Stewart log.u_bbr.flex8 = mod;
15931a714ff2SRandall Stewart log.u_bbr.timeStamp = tcp_get_usecs(&tv);
15941a714ff2SRandall Stewart log.u_bbr.cur_del_rate = bw;
15951a714ff2SRandall Stewart log.u_bbr.delRate = hw_rate;
15961a714ff2SRandall Stewart TCP_LOG_EVENTP(tp, NULL,
15979eb0e832SGleb Smirnoff &tptosocket(tp)->so_rcv,
15989eb0e832SGleb Smirnoff &tptosocket(tp)->so_snd,
15991a714ff2SRandall Stewart TCP_HDWR_PACE_SIZE, 0,
16001a714ff2SRandall Stewart 0, &log, false, &tv);
16011a714ff2SRandall Stewart }
16021a714ff2SRandall Stewart }
16031a714ff2SRandall Stewart
1604d7313dc6SRandall Stewart uint32_t
160526bdd35cSRandall Stewart tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
160626bdd35cSRandall Stewart const struct tcp_hwrate_limit_table *te, int *err, int divisor)
1607d7313dc6SRandall Stewart {
1608d7313dc6SRandall Stewart /*
1609d7313dc6SRandall Stewart * We use the Google formula to calculate the
1610d7313dc6SRandall Stewart * TSO size, i.e.:
1611d7313dc6SRandall Stewart * bw < 24Meg
1612d7313dc6SRandall Stewart * tso = 2mss
1613d7313dc6SRandall Stewart * else
161426bdd35cSRandall Stewart * tso = min(bw/(div=1000), 64k)
1615d7313dc6SRandall Stewart *
1616d7313dc6SRandall Stewart * Note for these calculations we ignore the
1617d7313dc6SRandall Stewart * packet overhead (enet hdr, ip hdr and tcp hdr).
161826bdd35cSRandall Stewart * We only get the Google formula when we have
161926bdd35cSRandall Stewart * divisor = 1000, which is the default for now.
1620d7313dc6SRandall Stewart */
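	/*
	 * Worked example with made-up numbers: at bw = 12,500,000
	 * bytes/sec (100Mbps) and divisor = 1000, bytes = 12,500.
	 * With a 1,448 byte segsiz that rounds up to
	 * (12,500 + 1,447) / 1,448 = 9 segments, i.e. roughly 13,032
	 * bytes, before the even-boundary and minimum/maximum segment
	 * clamps below are applied.
	 */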
1621d7313dc6SRandall Stewart uint64_t lentim, res, bytes;
1622d7313dc6SRandall Stewart uint32_t new_tso, min_tso_segs;
1623d7313dc6SRandall Stewart
162426bdd35cSRandall Stewart /* It can't be zero */
162526bdd35cSRandall Stewart if ((divisor == 0) ||
162626bdd35cSRandall Stewart (divisor < RL_MIN_DIVISOR)) {
162726bdd35cSRandall Stewart if (mss_divisor)
162826bdd35cSRandall Stewart bytes = bw / mss_divisor;
162926bdd35cSRandall Stewart else
1630d7313dc6SRandall Stewart bytes = bw / 1000;
163126bdd35cSRandall Stewart } else
163226bdd35cSRandall Stewart bytes = bw / divisor;
163326bdd35cSRandall Stewart /* We can't ever send more than 65k in a TSO */
163426bdd35cSRandall Stewart if (bytes > 0xffff) {
163526bdd35cSRandall Stewart bytes = 0xffff;
163626bdd35cSRandall Stewart }
1637d7313dc6SRandall Stewart /* Round up */
1638d7313dc6SRandall Stewart new_tso = (bytes + segsiz - 1) / segsiz;
163926bdd35cSRandall Stewart /* Are we enforcing even boundaries? */
164026bdd35cSRandall Stewart if (even_num_segs && (new_tso & 1) && (new_tso > even_threshold))
164126bdd35cSRandall Stewart new_tso++;
164226bdd35cSRandall Stewart if (can_use_1mss)
1643d7313dc6SRandall Stewart min_tso_segs = 1;
1644d7313dc6SRandall Stewart else
1645d7313dc6SRandall Stewart min_tso_segs = 2;
16461a714ff2SRandall Stewart if (rs_floor_mss && (new_tso < rs_floor_mss))
16471a714ff2SRandall Stewart new_tso = rs_floor_mss;
16481a714ff2SRandall Stewart else if (new_tso < min_tso_segs)
1649d7313dc6SRandall Stewart new_tso = min_tso_segs;
1650d7313dc6SRandall Stewart if (new_tso > MAX_MSS_SENT)
1651d7313dc6SRandall Stewart new_tso = MAX_MSS_SENT;
1652d7313dc6SRandall Stewart new_tso *= segsiz;
16531a714ff2SRandall Stewart tcp_log_pacing_size(tp, bw, segsiz, new_tso,
16541a714ff2SRandall Stewart 0, 0, 0, 0, 0, 0, 1);
1655d7313dc6SRandall Stewart /*
1656d7313dc6SRandall Stewart * If we are not doing hardware pacing
1657d7313dc6SRandall Stewart * then we are done.
1658d7313dc6SRandall Stewart */
1659d7313dc6SRandall Stewart if (te == NULL) {
1660d7313dc6SRandall Stewart if (err)
1661d7313dc6SRandall Stewart *err = 0;
1662d7313dc6SRandall Stewart return(new_tso);
1663d7313dc6SRandall Stewart }
1664d7313dc6SRandall Stewart /*
1665d7313dc6SRandall Stewart * For hardware pacing we look at the
1666d7313dc6SRandall Stewart * rate you are sending at and compare
1667d7313dc6SRandall Stewart * that to the rate you have in hardware.
1668d7313dc6SRandall Stewart *
1669d7313dc6SRandall Stewart * If the hardware rate is slower than your
1670d7313dc6SRandall Stewart * software rate then you are in error and
1671d7313dc6SRandall Stewart * we will build a queue in our hardware, which
1672d7313dc6SRandall Stewart * is probably not desired; in such a case
1673d7313dc6SRandall Stewart * just return the non-hardware TSO size.
1674d7313dc6SRandall Stewart *
1675d7313dc6SRandall Stewart * If the rate in hardware is faster (which
1676d7313dc6SRandall Stewart * it should be) then look at how long it
1677d7313dc6SRandall Stewart * takes to send one ethernet segment size at
1678d7313dc6SRandall Stewart * your b/w and compare that to the time it
1679d7313dc6SRandall Stewart * takes to send at the rate you had selected.
1680d7313dc6SRandall Stewart *
1681d7313dc6SRandall Stewart * If your time is greater (which we hope it is)
1682d7313dc6SRandall Stewart * we get the delta between the two, and then
1683d7313dc6SRandall Stewart * divide that into your pacing time. This tells
1684d7313dc6SRandall Stewart * us how many MSS you can send down at once (rounded up).
1685d7313dc6SRandall Stewart *
1686d7313dc6SRandall Stewart * Note we also double this value if the b/w is over
1687d7313dc6SRandall Stewart * 100Mbps. If its over 500meg we just set you to the
1688d7313dc6SRandall Stewart * max (43 segments).
1689d7313dc6SRandall Stewart */
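	/*
	 * Worked example with made-up numbers: if one ethernet segment
	 * takes res = 120 usecs at the software rate bw and the hardware
	 * rate gives te->time_between = 12 usecs, then delta = 108.
	 * With illustrative tunable values of num_of_waits_allowed = 1
	 * and wait_time_floor = 0 that makes res_div = 120 and
	 * segs = (120 + 107) / 108 = 2, so two MSS worth of data are
	 * released per hardware send opportunity (before the floors and
	 * the MAX_MSS_SENT cap below are applied).
	 */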
1690d7313dc6SRandall Stewart if (te->rate > FIVE_HUNDRED_MBPS)
16911a714ff2SRandall Stewart goto max;
1692d7313dc6SRandall Stewart if (te->rate == bw) {
1693d7313dc6SRandall Stewart /* We are pacing at exactly the hdwr rate */
16941a714ff2SRandall Stewart max:
16951a714ff2SRandall Stewart tcp_log_pacing_size(tp, bw, segsiz, new_tso,
16961a714ff2SRandall Stewart te->rate, te->time_between, (uint32_t)0,
16971a714ff2SRandall Stewart (segsiz * MAX_MSS_SENT), 0, 0, 3);
1698d7313dc6SRandall Stewart return (segsiz * MAX_MSS_SENT);
1699d7313dc6SRandall Stewart }
1700d7313dc6SRandall Stewart lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
1701d7313dc6SRandall Stewart res = lentim / bw;
1702d7313dc6SRandall Stewart if (res > te->time_between) {
17031a714ff2SRandall Stewart uint32_t delta, segs, res_div;
1704d7313dc6SRandall Stewart
17051a714ff2SRandall Stewart res_div = ((res * num_of_waits_allowed) + wait_time_floor);
1706d7313dc6SRandall Stewart delta = res - te->time_between;
17071a714ff2SRandall Stewart segs = (res_div + delta - 1)/delta;
1708d7313dc6SRandall Stewart if (segs < min_tso_segs)
1709d7313dc6SRandall Stewart segs = min_tso_segs;
17101a714ff2SRandall Stewart if (segs < rs_hw_floor_mss)
17111a714ff2SRandall Stewart segs = rs_hw_floor_mss;
1712d7313dc6SRandall Stewart if (segs > MAX_MSS_SENT)
1713d7313dc6SRandall Stewart segs = MAX_MSS_SENT;
1714d7313dc6SRandall Stewart segs *= segsiz;
17151a714ff2SRandall Stewart tcp_log_pacing_size(tp, bw, segsiz, new_tso,
17161a714ff2SRandall Stewart te->rate, te->time_between, (uint32_t)res,
17171a714ff2SRandall Stewart segs, res_div, 1, 3);
1718d7313dc6SRandall Stewart if (err)
1719d7313dc6SRandall Stewart *err = 0;
1720d7313dc6SRandall Stewart if (segs < new_tso) {
1721d7313dc6SRandall Stewart /* unexpected ? */
1722d7313dc6SRandall Stewart return(new_tso);
1723d7313dc6SRandall Stewart } else {
1724d7313dc6SRandall Stewart return (segs);
1725d7313dc6SRandall Stewart }
1726d7313dc6SRandall Stewart } else {
1727d7313dc6SRandall Stewart /*
1728d7313dc6SRandall Stewart * Your time is smaller which means
1729d7313dc6SRandall Stewart * we will grow a queue on our
1730d7313dc6SRandall Stewart * hardware. Send back the non-hardware
1731d7313dc6SRandall Stewart * rate.
1732d7313dc6SRandall Stewart */
17331a714ff2SRandall Stewart tcp_log_pacing_size(tp, bw, segsiz, new_tso,
17341a714ff2SRandall Stewart te->rate, te->time_between, (uint32_t)res,
17351a714ff2SRandall Stewart 0, 0, 0, 4);
1736d7313dc6SRandall Stewart if (err)
1737d7313dc6SRandall Stewart *err = -1;
1738d7313dc6SRandall Stewart return (new_tso);
1739d7313dc6SRandall Stewart }
1740d7313dc6SRandall Stewart }
1741d7313dc6SRandall Stewart
17421a714ff2SRandall Stewart uint64_t
17431a714ff2SRandall Stewart tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
17441a714ff2SRandall Stewart {
17451a714ff2SRandall Stewart struct epoch_tracker et;
17461a714ff2SRandall Stewart struct tcp_rate_set *rs;
17471a714ff2SRandall Stewart uint64_t rate_ret;
17481a714ff2SRandall Stewart
17491a714ff2SRandall Stewart NET_EPOCH_ENTER(et);
17501a714ff2SRandall Stewart use_next_interface:
17511a714ff2SRandall Stewart rs = find_rs_for_ifp(ifp);
17521a714ff2SRandall Stewart if (rs == NULL) {
17531a714ff2SRandall Stewart /* This interface does not do ratelimiting */
17541a714ff2SRandall Stewart rate_ret = 0;
17551a714ff2SRandall Stewart } else if (rs->rs_flags & RS_IS_DEFF) {
17561a714ff2SRandall Stewart /* We need to find the real interface */
17571a714ff2SRandall Stewart struct ifnet *tifp;
17581a714ff2SRandall Stewart
17591a714ff2SRandall Stewart tifp = rt_find_real_interface(ifp, inp, NULL);
17601a714ff2SRandall Stewart if (tifp == NULL) {
17611a714ff2SRandall Stewart NET_EPOCH_EXIT(et);
17621a714ff2SRandall Stewart return (0);
17631a714ff2SRandall Stewart }
17641a714ff2SRandall Stewart ifp = tifp;
17651a714ff2SRandall Stewart goto use_next_interface;
17661a714ff2SRandall Stewart } else {
17671a714ff2SRandall Stewart /* Let's return the highest rate this guy has */
17681a714ff2SRandall Stewart rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
17691a714ff2SRandall Stewart }
17701a714ff2SRandall Stewart NET_EPOCH_EXIT(et);
17711a714ff2SRandall Stewart return(rate_ret);
17721a714ff2SRandall Stewart }
17731a714ff2SRandall Stewart
177420abea66SRandall Stewart static eventhandler_tag rl_ifnet_departs;
177520abea66SRandall Stewart static eventhandler_tag rl_ifnet_arrives;
177620abea66SRandall Stewart static eventhandler_tag rl_shutdown_start;
177720abea66SRandall Stewart
177820abea66SRandall Stewart static void
177920abea66SRandall Stewart tcp_rs_init(void *st __unused)
178020abea66SRandall Stewart {
178120abea66SRandall Stewart CK_LIST_INIT(&int_rs);
178220abea66SRandall Stewart rs_number_alive = 0;
1783c012cfe6SEd Maste rs_number_dead = 0;
178420abea66SRandall Stewart mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
178520abea66SRandall Stewart rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
178620abea66SRandall Stewart tcp_rl_ifnet_departure,
178720abea66SRandall Stewart NULL, EVENTHANDLER_PRI_ANY);
178820abea66SRandall Stewart rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
178920abea66SRandall Stewart tcp_rl_ifnet_link,
179020abea66SRandall Stewart NULL, EVENTHANDLER_PRI_ANY);
179120abea66SRandall Stewart rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
179220abea66SRandall Stewart tcp_rl_shutdown, NULL,
179320abea66SRandall Stewart SHUTDOWN_PRI_FIRST);
179420abea66SRandall Stewart printf("TCP_ratelimit: Is now initialized\n");
179520abea66SRandall Stewart }
179620abea66SRandall Stewart
179720abea66SRandall Stewart SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
179820abea66SRandall Stewart #endif
1799