xref: /freebsd/sys/netinet/tcp_ratelimit.c (revision 1f628be888b74f1219b3ea7ccea1e7a3d1db77a2)
120abea66SRandall Stewart /*-
220abea66SRandall Stewart  *
320abea66SRandall Stewart  * SPDX-License-Identifier: BSD-3-Clause
420abea66SRandall Stewart  *
528540ab1SWarner Losh  * Copyright (c) 2018-2020
620abea66SRandall Stewart  *	Netflix Inc.
720abea66SRandall Stewart  *
820abea66SRandall Stewart  * Redistribution and use in source and binary forms, with or without
920abea66SRandall Stewart  * modification, are permitted provided that the following conditions
1020abea66SRandall Stewart  * are met:
1120abea66SRandall Stewart  * 1. Redistributions of source code must retain the above copyright
1220abea66SRandall Stewart  *    notice, this list of conditions and the following disclaimer.
1320abea66SRandall Stewart  * 2. Redistributions in binary form must reproduce the above copyright
1420abea66SRandall Stewart  *    notice, this list of conditions and the following disclaimer in the
1520abea66SRandall Stewart  *    documentation and/or other materials provided with the distribution.
1620abea66SRandall Stewart  *
1720abea66SRandall Stewart  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
1820abea66SRandall Stewart  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1920abea66SRandall Stewart  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2020abea66SRandall Stewart  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2120abea66SRandall Stewart  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2220abea66SRandall Stewart  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2320abea66SRandall Stewart  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2420abea66SRandall Stewart  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2520abea66SRandall Stewart  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2620abea66SRandall Stewart  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2720abea66SRandall Stewart  * SUCH DAMAGE.
2820abea66SRandall Stewart  *
2920abea66SRandall Stewart  */
3020abea66SRandall Stewart /**
3120abea66SRandall Stewart  * Author: Randall Stewart <rrs@netflix.com>
3220abea66SRandall Stewart  */
3320abea66SRandall Stewart 
3420abea66SRandall Stewart #include <sys/cdefs.h>
3520abea66SRandall Stewart #include "opt_inet.h"
3620abea66SRandall Stewart #include "opt_inet6.h"
3720abea66SRandall Stewart #include "opt_ipsec.h"
3820abea66SRandall Stewart #include "opt_ratelimit.h"
3920abea66SRandall Stewart #include <sys/param.h>
4020abea66SRandall Stewart #include <sys/kernel.h>
4120abea66SRandall Stewart #include <sys/malloc.h>
4220abea66SRandall Stewart #include <sys/mbuf.h>
4320abea66SRandall Stewart #include <sys/socket.h>
4420abea66SRandall Stewart #include <sys/socketvar.h>
4520abea66SRandall Stewart #include <sys/sysctl.h>
4620abea66SRandall Stewart #include <sys/eventhandler.h>
4720abea66SRandall Stewart #include <sys/mutex.h>
4820abea66SRandall Stewart #include <sys/ck.h>
49348404bcSRandall Stewart #include <net/if.h>
50348404bcSRandall Stewart #include <net/if_var.h>
513d0d5b21SJustin Hibbits #include <net/if_private.h>
5220abea66SRandall Stewart #include <netinet/in.h>
5320abea66SRandall Stewart #include <netinet/in_pcb.h>
54348404bcSRandall Stewart #define TCPSTATES		/* for logging */
5520abea66SRandall Stewart #include <netinet/tcp_var.h>
561a714ff2SRandall Stewart #include <netinet/tcp_hpts.h>
571a714ff2SRandall Stewart #include <netinet/tcp_log_buf.h>
5820abea66SRandall Stewart #include <netinet/tcp_ratelimit.h>
5920abea66SRandall Stewart #ifndef USECS_IN_SECOND
6020abea66SRandall Stewart #define USECS_IN_SECOND 1000000
6120abea66SRandall Stewart #endif
6220abea66SRandall Stewart /*
6320abea66SRandall Stewart  * For the purposes of each send, what is the size
6420abea66SRandall Stewart  * of an ethernet frame.
6520abea66SRandall Stewart  */
6620abea66SRandall Stewart MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
6720abea66SRandall Stewart #ifdef RATELIMIT
6820abea66SRandall Stewart 
69d7313dc6SRandall Stewart /*
70d7313dc6SRandall Stewart  * The following preferred table will seem weird to
71d7313dc6SRandall Stewart  * the casual viewer. Why do we not have any rates below
72d7313dc6SRandall Stewart  * 1Mbps? Why do we have a rate at 1.44Mbps called common?
73d7313dc6SRandall Stewart  * Why do the rates cluster in the 1-100Mbps range more
 * than others? Why does the table jump around at the beginning
 * and then rise more consistently?
76d7313dc6SRandall Stewart  *
 * Let me try to answer those questions. A lot of
 * this is dependent on the hardware. We have three basic
 * supporters of rate limiting:
80d7313dc6SRandall Stewart  *
81d7313dc6SRandall Stewart  * Chelsio - Supporting 16 configurable rates.
82d7313dc6SRandall Stewart  * Mlx  - c4 supporting 13 fixed rates.
83d7313dc6SRandall Stewart  * Mlx  - c5 & c6 supporting 127 configurable rates.
84d7313dc6SRandall Stewart  *
85d7313dc6SRandall Stewart  * The c4 is why we have a common rate that is available
86d7313dc6SRandall Stewart  * in all rate tables. This is a selected rate from the
87d7313dc6SRandall Stewart  * c4 table and we assure its available in all ratelimit
88d7313dc6SRandall Stewart  * tables. This way the tcp_ratelimit code has an assured
89d7313dc6SRandall Stewart  * rate it should always be able to get. This answers a
90d7313dc6SRandall Stewart  * couple of the questions above.
91d7313dc6SRandall Stewart  *
 * So what about the rest? Well, the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system.  The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons.
98d7313dc6SRandall Stewart  * a) So we can discover more b/w
99d7313dc6SRandall Stewart  * and
100d7313dc6SRandall Stewart  * b) So we can send a block of MSS's down and then
101d7313dc6SRandall Stewart  *    have the software timer go off after the previous
102d7313dc6SRandall Stewart  *    send is completely out of the hardware.
103d7313dc6SRandall Stewart  *
104d7313dc6SRandall Stewart  * But when we do <b> we don't want to have the delay
105d7313dc6SRandall Stewart  * between the last packet sent by the hardware be
106d7313dc6SRandall Stewart  * excessively long (to reach our desired rate).
107d7313dc6SRandall Stewart  *
108d7313dc6SRandall Stewart  * So let me give an example for clarity.
109d7313dc6SRandall Stewart  *
110d7313dc6SRandall Stewart  * Lets assume that the tcp stack sees that 29,110,000 bps is
111d7313dc6SRandall Stewart  * what the bw of the path is. The stack would select the
112d7313dc6SRandall Stewart  * rate 31Mbps. 31Mbps means that each send that is done
113d7313dc6SRandall Stewart  * by the hardware will cause a 390 micro-second gap between
114d7313dc6SRandall Stewart  * the packets sent at that rate. For 29,110,000 bps we
115d7313dc6SRandall Stewart  * would need 416 micro-seconds gap between each send.
116d7313dc6SRandall Stewart  *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
120d7313dc6SRandall Stewart  * My testing has shown that both cards are also using this
121d7313dc6SRandall Stewart  * as their basis i.e. full payload size of the ethernet frame.
122d7313dc6SRandall Stewart  * The TCP stack caller needs to be aware of this and make the
123d7313dc6SRandall Stewart  * appropriate overhead calculations be included in its choices.
124d7313dc6SRandall Stewart  *
125d7313dc6SRandall Stewart  * Now, continuing our example, we pick a MSS size based on the
126d7313dc6SRandall Stewart  * delta between the two rates (416 - 390) divided into the rate
127d7313dc6SRandall Stewart  * we really wish to send at rounded up.  That results in a MSS
128d7313dc6SRandall Stewart  * send of 17 mss's at once. The hardware then will
129d7313dc6SRandall Stewart  * run out of data in a single 17MSS send in 6,630 micro-seconds.
130d7313dc6SRandall Stewart  *
131d7313dc6SRandall Stewart  * On the other hand the software pacer will send more data
132d7313dc6SRandall Stewart  * in 7,072 micro-seconds. This means that we will refill
 * the hardware 52 microseconds after it would have sent
 * next if it had not run out of data. This is a win since we are
135d7313dc6SRandall Stewart  * only sending every 7ms or so and yet all the packets are spaced on
136d7313dc6SRandall Stewart  * the wire with 94% of what they should be and only
137d7313dc6SRandall Stewart  * the last packet is delayed extra to make up for the
138d7313dc6SRandall Stewart  * difference.
139d7313dc6SRandall Stewart  *
 * Note that the above formula has two important caveats.
 * First, if we are (b/w wise) over 100Mbps we double the result
 * of the MSS calculation. Second, if we are at 500Mbps
 * or more we just send the maximum MSS at once i.e. 45MSS. At
 * the higher b/w's even the cards have limits to what times (timer granularity)
 * they can insert between packets and start to send more than one
 * packet at a time on the wire.
147d7313dc6SRandall Stewart  *
148d7313dc6SRandall Stewart  */
/*
 * COMMON_RATE is the value of the "common rate" entry (rate 2) in
 * the table below.
 */
#define COMMON_RATE 180500
/*
 * Preferred hardware pacing rates.  The values are in bytes per
 * second (e.g. 1250000 is the 10Mbps entry).  The table is made of
 * two groups that are each in ascending order: rates 1-16 and
 * rates 17-75.  The "rate N" labels give the 1-based position of
 * the entry in the table.
 *
 * NOTE(review): 12500000 (100Mbps) appears in both groups (rate 13
 * and rate 27) - confirm the duplicate is intended.
 */
const uint64_t desired_rates[] = {
	122500,			/* 1Mbps    - rate 1 */
	180500,			/* 1.44Mbps - rate 2  common rate */
	375000,			/* 3Mbps    - rate 3 */
	625000,			/* 5Mbps    - rate 4 */
	1250000,		/* 10Mbps   - rate 5 */
	1875000,		/* 15Mbps   - rate 6 */
	2500000,		/* 20Mbps   - rate 7 */
	3125000,		/* 25Mbps   - rate 8 */
	3750000,		/* 30Mbps   - rate 9 */
	4375000,		/* 35Mbps   - rate 10 */
	5000000,		/* 40Mbps   - rate 11 */
	6250000,		/* 50Mbps   - rate 12 */
	12500000,		/* 100Mbps  - rate 13 */
	25000000,		/* 200Mbps  - rate 14 */
	50000000,		/* 400Mbps  - rate 15 */
	100000000,		/* 800Mbps  - rate 16 */
	5625000,		/* 45Mbps   - rate 17 */
	6875000,		/* 55Mbps   - rate 18 */
	7500000,		/* 60Mbps   - rate 19 */
	8125000,		/* 65Mbps   - rate 20 */
	8750000,		/* 70Mbps   - rate 21 */
	9375000,		/* 75Mbps   - rate 22 */
	10000000,		/* 80Mbps   - rate 23 */
	10625000,		/* 85Mbps   - rate 24 */
	11250000,		/* 90Mbps   - rate 25 */
	11875000,		/* 95Mbps   - rate 26 */
	12500000,		/* 100Mbps  - rate 27 */
	13750000,		/* 110Mbps  - rate 28 */
	15000000,		/* 120Mbps  - rate 29 */
	16250000,		/* 130Mbps  - rate 30 */
	17500000,		/* 140Mbps  - rate 31 */
	18750000,		/* 150Mbps  - rate 32 */
	20000000,		/* 160Mbps  - rate 33 */
	21250000,		/* 170Mbps  - rate 34 */
	22500000,		/* 180Mbps  - rate 35 */
	23750000,		/* 190Mbps  - rate 36 */
	26250000,		/* 210Mbps  - rate 37 */
	27500000,		/* 220Mbps  - rate 38 */
	28750000,		/* 230Mbps  - rate 39 */
	30000000,		/* 240Mbps  - rate 40 */
	31250000,		/* 250Mbps  - rate 41 */
	34375000,		/* 275Mbps  - rate 42 */
	37500000,		/* 300Mbps  - rate 43 */
	40625000,		/* 325Mbps  - rate 44 */
	43750000,		/* 350Mbps  - rate 45 */
	46875000,		/* 375Mbps  - rate 46 */
	53125000,		/* 425Mbps  - rate 47 */
	56250000,		/* 450Mbps  - rate 48 */
	59375000,		/* 475Mbps  - rate 49 */
	62500000,		/* 500Mbps  - rate 50 */
	68750000,		/* 550Mbps  - rate 51 */
	75000000,		/* 600Mbps  - rate 52 */
	81250000,		/* 650Mbps  - rate 53 */
	87500000,		/* 700Mbps  - rate 54 */
	93750000,		/* 750Mbps  - rate 55 */
	106250000,		/* 850Mbps  - rate 56 */
	112500000,		/* 900Mbps  - rate 57 */
	125000000,		/* 1Gbps    - rate 58 */
	156250000,		/* 1.25Gbps - rate 59 */
	187500000,		/* 1.5Gbps  - rate 60 */
	218750000,		/* 1.75Gbps - rate 61 */
	250000000,		/* 2Gbps    - rate 62 */
	281250000,		/* 2.25Gbps - rate 63 */
	312500000,		/* 2.5Gbps  - rate 64 */
	343750000,		/* 2.75Gbps - rate 65 */
	375000000,		/* 3Gbps    - rate 66 */
	500000000,		/* 4Gbps    - rate 67 */
	625000000,		/* 5Gbps    - rate 68 */
	750000000,		/* 6Gbps    - rate 69 */
	875000000,		/* 7Gbps    - rate 70 */
	1000000000,		/* 8Gbps    - rate 71 */
	1125000000,		/* 9Gbps    - rate 72 */
	1250000000,		/* 10Gbps   - rate 73 */
	1875000000,		/* 15Gbps   - rate 74 */
	2500000000		/* 20Gbps   - rate 75 */
};
227d7313dc6SRandall Stewart 
/* Number of entries in desired_rates[]. */
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number of entries that are in
				 * order at the beginning of the
				 * table; beyond this a sort is
				 * required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The index in our table where the
				 * second ordered group starts
				 * (populate_canned_table() begins
				 * reading the second group here).
				 */
#define ALL_HARDWARE_RATES 1004 /*
				 * 1Meg - 1Gig in 1 Meg steps
				 * plus 100k, 200k and 500k and
				 * 10Gig
				 */

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000
24820abea66SRandall Stewart 
/* NOTE(review): presumably the list head of all known rate sets - confirm. */
static struct head_tcp_rate_set int_rs;
/*
 * Held by rs_destroy() while it updates rs_flags and rs_number_dead,
 * and asserted owned by rs_defer_destroy().
 */
static struct mtx rs_mtx;
/* Exported as net.inet.tcp.rl.alive. */
uint32_t rs_number_alive;
/*
 * Exported as net.inet.tcp.rl.dead; incremented when a destroy is
 * deferred and decremented when the deferred destroy runs.
 */
uint32_t rs_number_dead;
/* Tunables exported under net.inet.tcp.rl (see the sysctl descriptions). */
static uint32_t rs_floor_mss = 0;
static uint32_t wait_time_floor = 8000;	/* 8 ms */
static uint32_t rs_hw_floor_mss = 16;
static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */

static uint32_t mss_divisor = RL_DEFAULT_DIVISOR;
static uint32_t even_num_segs = 1;
static uint32_t even_threshold = 4;
26126bdd35cSRandall Stewart 
2627029da5cSPawel Biernacki SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
26320abea66SRandall Stewart     "TCP Ratelimit stats");
26420abea66SRandall Stewart SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
26520abea66SRandall Stewart     &rs_number_alive, 0,
26620abea66SRandall Stewart     "Number of interfaces initialized for ratelimiting");
26720abea66SRandall Stewart SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
26820abea66SRandall Stewart     &rs_number_dead, 0,
26920abea66SRandall Stewart     "Number of interfaces departing from ratelimiting");
2701a714ff2SRandall Stewart SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
2711a714ff2SRandall Stewart     &rs_floor_mss, 0,
2721a714ff2SRandall Stewart     "Number of MSS that will override the normal minimums (0 means don't enforce)");
2731a714ff2SRandall Stewart SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
2741a714ff2SRandall Stewart     &wait_time_floor, 2000,
2751a714ff2SRandall Stewart     "Has b/w increases what is the wait floor we are willing to wait at the end?");
2761a714ff2SRandall Stewart SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
2771a714ff2SRandall Stewart     &num_of_waits_allowed, 1,
2781a714ff2SRandall Stewart     "How many time blocks on the end should software pacing be willing to wait?");
2791a714ff2SRandall Stewart 
2801a714ff2SRandall Stewart SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
2811a714ff2SRandall Stewart     &rs_hw_floor_mss, 16,
2821a714ff2SRandall Stewart     "Number of mss that are a minum for hardware pacing?");
2831a714ff2SRandall Stewart 
28426bdd35cSRandall Stewart SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, divisor, CTLFLAG_RW,
28526bdd35cSRandall Stewart     &mss_divisor, RL_DEFAULT_DIVISOR,
28626bdd35cSRandall Stewart     "The value divided into bytes per second to help establish mss size");
28726bdd35cSRandall Stewart SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, even, CTLFLAG_RW,
28826bdd35cSRandall Stewart     &even_num_segs, 1,
28926bdd35cSRandall Stewart     "Do we round mss size up to an even number of segments for delayed ack");
29026bdd35cSRandall Stewart SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, eventhresh, CTLFLAG_RW,
29126bdd35cSRandall Stewart     &even_threshold, 4,
29226bdd35cSRandall Stewart     "At what number of mss do we start rounding up to an even number of mss?");
29320abea66SRandall Stewart 
29420abea66SRandall Stewart static void
rl_add_syctl_entries(struct sysctl_oid * rl_sysctl_root,struct tcp_rate_set * rs)29520abea66SRandall Stewart rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
29620abea66SRandall Stewart {
29720abea66SRandall Stewart 	/*
29820abea66SRandall Stewart 	 * Add sysctl entries for thus interface.
29920abea66SRandall Stewart 	 */
30020abea66SRandall Stewart 	if (rs->rs_flags & RS_INTF_NO_SUP) {
30120abea66SRandall Stewart 		SYSCTL_ADD_S32(&rs->sysctl_ctx,
30220abea66SRandall Stewart 		   SYSCTL_CHILDREN(rl_sysctl_root),
30320abea66SRandall Stewart 		   OID_AUTO, "disable", CTLFLAG_RD,
30420abea66SRandall Stewart 		   &rs->rs_disable, 0,
30520abea66SRandall Stewart 		   "Disable this interface from new hdwr limiting?");
30620abea66SRandall Stewart 	} else {
30720abea66SRandall Stewart 		SYSCTL_ADD_S32(&rs->sysctl_ctx,
30820abea66SRandall Stewart 		   SYSCTL_CHILDREN(rl_sysctl_root),
30920abea66SRandall Stewart 		   OID_AUTO, "disable", CTLFLAG_RW,
31020abea66SRandall Stewart 		   &rs->rs_disable, 0,
31120abea66SRandall Stewart 		   "Disable this interface from new hdwr limiting?");
31220abea66SRandall Stewart 	}
31320abea66SRandall Stewart 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
31420abea66SRandall Stewart 	    SYSCTL_CHILDREN(rl_sysctl_root),
31520abea66SRandall Stewart 	    OID_AUTO, "minseg", CTLFLAG_RW,
31620abea66SRandall Stewart 	    &rs->rs_min_seg, 0,
31720abea66SRandall Stewart 	    "What is the minimum we need to send on this interface?");
31820abea66SRandall Stewart 	SYSCTL_ADD_U64(&rs->sysctl_ctx,
31920abea66SRandall Stewart 	    SYSCTL_CHILDREN(rl_sysctl_root),
32020abea66SRandall Stewart 	    OID_AUTO, "flow_limit", CTLFLAG_RW,
32120abea66SRandall Stewart 	    &rs->rs_flow_limit, 0,
32220abea66SRandall Stewart 	    "What is the limit for number of flows (0=unlimited)?");
32320abea66SRandall Stewart 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
32420abea66SRandall Stewart 	    SYSCTL_CHILDREN(rl_sysctl_root),
32520abea66SRandall Stewart 	    OID_AUTO, "highest", CTLFLAG_RD,
32620abea66SRandall Stewart 	    &rs->rs_highest_valid, 0,
32720abea66SRandall Stewart 	    "Highest valid rate");
32820abea66SRandall Stewart 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
32920abea66SRandall Stewart 	    SYSCTL_CHILDREN(rl_sysctl_root),
33020abea66SRandall Stewart 	    OID_AUTO, "lowest", CTLFLAG_RD,
33120abea66SRandall Stewart 	    &rs->rs_lowest_valid, 0,
33220abea66SRandall Stewart 	    "Lowest valid rate");
33320abea66SRandall Stewart 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
33420abea66SRandall Stewart 	    SYSCTL_CHILDREN(rl_sysctl_root),
33520abea66SRandall Stewart 	    OID_AUTO, "flags", CTLFLAG_RD,
33620abea66SRandall Stewart 	    &rs->rs_flags, 0,
33720abea66SRandall Stewart 	    "What lags are on the entry?");
33820abea66SRandall Stewart 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
33920abea66SRandall Stewart 	    SYSCTL_CHILDREN(rl_sysctl_root),
34020abea66SRandall Stewart 	    OID_AUTO, "numrates", CTLFLAG_RD,
34120abea66SRandall Stewart 	    &rs->rs_rate_cnt, 0,
34220abea66SRandall Stewart 	    "How many rates re there?");
34320abea66SRandall Stewart 	SYSCTL_ADD_U64(&rs->sysctl_ctx,
34420abea66SRandall Stewart 	    SYSCTL_CHILDREN(rl_sysctl_root),
34520abea66SRandall Stewart 	    OID_AUTO, "flows_using", CTLFLAG_RD,
34620abea66SRandall Stewart 	    &rs->rs_flows_using, 0,
34720abea66SRandall Stewart 	    "How many flows are using this interface now?");
34820abea66SRandall Stewart #ifdef DETAILED_RATELIMIT_SYSCTL
34920abea66SRandall Stewart 	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
35020abea66SRandall Stewart 		/*  Lets display the rates */
35120abea66SRandall Stewart 		int i;
35220abea66SRandall Stewart 		struct sysctl_oid *rl_rates;
35320abea66SRandall Stewart 		struct sysctl_oid *rl_rate_num;
35420abea66SRandall Stewart 		char rate_num[16];
35520abea66SRandall Stewart 		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
35620abea66SRandall Stewart 					    SYSCTL_CHILDREN(rl_sysctl_root),
35720abea66SRandall Stewart 					    OID_AUTO,
35820abea66SRandall Stewart 					    "rate",
3597029da5cSPawel Biernacki 					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
36020abea66SRandall Stewart 					    "Ratelist");
36120abea66SRandall Stewart 		for( i = 0; i < rs->rs_rate_cnt; i++) {
36220abea66SRandall Stewart 			sprintf(rate_num, "%d", i);
36320abea66SRandall Stewart 			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
36420abea66SRandall Stewart 					    SYSCTL_CHILDREN(rl_rates),
36520abea66SRandall Stewart 					    OID_AUTO,
36620abea66SRandall Stewart 					    rate_num,
3677029da5cSPawel Biernacki 					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
36820abea66SRandall Stewart 					    "Individual Rate");
36920abea66SRandall Stewart 			SYSCTL_ADD_U32(&rs->sysctl_ctx,
37020abea66SRandall Stewart 				       SYSCTL_CHILDREN(rl_rate_num),
37120abea66SRandall Stewart 				       OID_AUTO, "flags", CTLFLAG_RD,
37220abea66SRandall Stewart 				       &rs->rs_rlt[i].flags, 0,
37320abea66SRandall Stewart 				       "Flags on this rate");
37420abea66SRandall Stewart 			SYSCTL_ADD_U32(&rs->sysctl_ctx,
37520abea66SRandall Stewart 				       SYSCTL_CHILDREN(rl_rate_num),
37620abea66SRandall Stewart 				       OID_AUTO, "pacetime", CTLFLAG_RD,
37720abea66SRandall Stewart 				       &rs->rs_rlt[i].time_between, 0,
37820abea66SRandall Stewart 				       "Time hardware inserts between 1500 byte sends");
3795d8fd932SRandall Stewart 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
38020abea66SRandall Stewart 				       SYSCTL_CHILDREN(rl_rate_num),
38120abea66SRandall Stewart 				       OID_AUTO, "rate", CTLFLAG_RD,
3825d8fd932SRandall Stewart 				       &rs->rs_rlt[i].rate,
38320abea66SRandall Stewart 				       "Rate in bytes per second");
3845d8fd932SRandall Stewart 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
3855d8fd932SRandall Stewart 				       SYSCTL_CHILDREN(rl_rate_num),
3865d8fd932SRandall Stewart 				       OID_AUTO, "using", CTLFLAG_RD,
3875d8fd932SRandall Stewart 				       &rs->rs_rlt[i].using,
3885d8fd932SRandall Stewart 				       "Number of flows using");
3895d8fd932SRandall Stewart 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
3905d8fd932SRandall Stewart 				       SYSCTL_CHILDREN(rl_rate_num),
3915d8fd932SRandall Stewart 				       OID_AUTO, "enobufs", CTLFLAG_RD,
3925d8fd932SRandall Stewart 				       &rs->rs_rlt[i].rs_num_enobufs,
3935d8fd932SRandall Stewart 				       "Number of enobufs logged on this rate");
3945d8fd932SRandall Stewart 
39520abea66SRandall Stewart 		}
39620abea66SRandall Stewart 	}
39720abea66SRandall Stewart #endif
39820abea66SRandall Stewart }
39920abea66SRandall Stewart 
40020abea66SRandall Stewart static void
rs_destroy(epoch_context_t ctx)40120abea66SRandall Stewart rs_destroy(epoch_context_t ctx)
40220abea66SRandall Stewart {
40320abea66SRandall Stewart 	struct tcp_rate_set *rs;
40424be1353SHans Petter Selasky 	bool do_free_rs;
40520abea66SRandall Stewart 
40620abea66SRandall Stewart 	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
40724be1353SHans Petter Selasky 
40820abea66SRandall Stewart 	mtx_lock(&rs_mtx);
40920abea66SRandall Stewart 	rs->rs_flags &= ~RS_FUNERAL_SCHD;
41020abea66SRandall Stewart 	/*
41120abea66SRandall Stewart 	 * In theory its possible (but unlikely)
41220abea66SRandall Stewart 	 * that while the delete was occuring
41320abea66SRandall Stewart 	 * and we were applying the DEAD flag
41420abea66SRandall Stewart 	 * someone slipped in and found the
41520abea66SRandall Stewart 	 * interface in a lookup. While we
41620abea66SRandall Stewart 	 * decided rs_flows_using were 0 and
41720abea66SRandall Stewart 	 * scheduling the epoch_call, the other
41820abea66SRandall Stewart 	 * thread incremented rs_flow_using. This
41920abea66SRandall Stewart 	 * is because users have a pointer and
42020abea66SRandall Stewart 	 * we only use the rs_flows_using in an
42120abea66SRandall Stewart 	 * atomic fashion, i.e. the other entities
42220abea66SRandall Stewart 	 * are not protected. To assure this did
42320abea66SRandall Stewart 	 * not occur, we check rs_flows_using here
42424be1353SHans Petter Selasky 	 * before deleting.
42520abea66SRandall Stewart 	 */
42624be1353SHans Petter Selasky 	do_free_rs = (rs->rs_flows_using == 0);
42724be1353SHans Petter Selasky 	rs_number_dead--;
42824be1353SHans Petter Selasky 	mtx_unlock(&rs_mtx);
42924be1353SHans Petter Selasky 
43024be1353SHans Petter Selasky 	if (do_free_rs) {
43120abea66SRandall Stewart 		sysctl_ctx_free(&rs->sysctl_ctx);
43220abea66SRandall Stewart 		free(rs->rs_rlt, M_TCPPACE);
43320abea66SRandall Stewart 		free(rs, M_TCPPACE);
43420abea66SRandall Stewart 	}
43520abea66SRandall Stewart }
43620abea66SRandall Stewart 
437eabddb25SHans Petter Selasky static void
rs_defer_destroy(struct tcp_rate_set * rs)438eabddb25SHans Petter Selasky rs_defer_destroy(struct tcp_rate_set *rs)
439eabddb25SHans Petter Selasky {
440eabddb25SHans Petter Selasky 
441eabddb25SHans Petter Selasky 	mtx_assert(&rs_mtx, MA_OWNED);
442eabddb25SHans Petter Selasky 
443eabddb25SHans Petter Selasky 	/* Check if already pending. */
444eabddb25SHans Petter Selasky 	if (rs->rs_flags & RS_FUNERAL_SCHD)
445eabddb25SHans Petter Selasky 		return;
446eabddb25SHans Petter Selasky 
447eabddb25SHans Petter Selasky 	rs_number_dead++;
448eabddb25SHans Petter Selasky 
449eabddb25SHans Petter Selasky 	/* Set flag to only defer once. */
450eabddb25SHans Petter Selasky 	rs->rs_flags |= RS_FUNERAL_SCHD;
451348404bcSRandall Stewart 	NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
452eabddb25SHans Petter Selasky }
453eabddb25SHans Petter Selasky 
#ifdef INET
/*
 * Rate-limit statistics counters defined outside this file.
 * rl_attach_txrtlmt() bumps rate_limit_set_ok, rate_limit_active
 * and rate_limit_alloc_fail.
 */
extern counter_u64_t rate_limit_new;
extern counter_u64_t rate_limit_chg;
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif
46120abea66SRandall Stewart 
46220abea66SRandall Stewart static int
rl_attach_txrtlmt(struct ifnet * ifp,uint32_t flowtype,int flowid,uint64_t cfg_rate,struct m_snd_tag ** tag)46320abea66SRandall Stewart rl_attach_txrtlmt(struct ifnet *ifp,
46420abea66SRandall Stewart     uint32_t flowtype,
46520abea66SRandall Stewart     int flowid,
46620abea66SRandall Stewart     uint64_t cfg_rate,
46720abea66SRandall Stewart     struct m_snd_tag **tag)
46820abea66SRandall Stewart {
46920abea66SRandall Stewart 	int error;
47020abea66SRandall Stewart 	union if_snd_tag_alloc_params params = {
47120abea66SRandall Stewart 		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
47220abea66SRandall Stewart 		.rate_limit.hdr.flowid = flowid,
47320abea66SRandall Stewart 		.rate_limit.hdr.flowtype = flowtype,
47420abea66SRandall Stewart 		.rate_limit.max_rate = cfg_rate,
47520abea66SRandall Stewart 		.rate_limit.flags = M_NOWAIT,
47620abea66SRandall Stewart 	};
47720abea66SRandall Stewart 
47836e0a362SJohn Baldwin 	error = m_snd_tag_alloc(ifp, &params, tag);
479903c4ee6SXin LI #ifdef INET
48020abea66SRandall Stewart 	if (error == 0) {
48120abea66SRandall Stewart 		counter_u64_add(rate_limit_set_ok, 1);
48220abea66SRandall Stewart 		counter_u64_add(rate_limit_active, 1);
48336e0a362SJohn Baldwin 	} else if (error != EOPNOTSUPP)
48420abea66SRandall Stewart 		counter_u64_add(rate_limit_alloc_fail, 1);
485903c4ee6SXin LI #endif
48620abea66SRandall Stewart 	return (error);
48720abea66SRandall Stewart }
48820abea66SRandall Stewart 
48920abea66SRandall Stewart static void
populate_canned_table(struct tcp_rate_set * rs,const uint64_t * rate_table_act)49020abea66SRandall Stewart populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
49120abea66SRandall Stewart {
49220abea66SRandall Stewart 	/*
49320abea66SRandall Stewart 	 * The internal table is "special", it
49420abea66SRandall Stewart 	 * is two seperate ordered tables that
49520abea66SRandall Stewart 	 * must be merged. We get here when the
49620abea66SRandall Stewart 	 * adapter specifies a number of rates that
49720abea66SRandall Stewart 	 * covers both ranges in the table in some
49820abea66SRandall Stewart 	 * form.
49920abea66SRandall Stewart 	 */
50020abea66SRandall Stewart 	int i, at_low, at_high;
50120abea66SRandall Stewart 	uint8_t low_disabled = 0, high_disabled = 0;
50220abea66SRandall Stewart 
50320abea66SRandall Stewart 	for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
50420abea66SRandall Stewart 		rs->rs_rlt[i].flags = 0;
50520abea66SRandall Stewart 		rs->rs_rlt[i].time_between = 0;
50620abea66SRandall Stewart 		if ((low_disabled == 0) &&
50720abea66SRandall Stewart 		    (high_disabled ||
50820abea66SRandall Stewart 		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
50920abea66SRandall Stewart 			rs->rs_rlt[i].rate = rate_table_act[at_low];
51020abea66SRandall Stewart 			at_low++;
51120abea66SRandall Stewart 			if (at_low == RS_NEXT_ORDER_GROUP)
51220abea66SRandall Stewart 				low_disabled = 1;
51320abea66SRandall Stewart 		} else if (high_disabled == 0) {
51420abea66SRandall Stewart 			rs->rs_rlt[i].rate = rate_table_act[at_high];
51520abea66SRandall Stewart 			at_high++;
51620abea66SRandall Stewart 			if (at_high == MAX_HDWR_RATES)
51720abea66SRandall Stewart 				high_disabled = 1;
51820abea66SRandall Stewart 		}
51920abea66SRandall Stewart 	}
52020abea66SRandall Stewart }
52120abea66SRandall Stewart 
52220abea66SRandall Stewart static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet * ifp,int * error)52320abea66SRandall Stewart rt_setup_new_rs(struct ifnet *ifp, int *error)
52420abea66SRandall Stewart {
52520abea66SRandall Stewart 	struct tcp_rate_set *rs;
52620abea66SRandall Stewart 	const uint64_t *rate_table_act;
52720abea66SRandall Stewart 	uint64_t lentim, res;
52820abea66SRandall Stewart 	size_t sz;
52920abea66SRandall Stewart 	uint32_t hash_type;
53020abea66SRandall Stewart 	int i;
53120abea66SRandall Stewart 	struct if_ratelimit_query_results rl;
53220abea66SRandall Stewart 	struct sysctl_oid *rl_sysctl_root;
5331a714ff2SRandall Stewart 	struct epoch_tracker et;
53420abea66SRandall Stewart 	/*
53520abea66SRandall Stewart 	 * We expect to enter with the
53620abea66SRandall Stewart 	 * mutex locked.
53720abea66SRandall Stewart 	 */
53820abea66SRandall Stewart 
53920abea66SRandall Stewart 	if (ifp->if_ratelimit_query == NULL) {
54020abea66SRandall Stewart 		/*
54120abea66SRandall Stewart 		 * We can do nothing if we cannot
54220abea66SRandall Stewart 		 * get a query back from the driver.
54320abea66SRandall Stewart 		 */
544d7313dc6SRandall Stewart 		printf("Warning:No query functions for %s:%d-- failed\n",
545d7313dc6SRandall Stewart 		       ifp->if_dname, ifp->if_dunit);
54620abea66SRandall Stewart 		return (NULL);
54720abea66SRandall Stewart 	}
54820abea66SRandall Stewart 	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
54920abea66SRandall Stewart 	if (rs == NULL) {
55020abea66SRandall Stewart 		if (error)
55120abea66SRandall Stewart 			*error = ENOMEM;
552d7313dc6SRandall Stewart 		printf("Warning:No memory for malloc of tcp_rate_set\n");
55320abea66SRandall Stewart 		return (NULL);
55420abea66SRandall Stewart 	}
555d7313dc6SRandall Stewart 	memset(&rl, 0, sizeof(rl));
55620abea66SRandall Stewart 	rl.flags = RT_NOSUPPORT;
55720abea66SRandall Stewart 	ifp->if_ratelimit_query(ifp, &rl);
55820abea66SRandall Stewart 	if (rl.flags & RT_IS_UNUSABLE) {
55920abea66SRandall Stewart 		/*
56020abea66SRandall Stewart 		 * The interface does not really support
56120abea66SRandall Stewart 		 * the rate-limiting.
56220abea66SRandall Stewart 		 */
56320abea66SRandall Stewart 		memset(rs, 0, sizeof(struct tcp_rate_set));
56420abea66SRandall Stewart 		rs->rs_ifp = ifp;
56520abea66SRandall Stewart 		rs->rs_if_dunit = ifp->if_dunit;
56620abea66SRandall Stewart 		rs->rs_flags = RS_INTF_NO_SUP;
56720abea66SRandall Stewart 		rs->rs_disable = 1;
56820abea66SRandall Stewart 		rs_number_alive++;
56920abea66SRandall Stewart 		sysctl_ctx_init(&rs->sysctl_ctx);
57020abea66SRandall Stewart 		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
57120abea66SRandall Stewart 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
57220abea66SRandall Stewart 		    OID_AUTO,
57320abea66SRandall Stewart 		    rs->rs_ifp->if_xname,
5747029da5cSPawel Biernacki 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
57520abea66SRandall Stewart 		    "");
57620abea66SRandall Stewart 		rl_add_syctl_entries(rl_sysctl_root, rs);
5771a714ff2SRandall Stewart 		NET_EPOCH_ENTER(et);
57820abea66SRandall Stewart 		mtx_lock(&rs_mtx);
57915ddc5e4SMichael Tuexen 		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
58015ddc5e4SMichael Tuexen 		mtx_unlock(&rs_mtx);
5811a714ff2SRandall Stewart 		NET_EPOCH_EXIT(et);
58220abea66SRandall Stewart 		return (rs);
58320abea66SRandall Stewart 	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
58420abea66SRandall Stewart 		memset(rs, 0, sizeof(struct tcp_rate_set));
58520abea66SRandall Stewart 		rs->rs_ifp = ifp;
58620abea66SRandall Stewart 		rs->rs_if_dunit = ifp->if_dunit;
58720abea66SRandall Stewart 		rs->rs_flags = RS_IS_DEFF;
58820abea66SRandall Stewart 		rs_number_alive++;
58920abea66SRandall Stewart 		sysctl_ctx_init(&rs->sysctl_ctx);
59020abea66SRandall Stewart 		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
59120abea66SRandall Stewart 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
59220abea66SRandall Stewart 		    OID_AUTO,
59320abea66SRandall Stewart 		    rs->rs_ifp->if_xname,
5947029da5cSPawel Biernacki 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
59520abea66SRandall Stewart 		    "");
59620abea66SRandall Stewart 		rl_add_syctl_entries(rl_sysctl_root, rs);
5971a714ff2SRandall Stewart 		NET_EPOCH_ENTER(et);
59820abea66SRandall Stewart 		mtx_lock(&rs_mtx);
59915ddc5e4SMichael Tuexen 		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
60015ddc5e4SMichael Tuexen 		mtx_unlock(&rs_mtx);
6011a714ff2SRandall Stewart 		NET_EPOCH_EXIT(et);
60220abea66SRandall Stewart 		return (rs);
60320abea66SRandall Stewart 	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
604d7313dc6SRandall Stewart 		/* Mellanox C4 likely */
60520abea66SRandall Stewart 		rs->rs_ifp = ifp;
60620abea66SRandall Stewart 		rs->rs_if_dunit = ifp->if_dunit;
60720abea66SRandall Stewart 		rs->rs_rate_cnt = rl.number_of_rates;
60820abea66SRandall Stewart 		rs->rs_min_seg = rl.min_segment_burst;
60920abea66SRandall Stewart 		rs->rs_highest_valid = 0;
61020abea66SRandall Stewart 		rs->rs_flow_limit = rl.max_flows;
61120abea66SRandall Stewart 		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
61220abea66SRandall Stewart 		rs->rs_disable = 0;
61320abea66SRandall Stewart 		rate_table_act = rl.rate_table;
61420abea66SRandall Stewart 	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
615d7313dc6SRandall Stewart 		/* Chelsio, C5 and C6 of Mellanox? */
61620abea66SRandall Stewart 		rs->rs_ifp = ifp;
61720abea66SRandall Stewart 		rs->rs_if_dunit = ifp->if_dunit;
61820abea66SRandall Stewart 		rs->rs_rate_cnt = rl.number_of_rates;
61920abea66SRandall Stewart 		rs->rs_min_seg = rl.min_segment_burst;
62020abea66SRandall Stewart 		rs->rs_disable = 0;
62120abea66SRandall Stewart 		rs->rs_flow_limit = rl.max_flows;
62220abea66SRandall Stewart 		rate_table_act = desired_rates;
62320abea66SRandall Stewart 		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
62420abea66SRandall Stewart 		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
62520abea66SRandall Stewart 			/*
62620abea66SRandall Stewart 			 * Our desired table is not big
62720abea66SRandall Stewart 			 * enough, do what we can.
62820abea66SRandall Stewart 			 */
62920abea66SRandall Stewart 			rs->rs_rate_cnt = MAX_HDWR_RATES;
63020abea66SRandall Stewart 		 }
63120abea66SRandall Stewart 		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
63220abea66SRandall Stewart 			rs->rs_flags = RS_IS_INTF;
63320abea66SRandall Stewart 		else
63420abea66SRandall Stewart 			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
63520abea66SRandall Stewart 		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
63620abea66SRandall Stewart 			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
63720abea66SRandall Stewart 	} else {
63820abea66SRandall Stewart 		free(rs, M_TCPPACE);
63920abea66SRandall Stewart 		return (NULL);
64020abea66SRandall Stewart 	}
64120abea66SRandall Stewart 	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
64220abea66SRandall Stewart 	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
64320abea66SRandall Stewart 	if (rs->rs_rlt == NULL) {
64420abea66SRandall Stewart 		if (error)
64520abea66SRandall Stewart 			*error = ENOMEM;
64620abea66SRandall Stewart bail:
64720abea66SRandall Stewart 		free(rs, M_TCPPACE);
64820abea66SRandall Stewart 		return (NULL);
64920abea66SRandall Stewart 	}
65020abea66SRandall Stewart 	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
65120abea66SRandall Stewart 		/*
65220abea66SRandall Stewart 		 * The interface supports all
65320abea66SRandall Stewart 		 * the rates we could possibly want.
65420abea66SRandall Stewart 		 */
65520abea66SRandall Stewart 		uint64_t rat;
65620abea66SRandall Stewart 
65720abea66SRandall Stewart 		rs->rs_rlt[0].rate = 12500;	/* 100k */
65820abea66SRandall Stewart 		rs->rs_rlt[1].rate = 25000;	/* 200k */
65920abea66SRandall Stewart 		rs->rs_rlt[2].rate = 62500;	/* 500k */
66020abea66SRandall Stewart 		/* Note 125000 == 1Megabit
66120abea66SRandall Stewart 		 * populate 1Meg - 1000meg.
66220abea66SRandall Stewart 		 */
66320abea66SRandall Stewart 		for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
66420abea66SRandall Stewart 			rs->rs_rlt[i].rate = rat;
66520abea66SRandall Stewart 			rat += 125000;
66620abea66SRandall Stewart 		}
66720abea66SRandall Stewart 		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
66820abea66SRandall Stewart 	} else if (rs->rs_flags & RS_INT_TBL) {
66920abea66SRandall Stewart 		/* We populate this in a special way */
67020abea66SRandall Stewart 		populate_canned_table(rs, rate_table_act);
67120abea66SRandall Stewart 	} else {
67220abea66SRandall Stewart 		/*
67320abea66SRandall Stewart 		 * Just copy in the rates from
67420abea66SRandall Stewart 		 * the table, it is in order.
67520abea66SRandall Stewart 		 */
67620abea66SRandall Stewart 		for (i=0; i<rs->rs_rate_cnt; i++) {
67720abea66SRandall Stewart 			rs->rs_rlt[i].rate = rate_table_act[i];
67820abea66SRandall Stewart 			rs->rs_rlt[i].time_between = 0;
67920abea66SRandall Stewart 			rs->rs_rlt[i].flags = 0;
68020abea66SRandall Stewart 		}
68120abea66SRandall Stewart 	}
68220abea66SRandall Stewart 	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
68320abea66SRandall Stewart 		/*
68420abea66SRandall Stewart 		 * We go backwards through the list so that if we can't get
68520abea66SRandall Stewart 		 * a rate and fail to init one, we have at least a chance of
68620abea66SRandall Stewart 		 * getting the highest one.
68720abea66SRandall Stewart 		 */
68820abea66SRandall Stewart 		rs->rs_rlt[i].ptbl = rs;
68920abea66SRandall Stewart 		rs->rs_rlt[i].tag = NULL;
6905d8fd932SRandall Stewart 		rs->rs_rlt[i].using = 0;
6915d8fd932SRandall Stewart 		rs->rs_rlt[i].rs_num_enobufs = 0;
69220abea66SRandall Stewart 		/*
69320abea66SRandall Stewart 		 * Calculate the time between.
69420abea66SRandall Stewart 		 */
69520abea66SRandall Stewart 		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
69620abea66SRandall Stewart 		res = lentim / rs->rs_rlt[i].rate;
69720abea66SRandall Stewart 		if (res > 0)
69820abea66SRandall Stewart 			rs->rs_rlt[i].time_between = res;
69920abea66SRandall Stewart 		else
70020abea66SRandall Stewart 			rs->rs_rlt[i].time_between = 1;
70120abea66SRandall Stewart 		if (rs->rs_flags & RS_NO_PRE) {
70220abea66SRandall Stewart 			rs->rs_rlt[i].flags = HDWRPACE_INITED;
70320abea66SRandall Stewart 			rs->rs_lowest_valid = i;
70420abea66SRandall Stewart 		} else {
70520abea66SRandall Stewart 			int err;
706d7313dc6SRandall Stewart 
707d7313dc6SRandall Stewart 			if ((rl.flags & RT_IS_SETUP_REQ)  &&
708d7313dc6SRandall Stewart 			    (ifp->if_ratelimit_query)) {
709d7313dc6SRandall Stewart 				err = ifp->if_ratelimit_setup(ifp,
710d7313dc6SRandall Stewart   				         rs->rs_rlt[i].rate, i);
711d7313dc6SRandall Stewart 				if (err)
712d7313dc6SRandall Stewart 					goto handle_err;
713d7313dc6SRandall Stewart 			}
71420abea66SRandall Stewart #ifdef RSS
71520abea66SRandall Stewart 			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
71620abea66SRandall Stewart #else
71720abea66SRandall Stewart 			hash_type = M_HASHTYPE_OPAQUE_HASH;
71820abea66SRandall Stewart #endif
71920abea66SRandall Stewart 			err = rl_attach_txrtlmt(ifp,
72020abea66SRandall Stewart 			    hash_type,
72120abea66SRandall Stewart 			    (i + 1),
72220abea66SRandall Stewart 			    rs->rs_rlt[i].rate,
72320abea66SRandall Stewart 			    &rs->rs_rlt[i].tag);
72420abea66SRandall Stewart 			if (err) {
725d7313dc6SRandall Stewart handle_err:
72620abea66SRandall Stewart 				if (i == (rs->rs_rate_cnt - 1)) {
72720abea66SRandall Stewart 					/*
72820abea66SRandall Stewart 					 * Huh - first rate and we can't get
72920abea66SRandall Stewart 					 * it?
73020abea66SRandall Stewart 					 */
73120abea66SRandall Stewart 					free(rs->rs_rlt, M_TCPPACE);
73220abea66SRandall Stewart 					if (error)
73320abea66SRandall Stewart 						*error = err;
73420abea66SRandall Stewart 					goto bail;
73520abea66SRandall Stewart 				} else {
73620abea66SRandall Stewart 					if (error)
73720abea66SRandall Stewart 						*error = err;
73820abea66SRandall Stewart 				}
73920abea66SRandall Stewart 				break;
74020abea66SRandall Stewart 			} else {
74120abea66SRandall Stewart 				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
74220abea66SRandall Stewart 				rs->rs_lowest_valid = i;
74320abea66SRandall Stewart 			}
74420abea66SRandall Stewart 		}
74520abea66SRandall Stewart 	}
74620abea66SRandall Stewart 	/* Did we get at least 1 rate? */
74720abea66SRandall Stewart 	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
74820abea66SRandall Stewart 		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
74920abea66SRandall Stewart 	else {
75020abea66SRandall Stewart 		free(rs->rs_rlt, M_TCPPACE);
75120abea66SRandall Stewart 		goto bail;
75220abea66SRandall Stewart 	}
75320abea66SRandall Stewart 	rs_number_alive++;
75420abea66SRandall Stewart 	sysctl_ctx_init(&rs->sysctl_ctx);
75520abea66SRandall Stewart 	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
75620abea66SRandall Stewart 	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
75720abea66SRandall Stewart 	    OID_AUTO,
75820abea66SRandall Stewart 	    rs->rs_ifp->if_xname,
7597029da5cSPawel Biernacki 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
76020abea66SRandall Stewart 	    "");
76120abea66SRandall Stewart 	rl_add_syctl_entries(rl_sysctl_root, rs);
7621a714ff2SRandall Stewart 	NET_EPOCH_ENTER(et);
76320abea66SRandall Stewart 	mtx_lock(&rs_mtx);
76415ddc5e4SMichael Tuexen 	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
76515ddc5e4SMichael Tuexen 	mtx_unlock(&rs_mtx);
7661a714ff2SRandall Stewart 	NET_EPOCH_EXIT(et);
76720abea66SRandall Stewart 	return (rs);
76820abea66SRandall Stewart }
76920abea66SRandall Stewart 
7701a714ff2SRandall Stewart /*
7711a714ff2SRandall Stewart  * For an explanation of why the argument is volatile please
7721a714ff2SRandall Stewart  * look at the comments around rt_setup_rate().
7731a714ff2SRandall Stewart  */
77420abea66SRandall Stewart static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const volatile struct tcp_rate_set * rs,uint64_t bytes_per_sec,uint32_t flags,uint64_t * lower_rate)7751a714ff2SRandall Stewart tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
7761a714ff2SRandall Stewart     uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
77720abea66SRandall Stewart {
77820abea66SRandall Stewart 	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
7791a714ff2SRandall Stewart 	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
78020abea66SRandall Stewart 	int i;
78120abea66SRandall Stewart 
78220abea66SRandall Stewart 	mbits_per_sec = (bytes_per_sec * 8);
78320abea66SRandall Stewart 	if (flags & RS_PACING_LT) {
78420abea66SRandall Stewart 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
78520abea66SRandall Stewart 		    (rs->rs_lowest_valid <= 2)){
78620abea66SRandall Stewart 			/*
78720abea66SRandall Stewart 			 * Smaller than 1Meg, only
78820abea66SRandall Stewart 			 * 3 entries can match it.
78920abea66SRandall Stewart 			 */
7901a714ff2SRandall Stewart 			previous_rate = 0;
79120abea66SRandall Stewart 			for(i = rs->rs_lowest_valid; i < 3; i++) {
79220abea66SRandall Stewart 				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
79320abea66SRandall Stewart 					rte = &rs->rs_rlt[i];
79420abea66SRandall Stewart 					break;
79520abea66SRandall Stewart 				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
79620abea66SRandall Stewart 					arte = &rs->rs_rlt[i];
79720abea66SRandall Stewart 				}
7981a714ff2SRandall Stewart 				previous_rate = rs->rs_rlt[i].rate;
79920abea66SRandall Stewart 			}
80020abea66SRandall Stewart 			goto done;
80120abea66SRandall Stewart 		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
80220abea66SRandall Stewart 			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
80320abea66SRandall Stewart 			/*
80420abea66SRandall Stewart 			 * Larger than 1G (the majority of
80520abea66SRandall Stewart 			 * our table.
80620abea66SRandall Stewart 			 */
80720abea66SRandall Stewart 			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
80820abea66SRandall Stewart 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
80920abea66SRandall Stewart 			else
81020abea66SRandall Stewart 				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
8111a714ff2SRandall Stewart 			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
81220abea66SRandall Stewart 			goto done;
81320abea66SRandall Stewart 		}
81420abea66SRandall Stewart 		/*
81520abea66SRandall Stewart 		 * If we reach here its in our table (between 1Meg - 1000Meg),
81620abea66SRandall Stewart 		 * just take the rounded down mbits per second, and add
81720abea66SRandall Stewart 		 * 1Megabit to it, from this we can calculate
81820abea66SRandall Stewart 		 * the index in the table.
81920abea66SRandall Stewart 		 */
82020abea66SRandall Stewart 		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
82120abea66SRandall Stewart 		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
82220abea66SRandall Stewart 			ind_calc++;
82320abea66SRandall Stewart 		/* our table is offset by 3, we add 2 */
82420abea66SRandall Stewart 		ind_calc += 2;
82520abea66SRandall Stewart 		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
82620abea66SRandall Stewart 			/* This should not happen */
82720abea66SRandall Stewart 			ind_calc = ALL_HARDWARE_RATES-1;
82820abea66SRandall Stewart 		}
82920abea66SRandall Stewart 		if ((ind_calc >= rs->rs_lowest_valid) &&
8301a714ff2SRandall Stewart 		    (ind_calc <= rs->rs_highest_valid)) {
83120abea66SRandall Stewart 			rte = &rs->rs_rlt[ind_calc];
8321a714ff2SRandall Stewart 			if (ind_calc >= 1)
8331a714ff2SRandall Stewart 				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
8341a714ff2SRandall Stewart 		}
83520abea66SRandall Stewart 	} else if (flags & RS_PACING_EXACT_MATCH) {
83620abea66SRandall Stewart 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
83720abea66SRandall Stewart 		    (rs->rs_lowest_valid <= 2)){
83820abea66SRandall Stewart 			for(i = rs->rs_lowest_valid; i < 3; i++) {
83920abea66SRandall Stewart 				if (bytes_per_sec == rs->rs_rlt[i].rate) {
84020abea66SRandall Stewart 					rte = &rs->rs_rlt[i];
84120abea66SRandall Stewart 					break;
84220abea66SRandall Stewart 				}
84320abea66SRandall Stewart 			}
84420abea66SRandall Stewart 		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
84520abea66SRandall Stewart 			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
84620abea66SRandall Stewart 			/* > 1Gbps only one rate */
84720abea66SRandall Stewart 			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
84820abea66SRandall Stewart 				/* Its 10G wow */
84920abea66SRandall Stewart 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
85020abea66SRandall Stewart 			}
85120abea66SRandall Stewart 		} else {
85220abea66SRandall Stewart 			/* Ok it must be a exact meg (its between 1G and 1Meg) */
85320abea66SRandall Stewart 			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
85420abea66SRandall Stewart 			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
85520abea66SRandall Stewart 				/* its an exact Mbps */
85620abea66SRandall Stewart 				ind_calc += 2;
85720abea66SRandall Stewart 				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
85820abea66SRandall Stewart 					/* This should not happen */
85920abea66SRandall Stewart 					ind_calc = ALL_HARDWARE_RATES-1;
86020abea66SRandall Stewart 				}
86120abea66SRandall Stewart 				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
86220abea66SRandall Stewart 					rte = &rs->rs_rlt[ind_calc];
86320abea66SRandall Stewart 			}
86420abea66SRandall Stewart 		}
86520abea66SRandall Stewart 	} else {
86620abea66SRandall Stewart 		/* we want greater than the requested rate */
86720abea66SRandall Stewart 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
86820abea66SRandall Stewart 		    (rs->rs_lowest_valid <= 2)){
86920abea66SRandall Stewart 			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
87020abea66SRandall Stewart 			for (i=2; i>=rs->rs_lowest_valid; i--) {
87120abea66SRandall Stewart 				if (bytes_per_sec < rs->rs_rlt[i].rate) {
87220abea66SRandall Stewart 					rte = &rs->rs_rlt[i];
8731a714ff2SRandall Stewart 					if (i >= 1) {
8741a714ff2SRandall Stewart 						previous_rate = rs->rs_rlt[(i-1)].rate;
8751a714ff2SRandall Stewart 					}
87620abea66SRandall Stewart 					break;
87720abea66SRandall Stewart 				} else if ((flags & RS_PACING_GEQ) &&
87820abea66SRandall Stewart 					   (bytes_per_sec == rs->rs_rlt[i].rate)) {
87920abea66SRandall Stewart 					rte = &rs->rs_rlt[i];
8801a714ff2SRandall Stewart 					if (i >= 1) {
8811a714ff2SRandall Stewart 						previous_rate = rs->rs_rlt[(i-1)].rate;
8821a714ff2SRandall Stewart 					}
88320abea66SRandall Stewart 					break;
88420abea66SRandall Stewart 				} else {
88520abea66SRandall Stewart 					arte = &rs->rs_rlt[i]; /* new alternate */
88620abea66SRandall Stewart 				}
88720abea66SRandall Stewart 			}
88820abea66SRandall Stewart 		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
88920abea66SRandall Stewart 			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
89020abea66SRandall Stewart 			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
89120abea66SRandall Stewart 				/* Our top rate is larger than the request */
89220abea66SRandall Stewart 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
89320abea66SRandall Stewart 			} else if ((flags & RS_PACING_GEQ) &&
89420abea66SRandall Stewart 				   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
89520abea66SRandall Stewart 				   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
89620abea66SRandall Stewart 				/* It matches our top rate */
89720abea66SRandall Stewart 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
89820abea66SRandall Stewart 			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
89920abea66SRandall Stewart 				/* The top rate is an alternative */
90020abea66SRandall Stewart 				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
90120abea66SRandall Stewart 			}
9021a714ff2SRandall Stewart 			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
90320abea66SRandall Stewart 		} else {
90420abea66SRandall Stewart 			/* Its in our range 1Meg - 1Gig */
90520abea66SRandall Stewart 			if (flags & RS_PACING_GEQ) {
90620abea66SRandall Stewart 				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
90720abea66SRandall Stewart 				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
90820abea66SRandall Stewart 					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
90920abea66SRandall Stewart 						/* This should not happen */
91020abea66SRandall Stewart 						ind_calc = (ALL_HARDWARE_RATES-1);
91120abea66SRandall Stewart 					}
91220abea66SRandall Stewart 					rte = &rs->rs_rlt[ind_calc];
9131a714ff2SRandall Stewart 					if (ind_calc >= 1)
9141a714ff2SRandall Stewart 						previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
91520abea66SRandall Stewart 				}
91620abea66SRandall Stewart 				goto done;
91720abea66SRandall Stewart 			}
91820abea66SRandall Stewart 			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
91920abea66SRandall Stewart 			ind_calc += 2;
92020abea66SRandall Stewart 			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
92120abea66SRandall Stewart 				/* This should not happen */
92220abea66SRandall Stewart 				ind_calc = ALL_HARDWARE_RATES-1;
92320abea66SRandall Stewart 			}
9241a714ff2SRandall Stewart 			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
92520abea66SRandall Stewart 				rte = &rs->rs_rlt[ind_calc];
9261a714ff2SRandall Stewart 				if (ind_calc >= 1)
9271a714ff2SRandall Stewart 					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
9281a714ff2SRandall Stewart 			}
92920abea66SRandall Stewart 		}
93020abea66SRandall Stewart 	}
93120abea66SRandall Stewart done:
93220abea66SRandall Stewart 	if ((rte == NULL) &&
93320abea66SRandall Stewart 	    (arte != NULL) &&
93420abea66SRandall Stewart 	    (flags & RS_PACING_SUB_OK)) {
93520abea66SRandall Stewart 		/* We can use the substitute */
93620abea66SRandall Stewart 		rte = arte;
93720abea66SRandall Stewart 	}
9381a714ff2SRandall Stewart 	if (lower_rate)
9391a714ff2SRandall Stewart 		*lower_rate = previous_rate;
94020abea66SRandall Stewart 	return (rte);
94120abea66SRandall Stewart }
94220abea66SRandall Stewart 
9431a714ff2SRandall Stewart /*
9441a714ff2SRandall Stewart  * For an explanation of why the argument is volatile please
9451a714ff2SRandall Stewart  * look at the comments around rt_setup_rate().
9461a714ff2SRandall Stewart  */
94720abea66SRandall Stewart static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const volatile struct tcp_rate_set * rs,uint64_t bytes_per_sec,uint32_t flags,uint64_t * lower_rate)9481a714ff2SRandall Stewart tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
94920abea66SRandall Stewart {
95020abea66SRandall Stewart 	/**
95120abea66SRandall Stewart 	 * Hunt the rate table with the restrictions in flags and find a
95220abea66SRandall Stewart 	 * suitable rate if possible.
95320abea66SRandall Stewart 	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
95420abea66SRandall Stewart 	 * RS_PACING_GT     - must be greater than.
95520abea66SRandall Stewart 	 * RS_PACING_GEQ    - must be greater than or equal.
95620abea66SRandall Stewart 	 * RS_PACING_LT     - must be less than.
95720abea66SRandall Stewart 	 * RS_PACING_SUB_OK - If we don't meet criteria a
95820abea66SRandall Stewart 	 *                    substitute is ok.
95920abea66SRandall Stewart 	 */
96020abea66SRandall Stewart 	int i, matched;
96120abea66SRandall Stewart 	struct tcp_hwrate_limit_table *rte = NULL;
9621a714ff2SRandall Stewart 	uint64_t previous_rate = 0;
96320abea66SRandall Stewart 
96420abea66SRandall Stewart 	if ((rs->rs_flags & RS_INT_TBL) &&
96520abea66SRandall Stewart 	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
96620abea66SRandall Stewart 		/*
96720abea66SRandall Stewart 		 * Here we don't want to paw thru
96820abea66SRandall Stewart 		 * a big table, we have everything
96920abea66SRandall Stewart 		 * from 1Meg - 1000Meg in 1Meg increments.
97020abea66SRandall Stewart 		 * Use an alternate method to "lookup".
97120abea66SRandall Stewart 		 */
9721a714ff2SRandall Stewart 		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
97320abea66SRandall Stewart 	}
97420abea66SRandall Stewart 	if ((flags & RS_PACING_LT) ||
97520abea66SRandall Stewart 	    (flags & RS_PACING_EXACT_MATCH)) {
97620abea66SRandall Stewart 		/*
97720abea66SRandall Stewart 		 * For exact and less than we go forward through the table.
97820abea66SRandall Stewart 		 * This way when we find one larger we stop (exact was a
97920abea66SRandall Stewart 		 * toss up).
98020abea66SRandall Stewart 		 */
98120abea66SRandall Stewart 		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
98220abea66SRandall Stewart 			if ((flags & RS_PACING_EXACT_MATCH) &&
98320abea66SRandall Stewart 			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
98420abea66SRandall Stewart 				rte = &rs->rs_rlt[i];
98520abea66SRandall Stewart 				matched = 1;
9861a714ff2SRandall Stewart 				if (lower_rate != NULL)
9871a714ff2SRandall Stewart 					*lower_rate = previous_rate;
98820abea66SRandall Stewart 				break;
98920abea66SRandall Stewart 			} else if ((flags & RS_PACING_LT) &&
99020abea66SRandall Stewart 			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
99120abea66SRandall Stewart 				rte = &rs->rs_rlt[i];
99220abea66SRandall Stewart 				matched = 1;
9931a714ff2SRandall Stewart 				if (lower_rate != NULL)
9941a714ff2SRandall Stewart 					*lower_rate = previous_rate;
99520abea66SRandall Stewart 				break;
99620abea66SRandall Stewart 			}
9971a714ff2SRandall Stewart 			previous_rate = rs->rs_rlt[i].rate;
99820abea66SRandall Stewart 			if (bytes_per_sec > rs->rs_rlt[i].rate)
99920abea66SRandall Stewart 				break;
100020abea66SRandall Stewart 		}
100120abea66SRandall Stewart 		if ((matched == 0) &&
100220abea66SRandall Stewart 		    (flags & RS_PACING_LT) &&
100320abea66SRandall Stewart 		    (flags & RS_PACING_SUB_OK)) {
100420abea66SRandall Stewart 			/* Kick in a substitute (the lowest) */
100520abea66SRandall Stewart 			rte = &rs->rs_rlt[rs->rs_lowest_valid];
100620abea66SRandall Stewart 		}
100720abea66SRandall Stewart 	} else {
100820abea66SRandall Stewart 		/*
100920abea66SRandall Stewart 		 * Here we go backward through the table so that we can find
101020abea66SRandall Stewart 		 * the one greater in theory faster (but its probably a
101120abea66SRandall Stewart 		 * wash).
101220abea66SRandall Stewart 		 */
101320abea66SRandall Stewart 		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
101420abea66SRandall Stewart 			if (rs->rs_rlt[i].rate > bytes_per_sec) {
101520abea66SRandall Stewart 				/* A possible candidate */
101620abea66SRandall Stewart 				rte = &rs->rs_rlt[i];
101720abea66SRandall Stewart 			}
101820abea66SRandall Stewart 			if ((flags & RS_PACING_GEQ) &&
101920abea66SRandall Stewart 			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
102020abea66SRandall Stewart 				/* An exact match and we want equal */
102120abea66SRandall Stewart 				matched = 1;
102220abea66SRandall Stewart 				rte = &rs->rs_rlt[i];
102320abea66SRandall Stewart 				break;
102420abea66SRandall Stewart 			} else if (rte) {
102520abea66SRandall Stewart 				/*
102620abea66SRandall Stewart 				 * Found one that is larger than but don't
102720abea66SRandall Stewart 				 * stop, there may be a more closer match.
102820abea66SRandall Stewart 				 */
102920abea66SRandall Stewart 				matched = 1;
103020abea66SRandall Stewart 			}
103120abea66SRandall Stewart 			if (rs->rs_rlt[i].rate < bytes_per_sec) {
103220abea66SRandall Stewart 				/*
103320abea66SRandall Stewart 				 * We found a table entry that is smaller,
103420abea66SRandall Stewart 				 * stop there will be none greater or equal.
103520abea66SRandall Stewart 				 */
10361a714ff2SRandall Stewart 				if (lower_rate != NULL)
10371a714ff2SRandall Stewart 					*lower_rate = rs->rs_rlt[i].rate;
103820abea66SRandall Stewart 				break;
103920abea66SRandall Stewart 			}
104020abea66SRandall Stewart 		}
104120abea66SRandall Stewart 		if ((matched == 0) &&
104220abea66SRandall Stewart 		    (flags & RS_PACING_SUB_OK)) {
104320abea66SRandall Stewart 			/* Kick in a substitute (the highest) */
104420abea66SRandall Stewart 			rte = &rs->rs_rlt[rs->rs_highest_valid];
104520abea66SRandall Stewart 		}
104620abea66SRandall Stewart 	}
104720abea66SRandall Stewart 	return (rte);
104820abea66SRandall Stewart }
104920abea66SRandall Stewart 
105020abea66SRandall Stewart static struct ifnet *
rt_find_real_interface(struct ifnet * ifp,struct inpcb * inp,int * error)105120abea66SRandall Stewart rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
105220abea66SRandall Stewart {
105320abea66SRandall Stewart 	struct ifnet *tifp;
10541a714ff2SRandall Stewart 	struct m_snd_tag *tag, *ntag;
105520abea66SRandall Stewart 	union if_snd_tag_alloc_params params = {
105620abea66SRandall Stewart 		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
10571a714ff2SRandall Stewart 		.rate_limit.hdr.flowid = inp->inp_flowid,
105898085baeSAndrew Gallatin 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
105920abea66SRandall Stewart 		.rate_limit.max_rate = COMMON_RATE,
106020abea66SRandall Stewart 		.rate_limit.flags = M_NOWAIT,
106120abea66SRandall Stewart 	};
106220abea66SRandall Stewart 	int err;
106320abea66SRandall Stewart #ifdef RSS
106420abea66SRandall Stewart 	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
106520abea66SRandall Stewart 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
106620abea66SRandall Stewart #else
106720abea66SRandall Stewart 	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
106820abea66SRandall Stewart #endif
106936e0a362SJohn Baldwin 	err = m_snd_tag_alloc(ifp, &params, &tag);
107020abea66SRandall Stewart 	if (err) {
107120abea66SRandall Stewart 		/* Failed to setup a tag? */
107220abea66SRandall Stewart 		if (error)
107320abea66SRandall Stewart 			*error = err;
107420abea66SRandall Stewart 		return (NULL);
107520abea66SRandall Stewart 	}
10761a714ff2SRandall Stewart 	ntag = tag;
1077c782ea8bSJohn Baldwin 	while (ntag->sw->next_snd_tag != NULL) {
1078c782ea8bSJohn Baldwin 		ntag = ntag->sw->next_snd_tag(ntag);
10791a714ff2SRandall Stewart 	}
10801a714ff2SRandall Stewart 	tifp = ntag->ifp;
108198d7a8d9SJohn Baldwin 	m_snd_tag_rele(tag);
108220abea66SRandall Stewart 	return (tifp);
108320abea66SRandall Stewart }
108420abea66SRandall Stewart 
10851a714ff2SRandall Stewart static void
rl_increment_using(const struct tcp_hwrate_limit_table * rte)10861a714ff2SRandall Stewart rl_increment_using(const struct tcp_hwrate_limit_table *rte)
10871a714ff2SRandall Stewart {
10885d8fd932SRandall Stewart 	struct tcp_hwrate_limit_table *decon_rte;
10895d8fd932SRandall Stewart 
10905d8fd932SRandall Stewart 	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
10915d8fd932SRandall Stewart 	atomic_add_long(&decon_rte->using, 1);
10921a714ff2SRandall Stewart }
10931a714ff2SRandall Stewart 
10941a714ff2SRandall Stewart static void
rl_decrement_using(const struct tcp_hwrate_limit_table * rte)10951a714ff2SRandall Stewart rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
10961a714ff2SRandall Stewart {
10975d8fd932SRandall Stewart 	struct tcp_hwrate_limit_table *decon_rte;
10985d8fd932SRandall Stewart 
10995d8fd932SRandall Stewart 	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
11005d8fd932SRandall Stewart 	atomic_subtract_long(&decon_rte->using, 1);
11011a714ff2SRandall Stewart }
11021a714ff2SRandall Stewart 
11031a714ff2SRandall Stewart void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table * rte)11041a714ff2SRandall Stewart tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
11051a714ff2SRandall Stewart {
11065d8fd932SRandall Stewart 	struct tcp_hwrate_limit_table *decon_rte;
11075d8fd932SRandall Stewart 
11085d8fd932SRandall Stewart 	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
11095d8fd932SRandall Stewart 	atomic_add_long(&decon_rte->rs_num_enobufs, 1);
11101a714ff2SRandall Stewart }
11111a714ff2SRandall Stewart 
11121a714ff2SRandall Stewart /*
11131a714ff2SRandall Stewart  * Do NOT take the __noinline out of the
11141a714ff2SRandall Stewart  * find_rs_for_ifp() function. If you do the inline
11151a714ff2SRandall Stewart  * of it for the rt_setup_rate() will show you a
11161a714ff2SRandall Stewart  * compiler bug. For some reason the compiler thinks
11171a714ff2SRandall Stewart  * the list can never be empty. The consequence of
11181a714ff2SRandall Stewart  * this will be a crash when we dereference NULL
11191a714ff2SRandall Stewart  * if an ifp is removed just has a hw rate limit
11201a714ff2SRandall Stewart  * is attempted. If you are working on the compiler
11211a714ff2SRandall Stewart  * and want to "test" this go ahead and take the noinline
11221a714ff2SRandall Stewart  * out otherwise let sleeping dogs ly until such time
11231a714ff2SRandall Stewart  * as we get a compiler fix 10/2/20 -- RRS
11241a714ff2SRandall Stewart  */
11251a714ff2SRandall Stewart static __noinline struct tcp_rate_set *
find_rs_for_ifp(struct ifnet * ifp)11261a714ff2SRandall Stewart find_rs_for_ifp(struct ifnet *ifp)
11271a714ff2SRandall Stewart {
11281a714ff2SRandall Stewart 	struct tcp_rate_set *rs;
11291a714ff2SRandall Stewart 
11301a714ff2SRandall Stewart 	CK_LIST_FOREACH(rs, &int_rs, next) {
11311a714ff2SRandall Stewart 		if ((rs->rs_ifp == ifp) &&
11321a714ff2SRandall Stewart 		    (rs->rs_if_dunit == ifp->if_dunit)) {
11331a714ff2SRandall Stewart 			/* Ok we found it */
11341a714ff2SRandall Stewart 			return (rs);
11351a714ff2SRandall Stewart 		}
11361a714ff2SRandall Stewart 	}
11371a714ff2SRandall Stewart 	return (NULL);
11381a714ff2SRandall Stewart }
11391a714ff2SRandall Stewart 
11401a714ff2SRandall Stewart 
114120abea66SRandall Stewart static const struct tcp_hwrate_limit_table *
rt_setup_rate(struct inpcb * inp,struct ifnet * ifp,uint64_t bytes_per_sec,uint32_t flags,int * error,uint64_t * lower_rate)114220abea66SRandall Stewart rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
11431a714ff2SRandall Stewart     uint32_t flags, int *error, uint64_t *lower_rate)
114420abea66SRandall Stewart {
114520abea66SRandall Stewart 	/* First lets find the interface if it exists */
114620abea66SRandall Stewart 	const struct tcp_hwrate_limit_table *rte;
11471a714ff2SRandall Stewart 	/*
11481a714ff2SRandall Stewart 	 * So why is rs volatile? This is to defeat a
11491a714ff2SRandall Stewart 	 * compiler bug where in the compiler is convinced
11501a714ff2SRandall Stewart 	 * that rs can never be NULL (which is not true). Because
11511a714ff2SRandall Stewart 	 * of its conviction it nicely optimizes out the if ((rs == NULL
11521a714ff2SRandall Stewart 	 * below which means if you get a NULL back you dereference it.
11531a714ff2SRandall Stewart 	 */
11541a714ff2SRandall Stewart 	volatile struct tcp_rate_set *rs;
115520abea66SRandall Stewart 	struct epoch_tracker et;
11561a714ff2SRandall Stewart 	struct ifnet *oifp = ifp;
115720abea66SRandall Stewart 	int err;
115820abea66SRandall Stewart 
1159348404bcSRandall Stewart 	NET_EPOCH_ENTER(et);
116020abea66SRandall Stewart use_real_interface:
11611a714ff2SRandall Stewart 	rs = find_rs_for_ifp(ifp);
116220abea66SRandall Stewart 	if ((rs == NULL) ||
116320abea66SRandall Stewart 	    (rs->rs_flags & RS_INTF_NO_SUP) ||
116420abea66SRandall Stewart 	    (rs->rs_flags & RS_IS_DEAD)) {
116520abea66SRandall Stewart 		/*
116620abea66SRandall Stewart 		 * This means we got a packet *before*
116720abea66SRandall Stewart 		 * the IF-UP was processed below, <or>
116820abea66SRandall Stewart 		 * while or after we already received an interface
116920abea66SRandall Stewart 		 * departed event. In either case we really don't
117020abea66SRandall Stewart 		 * want to do anything with pacing, in
117120abea66SRandall Stewart 		 * the departing case the packet is not
117220abea66SRandall Stewart 		 * going to go very far. The new case
117320abea66SRandall Stewart 		 * might be arguable, but its impossible
117420abea66SRandall Stewart 		 * to tell from the departing case.
117520abea66SRandall Stewart 		 */
11761a714ff2SRandall Stewart 		if (error)
117720abea66SRandall Stewart 			*error = ENODEV;
1178348404bcSRandall Stewart 		NET_EPOCH_EXIT(et);
117920abea66SRandall Stewart 		return (NULL);
118020abea66SRandall Stewart 	}
118120abea66SRandall Stewart 
118220abea66SRandall Stewart 	if ((rs == NULL) || (rs->rs_disable != 0)) {
11831a714ff2SRandall Stewart 		if (error)
118420abea66SRandall Stewart 			*error = ENOSPC;
1185348404bcSRandall Stewart 		NET_EPOCH_EXIT(et);
118620abea66SRandall Stewart 		return (NULL);
118720abea66SRandall Stewart 	}
118820abea66SRandall Stewart 	if (rs->rs_flags & RS_IS_DEFF) {
118920abea66SRandall Stewart 		/* We need to find the real interface */
119020abea66SRandall Stewart 		struct ifnet *tifp;
119120abea66SRandall Stewart 
119220abea66SRandall Stewart 		tifp = rt_find_real_interface(ifp, inp, error);
119320abea66SRandall Stewart 		if (tifp == NULL) {
119420abea66SRandall Stewart 			if (rs->rs_disable && error)
119520abea66SRandall Stewart 				*error = ENOTSUP;
1196348404bcSRandall Stewart 			NET_EPOCH_EXIT(et);
119720abea66SRandall Stewart 			return (NULL);
119820abea66SRandall Stewart 		}
11991a714ff2SRandall Stewart 		KASSERT((tifp != ifp),
12001a714ff2SRandall Stewart 			("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
12011a714ff2SRandall Stewart 			 ifp, inp, tifp));
12021a714ff2SRandall Stewart 		ifp = tifp;
120320abea66SRandall Stewart 		goto use_real_interface;
120420abea66SRandall Stewart 	}
120520abea66SRandall Stewart 	if (rs->rs_flow_limit &&
120620abea66SRandall Stewart 	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
120720abea66SRandall Stewart 		if (error)
120820abea66SRandall Stewart 			*error = ENOSPC;
1209348404bcSRandall Stewart 		NET_EPOCH_EXIT(et);
121020abea66SRandall Stewart 		return (NULL);
121120abea66SRandall Stewart 	}
12121a714ff2SRandall Stewart 	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
121320abea66SRandall Stewart 	if (rte) {
12141a714ff2SRandall Stewart 		err = in_pcbattach_txrtlmt(inp, oifp,
121520abea66SRandall Stewart 		    inp->inp_flowtype,
121620abea66SRandall Stewart 		    inp->inp_flowid,
121720abea66SRandall Stewart 		    rte->rate,
121820abea66SRandall Stewart 		    &inp->inp_snd_tag);
121920abea66SRandall Stewart 		if (err) {
122020abea66SRandall Stewart 			/* Failed to attach */
122120abea66SRandall Stewart 			if (error)
122220abea66SRandall Stewart 				*error = err;
122320abea66SRandall Stewart 			rte = NULL;
12241a714ff2SRandall Stewart 		} else {
12251a714ff2SRandall Stewart 			KASSERT((inp->inp_snd_tag != NULL) ,
1226db46c0d0SHans Petter Selasky 				("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
1227db46c0d0SHans Petter Selasky 				 inp, rte, (unsigned long long)rte->rate, rs));
1228db46c0d0SHans Petter Selasky #ifdef INET
12291a714ff2SRandall Stewart 			counter_u64_add(rate_limit_new, 1);
1230db46c0d0SHans Petter Selasky #endif
123120abea66SRandall Stewart 		}
123220abea66SRandall Stewart 	}
123320abea66SRandall Stewart 	if (rte) {
123420abea66SRandall Stewart 		/*
123520abea66SRandall Stewart 		 * We use an atomic here for accounting so we don't have to
123620abea66SRandall Stewart 		 * use locks when freeing.
123720abea66SRandall Stewart 		 */
123899c311c4SRandall Stewart 		atomic_add_64(&rs->rs_flows_using, 1);
123920abea66SRandall Stewart 	}
1240348404bcSRandall Stewart 	NET_EPOCH_EXIT(et);
124120abea66SRandall Stewart 	return (rte);
124220abea66SRandall Stewart }
124320abea66SRandall Stewart 
124420abea66SRandall Stewart static void
tcp_rl_ifnet_link(void * arg __unused,struct ifnet * ifp,int link_state)124520abea66SRandall Stewart tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
124620abea66SRandall Stewart {
124720abea66SRandall Stewart 	int error;
124820abea66SRandall Stewart 	struct tcp_rate_set *rs;
12491a714ff2SRandall Stewart 	struct epoch_tracker et;
125020abea66SRandall Stewart 
12519aed26b9SJohn Baldwin 	if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
125220abea66SRandall Stewart 	    (link_state != LINK_STATE_UP)) {
125320abea66SRandall Stewart 		/*
125420abea66SRandall Stewart 		 * We only care on an interface going up that is rate-limit
125520abea66SRandall Stewart 		 * capable.
125620abea66SRandall Stewart 		 */
125720abea66SRandall Stewart 		return;
125820abea66SRandall Stewart 	}
12591a714ff2SRandall Stewart 	NET_EPOCH_ENTER(et);
126020abea66SRandall Stewart 	mtx_lock(&rs_mtx);
12611a714ff2SRandall Stewart 	rs = find_rs_for_ifp(ifp);
12621a714ff2SRandall Stewart 	if (rs) {
126320abea66SRandall Stewart 		/* We already have initialized this guy */
126420abea66SRandall Stewart 		mtx_unlock(&rs_mtx);
12651a714ff2SRandall Stewart 		NET_EPOCH_EXIT(et);
126620abea66SRandall Stewart 		return;
126720abea66SRandall Stewart 	}
126820abea66SRandall Stewart 	mtx_unlock(&rs_mtx);
12691a714ff2SRandall Stewart 	NET_EPOCH_EXIT(et);
127015ddc5e4SMichael Tuexen 	rt_setup_new_rs(ifp, &error);
127120abea66SRandall Stewart }
127220abea66SRandall Stewart 
127320abea66SRandall Stewart static void
tcp_rl_ifnet_departure(void * arg __unused,struct ifnet * ifp)127420abea66SRandall Stewart tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
127520abea66SRandall Stewart {
12761a714ff2SRandall Stewart 	struct tcp_rate_set *rs;
12771a714ff2SRandall Stewart 	struct epoch_tracker et;
127820abea66SRandall Stewart 	int i;
127920abea66SRandall Stewart 
12801a714ff2SRandall Stewart 	NET_EPOCH_ENTER(et);
128120abea66SRandall Stewart 	mtx_lock(&rs_mtx);
12821a714ff2SRandall Stewart 	rs = find_rs_for_ifp(ifp);
12831a714ff2SRandall Stewart 	if (rs) {
128420abea66SRandall Stewart 		CK_LIST_REMOVE(rs, next);
128520abea66SRandall Stewart 		rs_number_alive--;
128620abea66SRandall Stewart 		rs->rs_flags |= RS_IS_DEAD;
128720abea66SRandall Stewart 		for (i = 0; i < rs->rs_rate_cnt; i++) {
128820abea66SRandall Stewart 			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
128998d7a8d9SJohn Baldwin 				in_pcbdetach_tag(rs->rs_rlt[i].tag);
129020abea66SRandall Stewart 				rs->rs_rlt[i].tag = NULL;
129120abea66SRandall Stewart 			}
129220abea66SRandall Stewart 			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
129320abea66SRandall Stewart 		}
1294eabddb25SHans Petter Selasky 		if (rs->rs_flows_using == 0)
1295eabddb25SHans Petter Selasky 			rs_defer_destroy(rs);
129620abea66SRandall Stewart 	}
129720abea66SRandall Stewart 	mtx_unlock(&rs_mtx);
12981a714ff2SRandall Stewart 	NET_EPOCH_EXIT(et);
129920abea66SRandall Stewart }
130020abea66SRandall Stewart 
1301*1f628be8SAndrew Gallatin void
tcp_rl_release_ifnet(struct ifnet * ifp)1302*1f628be8SAndrew Gallatin tcp_rl_release_ifnet(struct ifnet *ifp)
1303*1f628be8SAndrew Gallatin {
1304*1f628be8SAndrew Gallatin 	tcp_rl_ifnet_departure(NULL, ifp);
1305*1f628be8SAndrew Gallatin }
1306*1f628be8SAndrew Gallatin 
130720abea66SRandall Stewart static void
tcp_rl_shutdown(void * arg __unused,int howto __unused)130820abea66SRandall Stewart tcp_rl_shutdown(void *arg __unused, int howto __unused)
130920abea66SRandall Stewart {
131020abea66SRandall Stewart 	struct tcp_rate_set *rs, *nrs;
13111a714ff2SRandall Stewart 	struct epoch_tracker et;
131220abea66SRandall Stewart 	int i;
131320abea66SRandall Stewart 
13141a714ff2SRandall Stewart 	NET_EPOCH_ENTER(et);
131520abea66SRandall Stewart 	mtx_lock(&rs_mtx);
131620abea66SRandall Stewart 	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
131720abea66SRandall Stewart 		CK_LIST_REMOVE(rs, next);
131820abea66SRandall Stewart 		rs_number_alive--;
131920abea66SRandall Stewart 		rs->rs_flags |= RS_IS_DEAD;
132020abea66SRandall Stewart 		for (i = 0; i < rs->rs_rate_cnt; i++) {
132120abea66SRandall Stewart 			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
132298d7a8d9SJohn Baldwin 				in_pcbdetach_tag(rs->rs_rlt[i].tag);
132320abea66SRandall Stewart 				rs->rs_rlt[i].tag = NULL;
132420abea66SRandall Stewart 			}
132520abea66SRandall Stewart 			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
132620abea66SRandall Stewart 		}
1327eabddb25SHans Petter Selasky 		if (rs->rs_flows_using == 0)
1328eabddb25SHans Petter Selasky 			rs_defer_destroy(rs);
132920abea66SRandall Stewart 	}
133020abea66SRandall Stewart 	mtx_unlock(&rs_mtx);
13311a714ff2SRandall Stewart 	NET_EPOCH_EXIT(et);
133220abea66SRandall Stewart }
133320abea66SRandall Stewart 
133420abea66SRandall Stewart const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb * tp,struct ifnet * ifp,uint64_t bytes_per_sec,int flags,int * error,uint64_t * lower_rate)133520abea66SRandall Stewart tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
13361a714ff2SRandall Stewart     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
133720abea66SRandall Stewart {
13389eb0e832SGleb Smirnoff 	struct inpcb *inp = tptoinpcb(tp);
133920abea66SRandall Stewart 	const struct tcp_hwrate_limit_table *rte;
1340521eac97SJohn Baldwin #ifdef KERN_TLS
1341521eac97SJohn Baldwin 	struct ktls_session *tls;
1342521eac97SJohn Baldwin #endif
134320abea66SRandall Stewart 
13449eb0e832SGleb Smirnoff 	INP_WLOCK_ASSERT(inp);
1345ce398115SJohn Baldwin 
13469eb0e832SGleb Smirnoff 	if (inp->inp_snd_tag == NULL) {
134720abea66SRandall Stewart 		/*
134820abea66SRandall Stewart 		 * We are setting up a rate for the first time.
134920abea66SRandall Stewart 		 */
13509aed26b9SJohn Baldwin 		if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
135120abea66SRandall Stewart 			/* Not supported by the egress */
135220abea66SRandall Stewart 			if (error)
135320abea66SRandall Stewart 				*error = ENODEV;
135420abea66SRandall Stewart 			return (NULL);
135520abea66SRandall Stewart 		}
135620abea66SRandall Stewart #ifdef KERN_TLS
1357521eac97SJohn Baldwin 		tls = NULL;
1358c0e4090eSAndrew Gallatin 		if (tp->t_nic_ktls_xmit != 0) {
13599eb0e832SGleb Smirnoff 			tls = tptosocket(tp)->so_snd.sb_tls_info;
1360521eac97SJohn Baldwin 
1361521eac97SJohn Baldwin 			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
1362521eac97SJohn Baldwin 			    tls->mode != TCP_TLS_MODE_IFNET) {
136320abea66SRandall Stewart 				if (error)
1364521eac97SJohn Baldwin 					*error = ENODEV;
136520abea66SRandall Stewart 				return (NULL);
136620abea66SRandall Stewart 			}
1367521eac97SJohn Baldwin 		}
136820abea66SRandall Stewart #endif
13699eb0e832SGleb Smirnoff 		rte = rt_setup_rate(inp, ifp, bytes_per_sec, flags, error, lower_rate);
13701a714ff2SRandall Stewart 		if (rte)
13711a714ff2SRandall Stewart 			rl_increment_using(rte);
1372521eac97SJohn Baldwin #ifdef KERN_TLS
1373521eac97SJohn Baldwin 		if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
1374521eac97SJohn Baldwin 			/*
1375521eac97SJohn Baldwin 			 * Fake a route change error to reset the TLS
1376521eac97SJohn Baldwin 			 * send tag.  This will convert the existing
1377521eac97SJohn Baldwin 			 * tag to a TLS ratelimit tag.
1378521eac97SJohn Baldwin 			 */
1379c782ea8bSJohn Baldwin 			MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
13809eb0e832SGleb Smirnoff 			ktls_output_eagain(inp, tls);
1381521eac97SJohn Baldwin 		}
1382521eac97SJohn Baldwin #endif
138320abea66SRandall Stewart 	} else {
138420abea66SRandall Stewart 		/*
138520abea66SRandall Stewart 		 * We are modifying a rate, wrong interface?
138620abea66SRandall Stewart 		 */
138720abea66SRandall Stewart 		if (error)
138820abea66SRandall Stewart 			*error = EINVAL;
138920abea66SRandall Stewart 		rte = NULL;
139020abea66SRandall Stewart 	}
13911a714ff2SRandall Stewart 	if (rte != NULL) {
1392ce398115SJohn Baldwin 		tp->t_pacing_rate = rte->rate;
1393d7313dc6SRandall Stewart 		*error = 0;
13941a714ff2SRandall Stewart 	}
139520abea66SRandall Stewart 	return (rte);
139620abea66SRandall Stewart }
139720abea66SRandall Stewart 
139820abea66SRandall Stewart const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table * crte,struct tcpcb * tp,struct ifnet * ifp,uint64_t bytes_per_sec,int flags,int * error,uint64_t * lower_rate)139920abea66SRandall Stewart tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
140020abea66SRandall Stewart     struct tcpcb *tp, struct ifnet *ifp,
14011a714ff2SRandall Stewart     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
140220abea66SRandall Stewart {
14039eb0e832SGleb Smirnoff 	struct inpcb *inp = tptoinpcb(tp);
140420abea66SRandall Stewart 	const struct tcp_hwrate_limit_table *nrte;
140520abea66SRandall Stewart 	const struct tcp_rate_set *rs;
1406521eac97SJohn Baldwin #ifdef KERN_TLS
1407521eac97SJohn Baldwin 	struct ktls_session *tls = NULL;
1408521eac97SJohn Baldwin #endif
140920abea66SRandall Stewart 	int err;
141020abea66SRandall Stewart 
14119eb0e832SGleb Smirnoff 	INP_WLOCK_ASSERT(inp);
1412ce398115SJohn Baldwin 
1413521eac97SJohn Baldwin 	if (crte == NULL) {
1414521eac97SJohn Baldwin 		/* Wrong interface */
1415521eac97SJohn Baldwin 		if (error)
1416521eac97SJohn Baldwin 			*error = EINVAL;
1417521eac97SJohn Baldwin 		return (NULL);
1418521eac97SJohn Baldwin 	}
1419521eac97SJohn Baldwin 
1420521eac97SJohn Baldwin #ifdef KERN_TLS
1421c0e4090eSAndrew Gallatin 	if (tp->t_nic_ktls_xmit) {
14229eb0e832SGleb Smirnoff 		tls = tptosocket(tp)->so_snd.sb_tls_info;
1423d782385eSJohn Baldwin 		if (tls->mode != TCP_TLS_MODE_IFNET)
1424d782385eSJohn Baldwin 			tls = NULL;
1425d782385eSJohn Baldwin 		else if (tls->snd_tag != NULL &&
1426c782ea8bSJohn Baldwin 		    tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
1427d782385eSJohn Baldwin 			if (!tls->reset_pending) {
1428521eac97SJohn Baldwin 				/*
1429d782385eSJohn Baldwin 				 * NIC probably doesn't support
1430d782385eSJohn Baldwin 				 * ratelimit TLS tags if it didn't
1431d782385eSJohn Baldwin 				 * allocate one when an existing rate
1432d782385eSJohn Baldwin 				 * was present, so ignore.
1433521eac97SJohn Baldwin 				 */
14348a7404b2SAndrew Gallatin 				tcp_rel_pacing_rate(crte, tp);
1435521eac97SJohn Baldwin 				if (error)
1436521eac97SJohn Baldwin 					*error = EOPNOTSUPP;
1437521eac97SJohn Baldwin 				return (NULL);
1438521eac97SJohn Baldwin 			}
1439d782385eSJohn Baldwin 
1440d782385eSJohn Baldwin 			/*
1441d782385eSJohn Baldwin 			 * The send tag is being converted, so set the
1442d782385eSJohn Baldwin 			 * rate limit on the inpcb tag.  There is a
1443d782385eSJohn Baldwin 			 * race that the new NIC send tag might use
1444d782385eSJohn Baldwin 			 * the current rate instead of this one.
1445d782385eSJohn Baldwin 			 */
1446d782385eSJohn Baldwin 			tls = NULL;
1447d782385eSJohn Baldwin 		}
1448521eac97SJohn Baldwin 	}
1449521eac97SJohn Baldwin #endif
14509eb0e832SGleb Smirnoff 	if (inp->inp_snd_tag == NULL) {
145120abea66SRandall Stewart 		/* Wrong interface */
14528a7404b2SAndrew Gallatin 		tcp_rel_pacing_rate(crte, tp);
145320abea66SRandall Stewart 		if (error)
145420abea66SRandall Stewart 			*error = EINVAL;
145520abea66SRandall Stewart 		return (NULL);
145620abea66SRandall Stewart 	}
145720abea66SRandall Stewart 	rs = crte->ptbl;
145820abea66SRandall Stewart 	if ((rs->rs_flags & RS_IS_DEAD) ||
145920abea66SRandall Stewart 	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
146020abea66SRandall Stewart 		/* Release the rate, and try anew */
14611a714ff2SRandall Stewart 
146220abea66SRandall Stewart 		tcp_rel_pacing_rate(crte, tp);
146320abea66SRandall Stewart 		nrte = tcp_set_pacing_rate(tp, ifp,
14641a714ff2SRandall Stewart 		    bytes_per_sec, flags, error, lower_rate);
146520abea66SRandall Stewart 		return (nrte);
146620abea66SRandall Stewart 	}
14671a714ff2SRandall Stewart 	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
146820abea66SRandall Stewart 	if (nrte == crte) {
146920abea66SRandall Stewart 		/* No change */
147020abea66SRandall Stewart 		if (error)
147120abea66SRandall Stewart 			*error = 0;
147220abea66SRandall Stewart 		return (crte);
147320abea66SRandall Stewart 	}
147420abea66SRandall Stewart 	if (nrte == NULL) {
147520abea66SRandall Stewart 		/* Release the old rate */
14761a714ff2SRandall Stewart 		if (error)
14771a714ff2SRandall Stewart 			*error = ENOENT;
147820abea66SRandall Stewart 		tcp_rel_pacing_rate(crte, tp);
147920abea66SRandall Stewart 		return (NULL);
148020abea66SRandall Stewart 	}
14811a714ff2SRandall Stewart 	rl_decrement_using(crte);
14821a714ff2SRandall Stewart 	rl_increment_using(nrte);
148320abea66SRandall Stewart 	/* Change rates to our new entry */
1484521eac97SJohn Baldwin #ifdef KERN_TLS
1485521eac97SJohn Baldwin 	if (tls != NULL)
1486521eac97SJohn Baldwin 		err = ktls_modify_txrtlmt(tls, nrte->rate);
1487521eac97SJohn Baldwin 	else
1488521eac97SJohn Baldwin #endif
14899eb0e832SGleb Smirnoff 		err = in_pcbmodify_txrtlmt(inp, nrte->rate);
149020abea66SRandall Stewart 	if (err) {
14918a7404b2SAndrew Gallatin 		struct tcp_rate_set *lrs;
14928a7404b2SAndrew Gallatin 		uint64_t pre;
14938a7404b2SAndrew Gallatin 
14941a714ff2SRandall Stewart 		rl_decrement_using(nrte);
14958a7404b2SAndrew Gallatin 		lrs = __DECONST(struct tcp_rate_set *, rs);
14968a7404b2SAndrew Gallatin 		pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
14971a714ff2SRandall Stewart 		/* Do we still have a snd-tag attached? */
14989eb0e832SGleb Smirnoff 		if (inp->inp_snd_tag)
14999eb0e832SGleb Smirnoff 			in_pcbdetach_txrtlmt(inp);
15008a7404b2SAndrew Gallatin 
15018a7404b2SAndrew Gallatin 		if (pre == 1) {
15028a7404b2SAndrew Gallatin 			struct epoch_tracker et;
15038a7404b2SAndrew Gallatin 
15048a7404b2SAndrew Gallatin 			NET_EPOCH_ENTER(et);
15058a7404b2SAndrew Gallatin 			mtx_lock(&rs_mtx);
15068a7404b2SAndrew Gallatin 			/*
15078a7404b2SAndrew Gallatin 			 * Is it dead?
15088a7404b2SAndrew Gallatin 			 */
15098a7404b2SAndrew Gallatin 			if (lrs->rs_flags & RS_IS_DEAD)
15108a7404b2SAndrew Gallatin 				rs_defer_destroy(lrs);
15118a7404b2SAndrew Gallatin 			mtx_unlock(&rs_mtx);
15128a7404b2SAndrew Gallatin 			NET_EPOCH_EXIT(et);
15138a7404b2SAndrew Gallatin 		}
151420abea66SRandall Stewart 		if (error)
151520abea66SRandall Stewart 			*error = err;
151620abea66SRandall Stewart 		return (NULL);
1517db46c0d0SHans Petter Selasky 	} else {
1518db46c0d0SHans Petter Selasky #ifdef INET
15191a714ff2SRandall Stewart 		counter_u64_add(rate_limit_chg, 1);
1520db46c0d0SHans Petter Selasky #endif
1521db46c0d0SHans Petter Selasky 	}
152220abea66SRandall Stewart 	if (error)
152320abea66SRandall Stewart 		*error = 0;
1524ce398115SJohn Baldwin 	tp->t_pacing_rate = nrte->rate;
152520abea66SRandall Stewart 	return (nrte);
152620abea66SRandall Stewart }
152720abea66SRandall Stewart 
152820abea66SRandall Stewart void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table * crte,struct tcpcb * tp)152920abea66SRandall Stewart tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
153020abea66SRandall Stewart {
15319eb0e832SGleb Smirnoff 	struct inpcb *inp = tptoinpcb(tp);
153220abea66SRandall Stewart 	const struct tcp_rate_set *crs;
153320abea66SRandall Stewart 	struct tcp_rate_set *rs;
153420abea66SRandall Stewart 	uint64_t pre;
153520abea66SRandall Stewart 
15369eb0e832SGleb Smirnoff 	INP_WLOCK_ASSERT(inp);
1537ce398115SJohn Baldwin 
1538ce398115SJohn Baldwin 	tp->t_pacing_rate = -1;
153920abea66SRandall Stewart 	crs = crte->ptbl;
154020abea66SRandall Stewart 	/*
154120abea66SRandall Stewart 	 * Now we must break the const
154220abea66SRandall Stewart 	 * in order to release our refcount.
154320abea66SRandall Stewart 	 */
154420abea66SRandall Stewart 	rs = __DECONST(struct tcp_rate_set *, crs);
15451a714ff2SRandall Stewart 	rl_decrement_using(crte);
1546a1589eb8SRandall Stewart 	pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
154720abea66SRandall Stewart 	if (pre == 1) {
15481a714ff2SRandall Stewart 		struct epoch_tracker et;
15491a714ff2SRandall Stewart 
15501a714ff2SRandall Stewart 		NET_EPOCH_ENTER(et);
155120abea66SRandall Stewart 		mtx_lock(&rs_mtx);
155220abea66SRandall Stewart 		/*
155320abea66SRandall Stewart 		 * Is it dead?
155420abea66SRandall Stewart 		 */
1555eabddb25SHans Petter Selasky 		if (rs->rs_flags & RS_IS_DEAD)
1556eabddb25SHans Petter Selasky 			rs_defer_destroy(rs);
155720abea66SRandall Stewart 		mtx_unlock(&rs_mtx);
15581a714ff2SRandall Stewart 		NET_EPOCH_EXIT(et);
155920abea66SRandall Stewart 	}
1560521eac97SJohn Baldwin 
1561521eac97SJohn Baldwin 	/*
1562521eac97SJohn Baldwin 	 * XXX: If this connection is using ifnet TLS, should we
1563521eac97SJohn Baldwin 	 * switch it to using an unlimited rate, or perhaps use
1564521eac97SJohn Baldwin 	 * ktls_output_eagain() to reset the send tag to a plain
1565521eac97SJohn Baldwin 	 * TLS tag?
1566521eac97SJohn Baldwin 	 */
15679eb0e832SGleb Smirnoff 	in_pcbdetach_txrtlmt(inp);
156820abea66SRandall Stewart }
156920abea66SRandall Stewart 
/* Rates are bits per second divided by 8, i.e. bytes per second. */
#define ONE_POINT_TWO_MEG (1200000 / 8)		/* 1.2 megabits in bytes */
#define ONE_HUNDRED_MBPS (100000000 / 8)	/* 100Mbps in bytes per second */
#define FIVE_HUNDRED_MBPS (500000000 / 8)	/* 500Mbps in bytes per second */
#define MAX_MSS_SENT 43	/* 43 mss = 43 x 1500 = 64,500 bytes */
1574d7313dc6SRandall Stewart 
15751a714ff2SRandall Stewart static void
tcp_log_pacing_size(struct tcpcb * tp,uint64_t bw,uint32_t segsiz,uint32_t new_tso,uint64_t hw_rate,uint32_t time_between,uint32_t calc_time_between,uint32_t segs,uint32_t res_div,uint16_t mult,uint8_t mod)15761a714ff2SRandall Stewart tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
15771a714ff2SRandall Stewart 		    uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
15781a714ff2SRandall Stewart 		    uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
15791a714ff2SRandall Stewart {
158069c7c811SRandall Stewart 	if (tcp_bblogging_on(tp)) {
15811a714ff2SRandall Stewart 		union tcp_log_stackspecific log;
15821a714ff2SRandall Stewart 		struct timeval tv;
15831a714ff2SRandall Stewart 
15841a714ff2SRandall Stewart 		memset(&log, 0, sizeof(log));
15851a714ff2SRandall Stewart 		log.u_bbr.flex1 = segsiz;
15861a714ff2SRandall Stewart 		log.u_bbr.flex2 = new_tso;
15871a714ff2SRandall Stewart 		log.u_bbr.flex3 = time_between;
15881a714ff2SRandall Stewart 		log.u_bbr.flex4 = calc_time_between;
15891a714ff2SRandall Stewart 		log.u_bbr.flex5 = segs;
15901a714ff2SRandall Stewart 		log.u_bbr.flex6 = res_div;
15911a714ff2SRandall Stewart 		log.u_bbr.flex7 = mult;
15921a714ff2SRandall Stewart 		log.u_bbr.flex8 = mod;
15931a714ff2SRandall Stewart 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
15941a714ff2SRandall Stewart 		log.u_bbr.cur_del_rate = bw;
15951a714ff2SRandall Stewart 		log.u_bbr.delRate = hw_rate;
15961a714ff2SRandall Stewart 		TCP_LOG_EVENTP(tp, NULL,
15979eb0e832SGleb Smirnoff 		    &tptosocket(tp)->so_rcv,
15989eb0e832SGleb Smirnoff 		    &tptosocket(tp)->so_snd,
15991a714ff2SRandall Stewart 		    TCP_HDWR_PACE_SIZE, 0,
16001a714ff2SRandall Stewart 		    0, &log, false, &tv);
16011a714ff2SRandall Stewart 	}
16021a714ff2SRandall Stewart }
16031a714ff2SRandall Stewart 
1604d7313dc6SRandall Stewart uint32_t
tcp_get_pacing_burst_size_w_divisor(struct tcpcb * tp,uint64_t bw,uint32_t segsiz,int can_use_1mss,const struct tcp_hwrate_limit_table * te,int * err,int divisor)160526bdd35cSRandall Stewart tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
160626bdd35cSRandall Stewart    const struct tcp_hwrate_limit_table *te, int *err, int divisor)
1607d7313dc6SRandall Stewart {
1608d7313dc6SRandall Stewart 	/*
1609d7313dc6SRandall Stewart 	 * We use the google formula to calculate the
1610d7313dc6SRandall Stewart 	 * TSO size. I.E.
1611d7313dc6SRandall Stewart 	 * bw < 24Meg
1612d7313dc6SRandall Stewart 	 *   tso = 2mss
1613d7313dc6SRandall Stewart 	 * else
161426bdd35cSRandall Stewart 	 *   tso = min(bw/(div=1000), 64k)
1615d7313dc6SRandall Stewart 	 *
1616d7313dc6SRandall Stewart 	 * Note for these calculations we ignore the
1617d7313dc6SRandall Stewart 	 * packet overhead (enet hdr, ip hdr and tcp hdr).
161826bdd35cSRandall Stewart 	 * We only get the google formula when we have
161926bdd35cSRandall Stewart 	 * divisor = 1000, which is the default for now.
1620d7313dc6SRandall Stewart 	 */
1621d7313dc6SRandall Stewart 	uint64_t lentim, res, bytes;
1622d7313dc6SRandall Stewart 	uint32_t new_tso, min_tso_segs;
1623d7313dc6SRandall Stewart 
162426bdd35cSRandall Stewart 	/* It can't be zero */
162526bdd35cSRandall Stewart 	if ((divisor == 0) ||
162626bdd35cSRandall Stewart 	    (divisor < RL_MIN_DIVISOR)) {
162726bdd35cSRandall Stewart 		if (mss_divisor)
162826bdd35cSRandall Stewart 			bytes = bw / mss_divisor;
162926bdd35cSRandall Stewart 		else
1630d7313dc6SRandall Stewart 			bytes = bw / 1000;
163126bdd35cSRandall Stewart 	} else
163226bdd35cSRandall Stewart 		bytes = bw / divisor;
163326bdd35cSRandall Stewart 	/* We can't ever send more than 65k in a TSO */
163426bdd35cSRandall Stewart 	if (bytes > 0xffff) {
163526bdd35cSRandall Stewart 		bytes = 0xffff;
163626bdd35cSRandall Stewart 	}
1637d7313dc6SRandall Stewart 	/* Round up */
1638d7313dc6SRandall Stewart 	new_tso = (bytes + segsiz - 1) / segsiz;
163926bdd35cSRandall Stewart 	/* Are we enforcing even boundaries? */
164026bdd35cSRandall Stewart 	if (even_num_segs && (new_tso & 1) && (new_tso > even_threshold))
164126bdd35cSRandall Stewart 		new_tso++;
164226bdd35cSRandall Stewart 	if (can_use_1mss)
1643d7313dc6SRandall Stewart 		min_tso_segs = 1;
1644d7313dc6SRandall Stewart 	else
1645d7313dc6SRandall Stewart 		min_tso_segs = 2;
16461a714ff2SRandall Stewart 	if (rs_floor_mss && (new_tso < rs_floor_mss))
16471a714ff2SRandall Stewart 		new_tso = rs_floor_mss;
16481a714ff2SRandall Stewart 	else if (new_tso < min_tso_segs)
1649d7313dc6SRandall Stewart 		new_tso = min_tso_segs;
1650d7313dc6SRandall Stewart 	if (new_tso > MAX_MSS_SENT)
1651d7313dc6SRandall Stewart 		new_tso = MAX_MSS_SENT;
1652d7313dc6SRandall Stewart 	new_tso *= segsiz;
16531a714ff2SRandall Stewart  	tcp_log_pacing_size(tp, bw, segsiz, new_tso,
16541a714ff2SRandall Stewart 			    0, 0, 0, 0, 0, 0, 1);
1655d7313dc6SRandall Stewart 	/*
1656d7313dc6SRandall Stewart 	 * If we are not doing hardware pacing
1657d7313dc6SRandall Stewart 	 * then we are done.
1658d7313dc6SRandall Stewart 	 */
1659d7313dc6SRandall Stewart 	if (te == NULL) {
1660d7313dc6SRandall Stewart 		if (err)
1661d7313dc6SRandall Stewart 			*err = 0;
1662d7313dc6SRandall Stewart 		return(new_tso);
1663d7313dc6SRandall Stewart 	}
1664d7313dc6SRandall Stewart 	/*
1665d7313dc6SRandall Stewart 	 * For hardware pacing we look at the
1666d7313dc6SRandall Stewart 	 * rate you are sending at and compare
1667d7313dc6SRandall Stewart 	 * that to the rate you have in hardware.
1668d7313dc6SRandall Stewart 	 *
1669d7313dc6SRandall Stewart 	 * If the hardware rate is slower than your
1670d7313dc6SRandall Stewart 	 * software rate then you are in error and
1671d7313dc6SRandall Stewart 	 * we will build a queue in our hardware whic
1672d7313dc6SRandall Stewart 	 * is probably not desired, in such a case
1673d7313dc6SRandall Stewart 	 * just return the non-hardware TSO size.
1674d7313dc6SRandall Stewart 	 *
1675d7313dc6SRandall Stewart 	 * If the rate in hardware is faster (which
1676d7313dc6SRandall Stewart 	 * it should be) then look at how long it
1677d7313dc6SRandall Stewart 	 * takes to send one ethernet segment size at
1678d7313dc6SRandall Stewart 	 * your b/w and compare that to the time it
1679d7313dc6SRandall Stewart 	 * takes to send at the rate you had selected.
1680d7313dc6SRandall Stewart 	 *
1681d7313dc6SRandall Stewart 	 * If your time is greater (which we hope it is)
1682d7313dc6SRandall Stewart 	 * we get the delta between the two, and then
1683d7313dc6SRandall Stewart 	 * divide that into your pacing time. This tells
1684d7313dc6SRandall Stewart 	 * us how many MSS you can send down at once (rounded up).
1685d7313dc6SRandall Stewart 	 *
1686d7313dc6SRandall Stewart 	 * Note we also double this value if the b/w is over
1687d7313dc6SRandall Stewart 	 * 100Mbps. If its over 500meg we just set you to the
1688d7313dc6SRandall Stewart 	 * max (43 segments).
1689d7313dc6SRandall Stewart 	 */
1690d7313dc6SRandall Stewart 	if (te->rate > FIVE_HUNDRED_MBPS)
16911a714ff2SRandall Stewart 		goto max;
1692d7313dc6SRandall Stewart 	if (te->rate == bw) {
1693d7313dc6SRandall Stewart 		/* We are pacing at exactly the hdwr rate */
16941a714ff2SRandall Stewart max:
16951a714ff2SRandall Stewart 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
16961a714ff2SRandall Stewart 				    te->rate, te->time_between, (uint32_t)0,
16971a714ff2SRandall Stewart 				    (segsiz * MAX_MSS_SENT), 0, 0, 3);
1698d7313dc6SRandall Stewart 		return (segsiz * MAX_MSS_SENT);
1699d7313dc6SRandall Stewart 	}
1700d7313dc6SRandall Stewart 	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
1701d7313dc6SRandall Stewart 	res = lentim / bw;
1702d7313dc6SRandall Stewart 	if (res > te->time_between) {
17031a714ff2SRandall Stewart 		uint32_t delta, segs, res_div;
1704d7313dc6SRandall Stewart 
17051a714ff2SRandall Stewart 		res_div = ((res * num_of_waits_allowed) + wait_time_floor);
1706d7313dc6SRandall Stewart 		delta = res - te->time_between;
17071a714ff2SRandall Stewart 		segs = (res_div + delta - 1)/delta;
1708d7313dc6SRandall Stewart 		if (segs < min_tso_segs)
1709d7313dc6SRandall Stewart 			segs = min_tso_segs;
17101a714ff2SRandall Stewart 		if (segs < rs_hw_floor_mss)
17111a714ff2SRandall Stewart 			segs = rs_hw_floor_mss;
1712d7313dc6SRandall Stewart 		if (segs > MAX_MSS_SENT)
1713d7313dc6SRandall Stewart 			segs = MAX_MSS_SENT;
1714d7313dc6SRandall Stewart 		segs *= segsiz;
17151a714ff2SRandall Stewart 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
17161a714ff2SRandall Stewart 				    te->rate, te->time_between, (uint32_t)res,
17171a714ff2SRandall Stewart 				    segs, res_div, 1, 3);
1718d7313dc6SRandall Stewart 		if (err)
1719d7313dc6SRandall Stewart 			*err = 0;
1720d7313dc6SRandall Stewart 		if (segs < new_tso) {
1721d7313dc6SRandall Stewart 			/* unexpected ? */
1722d7313dc6SRandall Stewart 			return(new_tso);
1723d7313dc6SRandall Stewart 		} else {
1724d7313dc6SRandall Stewart 			return (segs);
1725d7313dc6SRandall Stewart 		}
1726d7313dc6SRandall Stewart 	} else {
1727d7313dc6SRandall Stewart 		/*
1728d7313dc6SRandall Stewart 		 * Your time is smaller which means
1729d7313dc6SRandall Stewart 		 * we will grow a queue on our
1730d7313dc6SRandall Stewart 		 * hardware. Send back the non-hardware
1731d7313dc6SRandall Stewart 		 * rate.
1732d7313dc6SRandall Stewart 		 */
17331a714ff2SRandall Stewart 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
17341a714ff2SRandall Stewart 				    te->rate, te->time_between, (uint32_t)res,
17351a714ff2SRandall Stewart 				    0, 0, 0, 4);
1736d7313dc6SRandall Stewart 		if (err)
1737d7313dc6SRandall Stewart 			*err = -1;
1738d7313dc6SRandall Stewart 		return (new_tso);
1739d7313dc6SRandall Stewart 	}
1740d7313dc6SRandall Stewart }
1741d7313dc6SRandall Stewart 
17421a714ff2SRandall Stewart uint64_t
tcp_hw_highest_rate_ifp(struct ifnet * ifp,struct inpcb * inp)17431a714ff2SRandall Stewart tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
17441a714ff2SRandall Stewart {
17451a714ff2SRandall Stewart 	struct epoch_tracker et;
17461a714ff2SRandall Stewart 	struct tcp_rate_set *rs;
17471a714ff2SRandall Stewart 	uint64_t rate_ret;
17481a714ff2SRandall Stewart 
17491a714ff2SRandall Stewart 	NET_EPOCH_ENTER(et);
17501a714ff2SRandall Stewart use_next_interface:
17511a714ff2SRandall Stewart 	rs = find_rs_for_ifp(ifp);
17521a714ff2SRandall Stewart 	if (rs == NULL) {
17531a714ff2SRandall Stewart 		/* This interface does not do ratelimiting */
17541a714ff2SRandall Stewart 		rate_ret = 0;
17551a714ff2SRandall Stewart 	} else if (rs->rs_flags & RS_IS_DEFF) {
17561a714ff2SRandall Stewart 		/* We need to find the real interface */
17571a714ff2SRandall Stewart 		struct ifnet *tifp;
17581a714ff2SRandall Stewart 
17591a714ff2SRandall Stewart 		tifp = rt_find_real_interface(ifp, inp, NULL);
17601a714ff2SRandall Stewart 		if (tifp == NULL) {
17611a714ff2SRandall Stewart 			NET_EPOCH_EXIT(et);
17621a714ff2SRandall Stewart 			return (0);
17631a714ff2SRandall Stewart 		}
17641a714ff2SRandall Stewart 		ifp = tifp;
17651a714ff2SRandall Stewart 		goto use_next_interface;
17661a714ff2SRandall Stewart 	} else {
17671a714ff2SRandall Stewart 		/* Lets return the highest rate this guy has */
17681a714ff2SRandall Stewart 		rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
17691a714ff2SRandall Stewart 	}
17701a714ff2SRandall Stewart 	NET_EPOCH_EXIT(et);
17711a714ff2SRandall Stewart 	return(rate_ret);
17721a714ff2SRandall Stewart }
17731a714ff2SRandall Stewart 
177420abea66SRandall Stewart static eventhandler_tag rl_ifnet_departs;
177520abea66SRandall Stewart static eventhandler_tag rl_ifnet_arrives;
177620abea66SRandall Stewart static eventhandler_tag rl_shutdown_start;
177720abea66SRandall Stewart 
177820abea66SRandall Stewart static void
tcp_rs_init(void * st __unused)177920abea66SRandall Stewart tcp_rs_init(void *st __unused)
178020abea66SRandall Stewart {
178120abea66SRandall Stewart 	CK_LIST_INIT(&int_rs);
178220abea66SRandall Stewart 	rs_number_alive = 0;
1783c012cfe6SEd Maste 	rs_number_dead = 0;
178420abea66SRandall Stewart 	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
178520abea66SRandall Stewart 	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
178620abea66SRandall Stewart 	    tcp_rl_ifnet_departure,
178720abea66SRandall Stewart 	    NULL, EVENTHANDLER_PRI_ANY);
178820abea66SRandall Stewart 	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
178920abea66SRandall Stewart 	    tcp_rl_ifnet_link,
179020abea66SRandall Stewart 	    NULL, EVENTHANDLER_PRI_ANY);
179120abea66SRandall Stewart 	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
179220abea66SRandall Stewart 	    tcp_rl_shutdown, NULL,
179320abea66SRandall Stewart 	    SHUTDOWN_PRI_FIRST);
179420abea66SRandall Stewart 	printf("TCP_ratelimit: Is now initialized\n");
179520abea66SRandall Stewart }
179620abea66SRandall Stewart 
179720abea66SRandall Stewart SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
179820abea66SRandall Stewart #endif
1799