xref: /freebsd/sys/netinet/tcp_offload.h (revision bc65987ade60fb8ce3ec6c7241cda809bcdc661e)
1 /*-
2  * Copyright (c) 2007, Chelsio Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  *    this list of conditions and the following disclaimer.
10  *
11  * 2. Neither the name of the Chelsio Corporation nor the names of its
12  *    contributors may be used to endorse or promote products derived from
13  *    this software without specific prior written permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25  * POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 #ifndef _NETINET_TCP_OFFLOAD_H_
31 #define	_NETINET_TCP_OFFLOAD_H_
32 
33 #ifndef _KERNEL
34 #error "no user-serviceable parts inside"
35 #endif
36 
37 /*
38  * A driver publishes that it provides offload services
39  * by setting IFCAP_TOE in the ifnet. The offload connect
40  * will bypass any further work if the interface that a
41  * connection would use does not support TCP offload.
42  *
 * The TOE API assumes that the TCP offload engine can offload the
 * entire connection from set up to teardown, with some provision
 * being made to allow the software stack to handle time wait. If
46  * the device does not meet these criteria, it is the driver's responsibility
47  * to overload the functions that it needs to in tcp_usrreqs and make
48  * its own calls to tcp_output if it needs to do so.
49  *
50  * There is currently no provision for the device advertising the congestion
51  * control algorithms it supports as there is currently no API for querying
52  * an operating system for the protocols that it has loaded. This is a desirable
53  * future extension.
54  *
55  *
56  *
57  * It is assumed that individuals deploying TOE will want connections
58  * to be offloaded without software changes so all connections on an
 * interface providing TOE are offloaded unless the SO_NO_OFFLOAD
60  * flag is set on the socket.
61  *
62  *
63  * The toe_usrreqs structure constitutes the TOE driver's
64  * interface to the TCP stack for functionality that doesn't
65  * interact directly with userspace. If one wants to provide
66  * (optional) functionality to do zero-copy to/from
67  * userspace one still needs to override soreceive/sosend
68  * with functions that fault in and pin the user buffers.
69  *
70  * + tu_send
71  *   - tells the driver that new data may have been added to the
72  *     socket's send buffer - the driver should not fail if the
73  *     buffer is in fact unchanged
74  *   - the driver is responsible for providing credits (bytes in the send window)
75  *     back to the socket by calling sbdrop() as segments are acknowledged.
76  *   - The driver expects the inpcb lock to be held - the driver is expected
77  *     not to drop the lock. Hence the driver is not allowed to acquire the
78  *     pcbinfo lock during this call.
79  *
80  * + tu_rcvd
81  *   - returns credits to the driver and triggers window updates
82  *     to the peer (a credit as used here is a byte in the peer's receive window)
83  *   - the driver is expected to determine how many bytes have been
84  *     consumed and credit that back to the card so that it can grow
85  *     the window again by maintaining its own state between invocations.
86  *   - In principle this could be used to shrink the window as well as
87  *     grow the window, although it is not used for that now.
88  *   - this function needs to correctly handle being called any number of
89  *     times without any bytes being consumed from the receive buffer.
90  *   - The driver expects the inpcb lock to be held - the driver is expected
91  *     not to drop the lock. Hence the driver is not allowed to acquire the
92  *     pcbinfo lock during this call.
93  *
94  * + tu_disconnect
95  *   - tells the driver to send FIN to peer
96  *   - driver is expected to send the remaining data and then do a clean half close
97  *   - disconnect implies at least half-close so only send, reset, and detach
98  *     are legal
99  *   - the driver is expected to handle transition through the shutdown
100  *     state machine and allow the stack to support SO_LINGER.
101  *   - The driver expects the inpcb lock to be held - the driver is expected
102  *     not to drop the lock. Hence the driver is not allowed to acquire the
103  *     pcbinfo lock during this call.
104  *
105  * + tu_reset
106  *   - closes the connection and sends a RST to peer
 *   - driver is expected to trigger an RST and detach the toepcb
108  *   - no further calls are legal after reset
109  *   - The driver expects the inpcb lock to be held - the driver is expected
110  *     not to drop the lock. Hence the driver is not allowed to acquire the
111  *     pcbinfo lock during this call.
112  *
113  *   The following fields in the tcpcb are expected to be referenced by the driver:
114  *	+ iss
115  *	+ rcv_nxt
116  *	+ rcv_wnd
117  *	+ snd_isn
118  *	+ snd_max
119  *	+ snd_nxt
120  *	+ snd_una
121  *	+ t_flags
122  *	+ t_inpcb
123  *	+ t_maxseg
124  *	+ t_toe
125  *
126  *   The following fields in the inpcb are expected to be referenced by the driver:
127  *	+ inp_lport
128  *	+ inp_fport
129  *	+ inp_laddr
 *	+ inp_faddr
131  *	+ inp_socket
132  *	+ inp_ip_tos
133  *
134  *   The following fields in the socket are expected to be referenced by the
135  *   driver:
136  *	+ so_comp
137  *	+ so_error
138  *	+ so_linger
139  *	+ so_options
140  *	+ so_rcv
141  *	+ so_snd
142  *	+ so_state
143  *	+ so_timeo
144  *
145  *   These functions all return 0 on success and can return the following errors
146  *   as appropriate:
147  *	+ EPERM:
148  *	+ ENOBUFS: memory allocation failed
149  *	+ EMSGSIZE: MTU changed during the call
150  *	+ EHOSTDOWN:
151  *	+ EHOSTUNREACH:
152  *	+ ENETDOWN:
 *	+ ENETUNREACH: the peer is no longer reachable
154  *
155  * + tu_detach
156  *   - tells driver that the socket is going away so disconnect
157  *     the toepcb and free appropriate resources
158  *   - allows the driver to cleanly handle the case of connection state
159  *     outliving the socket
160  *   - no further calls are legal after detach
161  *   - the driver is expected to provide its own synchronization between
162  *     detach and receiving new data.
163  *
164  * + tu_syncache_event
165  *   - even if it is not actually needed, the driver is expected to
166  *     call syncache_add for the initial SYN and then syncache_expand
167  *     for the SYN,ACK
168  *   - tells driver that a connection either has not been added or has
169  *     been dropped from the syncache
170  *   - the driver is expected to maintain state that lives outside the
171  *     software stack so the syncache needs to be able to notify the
172  *     toe driver that the software stack is not going to create a connection
173  *     for a received SYN
174  *   - The driver is responsible for any synchronization required between
175  *     the syncache dropping an entry and the driver processing the SYN,ACK.
176  *
177  */
/*
 * Operations vector a TOE driver supplies to the TCP stack for an
 * offloaded connection.
 */
struct toe_usrreqs {
	/*
	 * Hooks that return 0 on success or an errno value on failure.
	 */
	int	(*tu_send)(struct tcpcb *tp);		/* send buffer may have new data */
	int	(*tu_rcvd)(struct tcpcb *tp);		/* return receive credits */
	int	(*tu_disconnect)(struct tcpcb *tp);	/* send FIN to the peer */
	int	(*tu_reset)(struct tcpcb *tp);		/* send RST to the peer */

	/*
	 * Notifications that carry no status back to the stack.
	 */
	void	(*tu_detach)(struct tcpcb *tp);		/* socket is going away */
	void	(*tu_syncache_event)(int event, void *toep);
};
186 
/*
 * Syncache event codes.
 * NOTE(review): presumably these are the values passed as the `event'
 * argument of tu_syncache_event - confirm against the syncache code.
 */
#define	TOE_SC_ENTRY_PRESENT	1	/* 4-tuple already present */
#define	TOE_SC_DROP		2	/* connection was timed out */
189 
190 /*
191  * Because listen is a one-to-many relationship (a socket can be listening
192  * on all interfaces on a machine some of which may be using different TCP
193  * offload devices), listen uses a publish/subscribe mechanism. The TCP
194  * offload driver registers a listen notification function with the stack.
195  * When a listen socket is created all TCP offload devices are notified
196  * so that they can do the appropriate set up to offload connections on the
197  * port to which the socket is bound. When the listen socket is closed,
198  * the offload devices are notified so that they will stop listening on that
199  * port and free any associated resources as well as sending RSTs on any
200  * connections in the SYN_RCVD state.
201  *
202  */
203 
204 typedef	void	(*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
205 typedef	void	(*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
206 
207 EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
208 EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
209 
/*
 * Decide whether the connection on `so' can be offloaded, in three steps:
 * - determine the egress interface
 * - check that the interface is TOE capable and that TOE is enabled
 * - check that the device has the resources to offload the connection
 *
 * A non-zero return makes tcp_output_connect() fall back to tcp_output().
 */
int	tcp_offload_connect(struct socket *so, struct sockaddr *nam);
217 
218 /*
219  * The tcp_output_* routines are wrappers around the toe_usrreqs calls
220  * which trigger packet transmission. In the non-offloaded case they
221  * translate to tcp_output. The tcp_offload_* routines notify TOE
 * of specific events. In the non-offloaded case they are no-ops.
223  *
224  * Listen is a special case because it is a 1 to many relationship
225  * and there can be more than one offload driver in the system.
226  */
227 
/*
 * Connection is offloaded: non-zero when TF_TOE is set in the tcpcb's
 * t_flags.  The result is the masked flag value, not a normalized 0/1.
 */
#define	tp_offload(tp)		((tp)->t_flags & TF_TOE)
/*
 * The socket has not been marked as "do not offload": true when
 * SO_NO_OFFLOAD is clear in so_options.  The argument is parenthesized so
 * that any pointer-valued expression, not just a plain identifier, expands
 * correctly.
 */
#define	SO_OFFLOADABLE(so)	(((so)->so_options & SO_NO_OFFLOAD) == 0)
236 
/*
 * Start an outgoing connection.
 *
 * Unless offload support is compiled out, an offloadable socket is first
 * offered to tcp_offload_connect(); if that succeeds nothing more is done
 * and 0 is returned.  In every other case (offload compiled out, socket
 * marked SO_NO_OFFLOAD, or the offload attempt failed) tcp_output() starts
 * the software TCP state machine and its result is returned.
 */
static __inline int
tcp_output_connect(struct socket *so, struct sockaddr *nam)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
#ifndef TCP_OFFLOAD_DISABLE
	if (SO_OFFLOADABLE(so) && tcp_offload_connect(so, nam) == 0)
		return (0);
#endif
	return (tcp_output(tp));
}
254 
255 static __inline int
256 tcp_output_send(struct tcpcb *tp)
257 {
258 
259 #ifndef TCP_OFFLOAD_DISABLE
260 	if (tp_offload(tp))
261 		return (tp->t_tu->tu_send(tp));
262 #endif
263 	return (tcp_output(tp));
264 }
265 
266 static __inline int
267 tcp_output_rcvd(struct tcpcb *tp)
268 {
269 
270 #ifndef TCP_OFFLOAD_DISABLE
271 	if (tp_offload(tp))
272 		return (tp->t_tu->tu_rcvd(tp));
273 #endif
274 	return (tcp_output(tp));
275 }
276 
277 static __inline int
278 tcp_output_disconnect(struct tcpcb *tp)
279 {
280 
281 #ifndef TCP_OFFLOAD_DISABLE
282 	if (tp_offload(tp))
283 		return (tp->t_tu->tu_disconnect(tp));
284 #endif
285 	return (tcp_output(tp));
286 }
287 
288 static __inline int
289 tcp_output_reset(struct tcpcb *tp)
290 {
291 
292 #ifndef TCP_OFFLOAD_DISABLE
293 	if (tp_offload(tp))
294 		return (tp->t_tu->tu_reset(tp));
295 #endif
296 	return (tcp_output(tp));
297 }
298 
299 static __inline void
300 tcp_offload_detach(struct tcpcb *tp)
301 {
302 
303 #ifndef TCP_OFFLOAD_DISABLE
304 	if (tp_offload(tp))
305 		tp->t_tu->tu_detach(tp);
306 #endif
307 }
308 
309 static __inline void
310 tcp_offload_listen_open(struct tcpcb *tp)
311 {
312 
313 #ifndef TCP_OFFLOAD_DISABLE
314 	if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
315 		EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
316 #endif
317 }
318 
319 static __inline void
320 tcp_offload_listen_close(struct tcpcb *tp)
321 {
322 
323 #ifndef TCP_OFFLOAD_DISABLE
324 	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
325 #endif
326 }
327 
328 #undef tp_offload
329 #undef SO_OFFLOADABLE
330 #endif /* _NETINET_TCP_OFFLOAD_H_ */
331