xref: /freebsd/sys/netinet/tcp_offload.h (revision 8fc257994d0ce2396196d7a06d50d20c8015f4b7)
1 /*-
2  * Copyright (c) 2007, Chelsio Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  *    this list of conditions and the following disclaimer.
10  *
11  * 2. Neither the name of the Chelsio Corporation nor the names of its
12  *    contributors may be used to endorse or promote products derived from
13  *    this software without specific prior written permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25  * POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 #ifndef _NETINET_TCP_OFFLOAD_H_
31 #define	_NETINET_TCP_OFFLOAD_H_
32 
33 #ifndef _KERNEL
34 #error "no user-serviceable parts inside"
35 #endif
36 
37 /*
38  * A driver publishes that it provides offload services
39  * by setting IFCAP_TOE in the ifnet. The offload connect
40  * will bypass any further work if the interface that a
41  * connection would use does not support TCP offload.
42  *
43  * The TOE API assumes that the tcp offload engine can offload the
44  * entire connection from set up to teardown, with some provision
45  * being made to allow the software stack to handle time wait. If
46  * the device does not meet these criteria, it is the driver's responsibility
47  * to overload the functions that it needs to in tcp_usrreqs and make
48  * its own calls to tcp_output if it needs to do so.
49  *
50  * There is currently no provision for the device advertising the congestion
51  * control algorithms it supports as there is currently no API for querying
52  * an operating system for the protocols that it has loaded. This is a desirable
53  * future extension.
54  *
55  *
56  *
57  * It is assumed that individuals deploying TOE will want connections
58  * to be offloaded without software changes so all connections on an
59  * interface providing TOE are offloaded unless the SO_NO_OFFLOAD
60  * flag is set on the socket.
61  *
62  *
63  * The toe_usrreqs structure constitutes the TOE driver's
64  * interface to the TCP stack for functionality that doesn't
65  * interact directly with userspace. If one wants to provide
66  * (optional) functionality to do zero-copy to/from
67  * userspace one still needs to override soreceive/sosend
68  * with functions that fault in and pin the user buffers.
69  *
70  * + tu_send
71  *   - tells the driver that new data may have been added to the
72  *     socket's send buffer - the driver should not fail if the
73  *     buffer is in fact unchanged
74  *   - the driver is responsible for providing credits (bytes in the send window)
75  *     back to the socket by calling sbdrop() as segments are acknowledged.
76  *   - The driver expects the inpcb lock to be held - the driver is expected
77  *     not to drop the lock. Hence the driver is not allowed to acquire the
78  *     pcbinfo lock during this call.
79  *
80  * + tu_rcvd
81  *   - returns credits to the driver and triggers window updates
82  *     to the peer (a credit as used here is a byte in the peer's receive window)
83  *   - the driver is expected to determine how many bytes have been
84  *     consumed and credit that back to the card so that it can grow
85  *     the window again by maintaining its own state between invocations.
86  *   - In principle this could be used to shrink the window as well as
87  *     grow the window, although it is not used for that now.
88  *   - this function needs to correctly handle being called any number of
89  *     times without any bytes being consumed from the receive buffer.
90  *   - The driver expects the inpcb lock to be held - the driver is expected
91  *     not to drop the lock. Hence the driver is not allowed to acquire the
92  *     pcbinfo lock during this call.
93  *
94  * + tu_disconnect
95  *   - tells the driver to send FIN to peer
96  *   - driver is expected to send the remaining data and then do a clean half close
97  *   - disconnect implies at least half-close so only send, reset, and detach
98  *     are legal
99  *   - the driver is expected to handle transition through the shutdown
100  *     state machine and allow the stack to support SO_LINGER.
101  *   - The driver expects the inpcb lock to be held - the driver is expected
102  *     not to drop the lock. Hence the driver is not allowed to acquire the
103  *     pcbinfo lock during this call.
104  *
105  * + tu_reset
106  *   - closes the connection and sends a RST to peer
107  *   - driver is expected to trigger an RST and detach the toepcb
108  *   - no further calls are legal after reset
109  *   - The driver expects the inpcb lock to be held - the driver is expected
110  *     not to drop the lock. Hence the driver is not allowed to acquire the
111  *     pcbinfo lock during this call.
112  *
113  *   The following fields in the tcpcb are expected to be referenced by the driver:
114  *	+ iss
115  *	+ rcv_nxt
116  *	+ rcv_wnd
117  *	+ snd_isn
118  *	+ snd_max
119  *	+ snd_nxt
120  *	+ snd_una
121  *	+ t_flags
122  *	+ t_inpcb
123  *	+ t_maxseg
124  *	+ t_toe
125  *
126  *   The following fields in the inpcb are expected to be referenced by the driver:
127  *	+ inp_lport
128  *	+ inp_fport
129  *	+ inp_laddr
130  *	+ inp_faddr
131  *	+ inp_socket
132  *	+ inp_ip_tos
133  *
134  *   The following fields in the socket are expected to be referenced by the
135  *   driver:
136  *	+ so_comp
137  *	+ so_error
138  *	+ so_linger
139  *	+ so_options
140  *	+ so_rcv
141  *	+ so_snd
142  *	+ so_state
143  *	+ so_timeo
144  *
145  *   These functions all return 0 on success and can return the following errors
146  *   as appropriate:
147  *	+ EPERM:
148  *	+ ENOBUFS: memory allocation failed
149  *	+ EMSGSIZE: MTU changed during the call
150  *	+ EHOSTDOWN:
151  *	+ EHOSTUNREACH:
152  *	+ ENETDOWN:
153  *	+ ENETUNREACH: the peer is no longer reachable
154  *
155  * + tu_detach
156  *   - tells driver that the socket is going away so disconnect
157  *     the toepcb and free appropriate resources
158  *   - allows the driver to cleanly handle the case of connection state
159  *     outliving the socket
160  *   - no further calls are legal after detach
161  *   - the driver is expected to provide its own synchronization between
162  *     detach and receiving new data.
163  *
164  * + tu_syncache_event
165  *   - even if it is not actually needed, the driver is expected to
166  *     call syncache_add for the initial SYN and then syncache_expand
167  *     for the SYN,ACK
168  *   - tells driver that a connection either has not been added or has
169  *     been dropped from the syncache
170  *   - the driver is expected to maintain state that lives outside the
171  *     software stack so the syncache needs to be able to notify the
172  *     toe driver that the software stack is not going to create a connection
173  *     for a received SYN
174  *   - The driver is responsible for any synchronization required between
175  *     the syncache dropping an entry and the driver processing the SYN,ACK.
176  *
177  */
/*
 * Operations vector a TOE driver supplies for an offloaded connection.
 * The inline wrappers in this header reach it through tp->t_tu.  The
 * full contract for every hook (locking, legal call ordering, error
 * codes) is given in the API description preceding this structure.
 */
struct toe_usrreqs {
	/* New data may have been added to the socket's send buffer. */
	int	(*tu_send)(struct tcpcb *tp);

	/* Return receive-window credits to the driver. */
	int	(*tu_rcvd)(struct tcpcb *tp);

	/* Send remaining data, then FIN (clean half close). */
	int	(*tu_disconnect)(struct tcpcb *tp);

	/* Close the connection and send a RST to the peer. */
	int	(*tu_reset)(struct tcpcb *tp);

	/* The socket is going away: disconnect the toepcb. */
	void	(*tu_detach)(struct tcpcb *tp);

	/* Syncache notification for the driver state behind toep. */
	void	(*tu_syncache_event)(int event, void *toep);
};
186 
/*
 * Stand-in for struct tcpopt, used when TCP options are exchanged
 * between TOE drivers and the TCP functions.  Padding is spelled out
 * member by member instead of being left to the compiler.
 */
struct toeopt {
	u_int64_t	to_flags;	/* see tcpopt in tcp_var.h */
	u_int16_t	to_mss;		/* maximum segment size */
	u_int8_t	to_wscale;	/* window scaling */

	/* Explicit padding up to the next 64-bit boundary. */
	u_int8_t	_pad1;
	u_int32_t	_pad2;

	/* Spare space; its use is still to be decided (TBD). */
	u_int64_t	_pad3[4];
};
199 
/*
 * Syncache event codes.
 * NOTE(review): presumably these are the values handed to
 * tu_syncache_event() as its event argument - confirm against the
 * syncache code.
 */
#define	TOE_SC_ENTRY_PRESENT	1	/* 4-tuple already present */
#define	TOE_SC_DROP		2	/* connection was timed out */
202 
203 /*
204  * Because listen is a one-to-many relationship (a socket can be listening
205  * on all interfaces on a machine some of which may be using different TCP
206  * offload devices), listen uses a publish/subscribe mechanism. The TCP
207  * offload driver registers a listen notification function with the stack.
208  * When a listen socket is created all TCP offload devices are notified
209  * so that they can do the appropriate set up to offload connections on the
210  * port to which the socket is bound. When the listen socket is closed,
211  * the offload devices are notified so that they will stop listening on that
212  * port and free any associated resources as well as sending RSTs on any
213  * connections in the SYN_RCVD state.
214  *
215  */
216 
217 typedef	void	(*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
218 typedef	void	(*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
219 
220 EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
221 EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
222 
/*
 * Try to offload an outgoing connection.  The socket can be offloaded
 * only if all of the following hold:
 * - the egress interface can be determined
 * - that interface has TOE capability and TOE is enabled on it
 * - the device has the resources to offload the connection
 *
 * A return of 0 means the connection was offloaded; on any other value
 * the caller (tcp_output_connect) falls back to tcp_output().
 */
int	tcp_offload_connect(struct socket *so, struct sockaddr *nam);
230 
231 /*
232  * The tcp_output_* routines are wrappers around the toe_usrreqs calls
233  * which trigger packet transmission. In the non-offloaded case they
234  * translate to tcp_output. The tcp_offload_* routines notify TOE
235  * of specific events. In the non-offloaded case they are no-ops.
236  *
237  * Listen is a special case because it is a 1 to many relationship
238  * and there can be more than one offload driver in the system.
239  */
240 
/*
 * Non-zero when the connection behind tp is offloaded, i.e. TF_TOE is
 * set in its t_flags.  The result is the masked flag bit itself, not a
 * normalized 0/1 value, so use it only in a boolean context.
 */
#define	tp_offload(tp)		((tp)->t_flags & TF_TOE)
245 
246 /*
247  * hackish way of allowing this file to also be included by TOE
248  * which needs to be kept ignorant of socket implementation details
249  */
250 #ifdef _SYS_SOCKETVAR_H_
/*
 * True when the socket has not been marked as "do not offload", that
 * is, when SO_NO_OFFLOAD is clear in so_options.
 *
 * The argument is wrapped in parentheses so that any pointer-valued
 * expression may be passed, not only a plain identifier or member
 * chain; without them an argument such as `p + 1' would bind the
 * `->' to the wrong operand.
 */
#define	SO_OFFLOADABLE(so)	(((so)->so_options & SO_NO_OFFLOAD) == 0)
255 
/*
 * Start an outgoing connection on so.
 *
 * The connection is offered to the offload path first.  If the socket
 * has opted out of offload, or tcp_offload_connect() reports that the
 * connection cannot be offloaded, tcp_output() is called instead to
 * start the TCP state machine in software.  When TCP_OFFLOAD_DISABLE
 * is defined the offload attempt is compiled out entirely.
 *
 * Returns 0 when the offload connect succeeded, otherwise the result
 * of tcp_output().
 */
static __inline int
tcp_output_connect(struct socket *so, struct sockaddr *nam)
{
	struct tcpcb *tp = sototcpcb(so);

#ifndef TCP_OFFLOAD_DISABLE
	if (SO_OFFLOADABLE(so) && tcp_offload_connect(so, nam) == 0)
		return (0);
#endif
	return (tcp_output(tp));
}
273 
274 static __inline int
275 tcp_output_send(struct tcpcb *tp)
276 {
277 
278 #ifndef TCP_OFFLOAD_DISABLE
279 	if (tp_offload(tp))
280 		return (tp->t_tu->tu_send(tp));
281 #endif
282 	return (tcp_output(tp));
283 }
284 
285 static __inline int
286 tcp_output_rcvd(struct tcpcb *tp)
287 {
288 
289 #ifndef TCP_OFFLOAD_DISABLE
290 	if (tp_offload(tp))
291 		return (tp->t_tu->tu_rcvd(tp));
292 #endif
293 	return (tcp_output(tp));
294 }
295 
296 static __inline int
297 tcp_output_disconnect(struct tcpcb *tp)
298 {
299 
300 #ifndef TCP_OFFLOAD_DISABLE
301 	if (tp_offload(tp))
302 		return (tp->t_tu->tu_disconnect(tp));
303 #endif
304 	return (tcp_output(tp));
305 }
306 
307 static __inline int
308 tcp_output_reset(struct tcpcb *tp)
309 {
310 
311 #ifndef TCP_OFFLOAD_DISABLE
312 	if (tp_offload(tp))
313 		return (tp->t_tu->tu_reset(tp));
314 #endif
315 	return (tcp_output(tp));
316 }
317 
318 static __inline void
319 tcp_offload_detach(struct tcpcb *tp)
320 {
321 
322 #ifndef TCP_OFFLOAD_DISABLE
323 	if (tp_offload(tp))
324 		tp->t_tu->tu_detach(tp);
325 #endif
326 }
327 
328 static __inline void
329 tcp_offload_listen_open(struct tcpcb *tp)
330 {
331 
332 #ifndef TCP_OFFLOAD_DISABLE
333 	if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
334 		EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
335 #endif
336 }
337 
338 static __inline void
339 tcp_offload_listen_close(struct tcpcb *tp)
340 {
341 
342 #ifndef TCP_OFFLOAD_DISABLE
343 	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
344 #endif
345 }
346 #undef SO_OFFLOADABLE
347 #endif /* _SYS_SOCKETVAR_H_ */
348 #undef tp_offload
349 
/*
 * Out-of-line offload entry points, declared here and implemented
 * elsewhere.
 * NOTE(review): judging by the names these look like the offload-aware
 * counterparts of time-wait start, close and drop processing; the
 * meaning of the returned tcpcb pointer is not visible from this
 * header - confirm against the implementation.
 */
void		 tcp_offload_twstart(struct tcpcb *tp);
struct tcpcb	*tcp_offload_close(struct tcpcb *tp);
struct tcpcb	*tcp_offload_drop(struct tcpcb *tp, int error);
353 
354 #endif /* _NETINET_TCP_OFFLOAD_H_ */
355