1 /*- 2 * Copyright (c) 2007, Chelsio Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 * 11 * 2. Neither the name of the Chelsio Corporation nor the names of its 12 * contributors may be used to endorse or promote products derived from 13 * this software without specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * POSSIBILITY OF SUCH DAMAGE. 26 * 27 * $FreeBSD$ 28 */ 29 30 #ifndef _NETINET_TCP_OFFLOAD_H_ 31 #define _NETINET_TCP_OFFLOAD_H_ 32 33 #ifndef _KERNEL 34 #error "no user-serviceable parts inside" 35 #endif 36 37 /* 38 * A driver publishes that it provides offload services 39 * by setting IFCAP_TOE in the ifnet. The offload connect 40 * will bypass any further work if the interface that a 41 * connection would use does not support TCP offload. 
 *
 * The TOE API assumes that the tcp offload engine can offload the
 * entire connection from set up to teardown, with some provision
 * being made to allow the software stack to handle time wait. If
 * the device does not meet these criteria, it is the driver's responsibility
 * to overload the functions that it needs to in tcp_usrreqs and make
 * its own calls to tcp_output if it needs to do so.
 *
 * There is currently no provision for the device advertising the congestion
 * control algorithms it supports as there is currently no API for querying
 * an operating system for the protocols that it has loaded. This is a desirable
 * future extension.
 *
 *
 *
 * It is assumed that individuals deploying TOE will want connections
 * to be offloaded without software changes so all connections on an
 * interface providing TOE are offloaded unless the SO_NO_OFFLOAD
 * flag is set on the socket.
 *
 *
 * The toe_usrreqs structure constitutes the TOE driver's
 * interface to the TCP stack for functionality that doesn't
 * interact directly with userspace. If one wants to provide
 * (optional) functionality to do zero-copy to/from
 * userspace one still needs to override soreceive/sosend
 * with functions that fault in and pin the user buffers.
 *
 * + tu_send
 *   - tells the driver that new data may have been added to the
 *     socket's send buffer - the driver should not fail if the
 *     buffer is in fact unchanged
 *   - the driver is responsible for providing credits (bytes in the send window)
 *     back to the socket by calling sbdrop() as segments are acknowledged.
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
 *
 * + tu_rcvd
 *   - returns credits to the driver and triggers window updates
 *     to the peer (a credit as used here is a byte in the peer's receive window)
 *   - the driver is expected to determine how many bytes have been
 *     consumed and credit that back to the card so that it can grow
 *     the window again by maintaining its own state between invocations.
 *   - In principle this could be used to shrink the window as well as
 *     grow the window, although it is not used for that now.
 *   - this function needs to correctly handle being called any number of
 *     times without any bytes being consumed from the receive buffer.
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
 *
 * + tu_disconnect
 *   - tells the driver to send FIN to peer
 *   - driver is expected to send the remaining data and then do a clean half close
 *   - disconnect implies at least half-close so only send, reset, and detach
 *     are legal
 *   - the driver is expected to handle transition through the shutdown
 *     state machine and allow the stack to support SO_LINGER.
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
 *
 * + tu_reset
 *   - closes the connection and sends a RST to peer
 *   - driver is expected to trigger an RST and detach the toepcb
 *   - no further calls are legal after reset
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
 *
 *   The following fields in the tcpcb are expected to be referenced by the driver:
 *     + iss
 *     + rcv_nxt
 *     + rcv_wnd
 *     + snd_isn
 *     + snd_max
 *     + snd_nxt
 *     + snd_una
 *     + t_flags
 *     + t_inpcb
 *     + t_maxseg
 *     + t_toe
 *
 *   The following fields in the inpcb are expected to be referenced by the driver:
 *     + inp_lport
 *     + inp_fport
 *     + inp_laddr
 *     + inp_faddr
 *     + inp_socket
 *     + inp_ip_tos
 *
 *   The following fields in the socket are expected to be referenced by the
 *   driver:
 *     + so_comp
 *     + so_error
 *     + so_linger
 *     + so_options
 *     + so_rcv
 *     + so_snd
 *     + so_state
 *     + so_timeo
 *
 *   These functions all return 0 on success and can return the following errors
 *   as appropriate:
 *     + EPERM:
 *     + ENOBUFS: memory allocation failed
 *     + EMSGSIZE: MTU changed during the call
 *     + EHOSTDOWN:
 *     + EHOSTUNREACH:
 *     + ENETDOWN:
 *     + ENETUNREACH: the peer is no longer reachable
 *
 * + tu_detach
 *   - tells driver that the socket is going away so disconnect
 *     the toepcb and free appropriate resources
 *   - allows the driver to cleanly handle the case of connection state
 *     outliving the socket
 *   - no further calls are legal after detach
 *   - the driver is expected to provide its own synchronization between
 *     detach and receiving new data.
163 * 164 * + tu_syncache_event 165 * - even if it is not actually needed, the driver is expected to 166 * call syncache_add for the initial SYN and then syncache_expand 167 * for the SYN,ACK 168 * - tells driver that a connection either has not been added or has 169 * been dropped from the syncache 170 * - the driver is expected to maintain state that lives outside the 171 * software stack so the syncache needs to be able to notify the 172 * toe driver that the software stack is not going to create a connection 173 * for a received SYN 174 * - The driver is responsible for any synchronization required between 175 * the syncache dropping an entry and the driver processing the SYN,ACK. 176 * 177 */ 178 struct toe_usrreqs { 179 int (*tu_send)(struct tcpcb *tp); 180 int (*tu_rcvd)(struct tcpcb *tp); 181 int (*tu_disconnect)(struct tcpcb *tp); 182 int (*tu_reset)(struct tcpcb *tp); 183 void (*tu_detach)(struct tcpcb *tp); 184 void (*tu_syncache_event)(int event, void *toep); 185 }; 186 187 /* 188 * Proxy for struct tcpopt between TOE drivers and TCP functions. 189 */ 190 struct toeopt { 191 u_int64_t to_flags; /* see tcpopt in tcp_var.h */ 192 u_int16_t to_mss; /* maximum segment size */ 193 u_int8_t to_wscale; /* window scaling */ 194 195 u_int8_t _pad1; /* explicit pad for 64bit alignment */ 196 u_int32_t _pad2; /* explicit pad for 64bit alignment */ 197 u_int64_t _pad3[4]; /* TBD */ 198 }; 199 200 #define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */ 201 #define TOE_SC_DROP 2 /* connection was timed out */ 202 203 /* 204 * Because listen is a one-to-many relationship (a socket can be listening 205 * on all interfaces on a machine some of which may be using different TCP 206 * offload devices), listen uses a publish/subscribe mechanism. The TCP 207 * offload driver registers a listen notification function with the stack. 
208 * When a listen socket is created all TCP offload devices are notified 209 * so that they can do the appropriate set up to offload connections on the 210 * port to which the socket is bound. When the listen socket is closed, 211 * the offload devices are notified so that they will stop listening on that 212 * port and free any associated resources as well as sending RSTs on any 213 * connections in the SYN_RCVD state. 214 * 215 */ 216 217 typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); 218 typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); 219 220 EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); 221 EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); 222 223 /* 224 * Check if the socket can be offloaded by the following steps: 225 * - determine the egress interface 226 * - check the interface for TOE capability and TOE is enabled 227 * - check if the device has resources to offload the connection 228 */ 229 int tcp_offload_connect(struct socket *so, struct sockaddr *nam); 230 231 /* 232 * The tcp_output_* routines are wrappers around the toe_usrreqs calls 233 * which trigger packet transmission. In the non-offloaded case they 234 * translate to tcp_output. The tcp_offload_* routines notify TOE 235 * of specific events. I the non-offloaded case they are no-ops. 236 * 237 * Listen is a special case because it is a 1 to many relationship 238 * and there can be more than one offload driver in the system. 
239 */ 240 241 /* 242 * Connection is offloaded 243 */ 244 #define tp_offload(tp) ((tp)->t_flags & TF_TOE) 245 246 /* 247 * hackish way of allowing this file to also be included by TOE 248 * which needs to be kept ignorant of socket implementation details 249 */ 250 #ifdef _SYS_SOCKETVAR_H_ 251 /* 252 * The socket has not been marked as "do not offload" 253 */ 254 #define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0) 255 256 static __inline int 257 tcp_output_connect(struct socket *so, struct sockaddr *nam) 258 { 259 struct tcpcb *tp = sototcpcb(so); 260 int error; 261 262 /* 263 * If offload has been disabled for this socket or the 264 * connection cannot be offloaded just call tcp_output 265 * to start the TCP state machine. 266 */ 267 #ifndef TCP_OFFLOAD_DISABLE 268 if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0) 269 #endif 270 error = tcp_output(tp); 271 return (error); 272 } 273 274 static __inline int 275 tcp_output_send(struct tcpcb *tp) 276 { 277 278 #ifndef TCP_OFFLOAD_DISABLE 279 if (tp_offload(tp)) 280 return (tp->t_tu->tu_send(tp)); 281 #endif 282 return (tcp_output(tp)); 283 } 284 285 static __inline int 286 tcp_output_rcvd(struct tcpcb *tp) 287 { 288 289 #ifndef TCP_OFFLOAD_DISABLE 290 if (tp_offload(tp)) 291 return (tp->t_tu->tu_rcvd(tp)); 292 #endif 293 return (tcp_output(tp)); 294 } 295 296 static __inline int 297 tcp_output_disconnect(struct tcpcb *tp) 298 { 299 300 #ifndef TCP_OFFLOAD_DISABLE 301 if (tp_offload(tp)) 302 return (tp->t_tu->tu_disconnect(tp)); 303 #endif 304 return (tcp_output(tp)); 305 } 306 307 static __inline int 308 tcp_output_reset(struct tcpcb *tp) 309 { 310 311 #ifndef TCP_OFFLOAD_DISABLE 312 if (tp_offload(tp)) 313 return (tp->t_tu->tu_reset(tp)); 314 #endif 315 return (tcp_output(tp)); 316 } 317 318 static __inline void 319 tcp_offload_detach(struct tcpcb *tp) 320 { 321 322 #ifndef TCP_OFFLOAD_DISABLE 323 if (tp_offload(tp)) 324 tp->t_tu->tu_detach(tp); 325 #endif 326 } 327 328 static 
__inline void 329 tcp_offload_listen_open(struct tcpcb *tp) 330 { 331 332 #ifndef TCP_OFFLOAD_DISABLE 333 if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket)) 334 EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); 335 #endif 336 } 337 338 static __inline void 339 tcp_offload_listen_close(struct tcpcb *tp) 340 { 341 342 #ifndef TCP_OFFLOAD_DISABLE 343 EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); 344 #endif 345 } 346 #undef SO_OFFLOADABLE 347 #endif /* _SYS_SOCKETVAR_H_ */ 348 #undef tp_offload 349 350 void tcp_offload_twstart(struct tcpcb *tp); 351 struct tcpcb *tcp_offload_close(struct tcpcb *tp); 352 struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error); 353 354 #endif /* _NETINET_TCP_OFFLOAD_H_ */ 355