xref: /illumos-gate/usr/src/uts/common/inet/ip_impl.h (revision db874c57ae335a07060499f1492b0d0e2593e26c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef	_INET_IP_IMPL_H
28 #define	_INET_IP_IMPL_H
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 /*
33  * IP implementation private declarations.  These interfaces are
34  * used to build the IP module and are not meant to be accessed
35  * by any modules except IP itself.  They are undocumented and are
36  * subject to change without notice.
37  */
38 
39 #ifdef	__cplusplus
40 extern "C" {
41 #endif
42 
43 #ifdef _KERNEL
44 
/*
 * Numeric identifier of the IP module.
 * NOTE(review): presumably the STREAMS module ID -- confirm at the use site.
 */
45 #define	IP_MOD_ID		5701
46 
/*
 * Constants whose numeric value depends on the host byte order.  The two
 * branches yield the same in-memory bytes when stored as a 16-bit word:
 *
 * IP_HDR_CSUM_TTL_ADJUST	a one in the first byte of the word.
 *	NOTE(review): presumably the step applied to the header checksum
 *	when the TTL changes by one; it is not used in this file, so
 *	confirm at the use site.
 * IP_TCP_CSUM_COMP,
 * IP_UDP_CSUM_COMP		the protocol number in the second byte of
 *	the word; IP_CKSUM_XMIT_FAST uses these as the first term of the
 *	sum to which the caller's pseudo value is added.
 */
47 #ifdef	_BIG_ENDIAN
48 #define	IP_HDR_CSUM_TTL_ADJUST	256
49 #define	IP_TCP_CSUM_COMP	IPPROTO_TCP
50 #define	IP_UDP_CSUM_COMP	IPPROTO_UDP
51 #else
52 #define	IP_HDR_CSUM_TTL_ADJUST	1
53 #define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
54 #define	IP_UDP_CSUM_COMP	(IPPROTO_UDP << 8)
55 #endif
56 
/*
 * Byte offset of the checksum field from the start of the TCP or UDP
 * header, and the size of that field in bytes.  The offsets are added to
 * the IP header length by IPH_TCPH_CHECKSUMP and IPH_UDPH_CHECKSUMP.
 */
57 #define	TCP_CHECKSUM_OFFSET	16
58 #define	TCP_CHECKSUM_SIZE	2
59 
60 #define	UDP_CHECKSUM_OFFSET	6
61 #define	UDP_CHECKSUM_SIZE	2
62 
/*
 * Yield a uint16_t pointer to the TCP checksum field, located
 * hlen + TCP_CHECKSUM_OFFSET bytes past the address ipha.
 */
63 #define	IPH_TCPH_CHECKSUMP(ipha, hlen)	\
64 	((uint16_t *)&((uchar_t *)(ipha))[(hlen) + TCP_CHECKSUM_OFFSET])
65 
/*
 * Yield a uint16_t pointer to the UDP checksum field, located
 * hlen + UDP_CHECKSUM_OFFSET bytes past the address ipha.
 */
66 #define	IPH_UDPH_CHECKSUMP(ipha, hlen)	\
67 	((uint16_t *)&((uchar_t *)(ipha))[(hlen) + UDP_CHECKSUM_OFFSET])
68 
/*
 * Non-zero when the ILL_CAPAB_HCKSUM bit is set in ill->ill_capabilities.
 */
69 #define	ILL_HCKSUM_CAPABLE(ill)		\
70 	(0 != ((ill)->ill_capabilities & ILL_CAPAB_HCKSUM))
71 /*
72  * Macro that performs software checksum calculation on the IP header.
 *
 * Adds ttl_protocol, ipha_ident, the upper and the lower 16 bits of
 * v_hlen_tos_len and ipha_fragment_offset_and_flags to sum, folds the
 * carries back into the low 16 bits, complements the result and stores
 * its low 16 bits in ipha->ipha_hdr_checksum.
 *
 * sum is modified in place, so it must be an lvalue.  The address words of
 * the header are not added here.
 * NOTE(review): callers presumably pre-load sum with them -- confirm.
 *
 * Expands to a brace-enclosed block rather than an expression; ipha and
 * sum are evaluated more than once.
73  */
74 #define	IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) {		\
75 	(sum) += (ttl_protocol) + (ipha)->ipha_ident +			\
76 	    ((v_hlen_tos_len) >> 16) +					\
77 	    ((v_hlen_tos_len) & 0xFFFF) +				\
78 	    (ipha)->ipha_fragment_offset_and_flags;			\
79 	(sum) = (((sum) & 0xFFFF) + ((sum) >> 16));			\
80 	(sum) = ~((sum) + ((sum) >> 16));				\
81 	(ipha)->ipha_hdr_checksum = (uint16_t)(sum);			\
82 }
83 
/*
 * True when the IPv4 header checksum of mp may be left to the hardware:
 * ipsec is false, HCK_IPV4_HDRCKSUM is set in DB_CKSUMFLAGS(mp), the ill
 * is hardware-checksum capable and dohwcksum is non-zero.
 *
 * dohwcksum is not a parameter; it must be visible where the macro is
 * expanded.  ipsec is parenthesized so that a compound argument such as
 * "a || b" is negated as a whole.
 */
84 #define	IS_IP_HDR_HWCKSUM(ipsec, mp, ill)				\
85 	(!(ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&		\
86 	ILL_HCKSUM_CAPABLE(ill) && dohwcksum)
87 
88 /*
89  * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs
90  * several checks on the IRE and ILL (among other things) in order to see
91  * whether or not hardware checksum offload is allowed for the outgoing
92  * packet.  It assumes that the caller has held a reference to the IRE.
 *
 * When every check passes, the ill's ill_hcksum_txflags are handed to
 * IP_CKSUM_XMIT_FAST; otherwise it is given 0, which selects software
 * checksumming there.
 *
 *	ill		may be NULL (offload is then not used)
 *	ire		dereferenced unconditionally; supplies the IP version
 *	mp, ihp, up,
 *	proto, start,
 *	end, pseudo	passed through to IP_CKSUM_XMIT_FAST
 *	max_frag	must be at least end + ipsec_len for offload
 *	ipsec_len	must be 0 for offload
 *
 * dohwcksum is not a parameter; it must be visible where the macro is
 * expanded.  The macro expands to a brace-enclosed block and evaluates
 * its arguments more than once.
93  */
94 #define	IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end,		\
95 	    max_frag, ipsec_len, pseudo) {				\
96 	uint32_t _hck_flags;						\
97 	/*								\
98 	 * We offload checksum calculation to hardware when IPsec isn't	\
99 	 * present and if fragmentation isn't required.  We also check	\
100 	 * if M_DATA fastpath is safe to be used on the	corresponding	\
101 	 * IRE; this check is performed without grabbing ire_lock but	\
102 	 * instead by holding a reference to it.  This is sufficient	\
103 	 * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the	\
104 	 * DL_NOTE_FASTPATH_FLUSH indication could come up from the	\
105 	 * driver and trigger the IRE (hence fp_mp) deletion.  This is	\
106 	 * why only IRE_CACHE type is eligible for offload.		\
107 	 *								\
108 	 * The presence of IP options also forces the network stack to	\
109 	 * calculate the checksum in software.  This is because:	\
110 	 *								\
111 	 * Wrap around: certain partial-checksum NICs (eri, ce) limit	\
112 	 * the size of "start offset" width to 6-bit.  This effectively	\
113 	 * sets the largest value of the offset to 64-bytes, starting	\
114 	 * from the MAC header.  When the cumulative MAC and IP headers	\
115 	 * exceed such limit, the offset will wrap around.  This causes	\
116 	 * the checksum to be calculated at the wrong place.		\
117 	 *								\
118 	 * IPv4 source routing: none of the full-checksum capable NICs	\
119 	 * is capable of correctly handling the	IPv4 source-routing	\
120 	 * option for purposes of calculating the pseudo-header; the	\
121 	 * actual destination is different from the destination in the	\
122 	 * header which is that of the next-hop.  (This case may not be	\
123 	 * true for NICs which can parse IPv6 extension headers, but	\
124 	 * we choose to simplify the implementation by not offloading	\
125 	 * checksum when they are present.)				\
126 	 *								\
127 	 */								\
128 	if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) &&			\
129 	    !((ire)->ire_flags & RTF_MULTIRT) &&			\
130 	    (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) ||	\
131 	    (ill)->ill_type == IFT_ETHER) &&				\
132 	    (ipsec_len) == 0 &&						\
133 	    (((ire)->ire_ipversion == IPV4_VERSION &&			\
134 	    (start) == IP_SIMPLE_HDR_LENGTH &&				\
135 	    (ire)->ire_fp_mp != NULL &&					\
136 	    MBLKHEAD(mp) >= MBLKL((ire)->ire_fp_mp)) ||			\
137 	    ((ire)->ire_ipversion == IPV6_VERSION &&			\
138 	    (start) == IPV6_HDR_LEN &&					\
139 	    (ire)->ire_nce->nce_fp_mp != NULL &&			\
140 	    MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) &&	\
141 	    (max_frag) >= (uint_t)((end) + (ipsec_len)) &&		\
142 	    dohwcksum) {						\
143 		_hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \
144 	} else {							\
145 		_hck_flags = 0;						\
146 	}								\
147 	IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp,	\
148 	    up, proto, start, end, pseudo);				\
149 }
150 
151 /*
152  * Based on the device capabilities, this macro either marks an outgoing
153  * packet with hardware checksum offload information or calculates the
154  * checksum in software.  If the latter is performed, the checksum flags
155  * of the dblk (DB_CKSUMFLAGS) are left cleared; otherwise they will be
156  * non-zero and contain the necessary flag(s) for the driver.
 *
 *	ipver		IPV4_VERSION or IPV6_VERSION
 *	hck_flags	offload capabilities to use; 0 forces software
 *	mp		message whose dblk receives the offload information
 *	ihp		IP header; used as an ipha_t * for IPv4 only
 *	up		pointer to the protocol checksum field (written)
 *	proto		IPPROTO_UDP selects the UDP constants; any other
 *			value selects the TCP ones
 *	start, end	checksum start and end offsets
 *	pseudo		caller-computed value added into the sum
 *
 * The partial-offload path reads *up before overwriting it, so the field
 * must already hold a defined value.  The macro expands to a brace-enclosed
 * block and evaluates its arguments more than once.
157  */
158 #define	IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start,	\
159 	    end, pseudo) {						\
160 	uint32_t _sum;							\
161 	/*								\
162 	 * Underlying interface supports hardware checksum offload for	\
163 	 * the payload; leave the payload checksum for the hardware to	\
164 	 * calculate.  N.B: We only need to set up checksum info on the	\
165 	 * first mblk.							\
166 	 */								\
167 	DB_CKSUMFLAGS(mp) = 0;						\
168 	if (((ipver) == IPV4_VERSION &&					\
169 	    ((hck_flags) & HCKSUM_INET_FULL_V4)) ||			\
170 	    ((ipver) == IPV6_VERSION &&					\
171 	    ((hck_flags) & HCKSUM_INET_FULL_V6))) {			\
172 		/*							\
173 		 * Hardware calculates pseudo-header, header and the	\
174 		 * payload checksums, so clear the checksum field in	\
175 		 * the protocol header.					\
176 		 */							\
177 		*(up) = 0;						\
178 		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;			\
179 	} else if ((hck_flags) & HCKSUM_INET_PARTIAL)  {		\
180 		/*							\
181 		 * Partial checksum offload has been enabled.  Fill	\
182 		 * the checksum field in the protocol header with the	\
183 		 * pseudo-header checksum value.			\
184 		 */							\
185 		_sum = ((proto) == IPPROTO_UDP) ?			\
186 		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;		\
187 		_sum += *(up) + (pseudo);				\
188 		_sum = (_sum & 0xFFFF) + (_sum >> 16);			\
189 		*(up) = (_sum & 0xFFFF) + (_sum >> 16);			\
190 		/*							\
191 		 * Offsets are relative to beginning of IP header.	\
192 		 */							\
193 		DB_CKSUMSTART(mp) = (start);				\
194 		DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ?		\
195 		    (start) + UDP_CHECKSUM_OFFSET :			\
196 		    (start) + TCP_CHECKSUM_OFFSET;			\
197 		DB_CKSUMEND(mp) = (end);				\
198 		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;			\
199 	} else {							\
200 		/*							\
201 		 * Software checksumming.  A computed value of zero is	\
202 		 * stored as its complement instead.			\
203 		 */							\
204 		_sum = ((proto) == IPPROTO_UDP) ?			\
205 		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;		\
206 		_sum += (pseudo);					\
207 		_sum = IP_CSUM(mp, start, _sum);			\
208 		*(up) = (uint16_t)(_sum ? _sum : ~_sum);		\
209 	}								\
210 	/*								\
211 	 * Hardware supports IP header checksum offload; clear the	\
212 	 * contents of IP header checksum field as expected by NIC.	\
213 	 * Do this only if we offloaded either full or partial sum.	\
214 	 */								\
215 	if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 &&	\
216 	    ((hck_flags) & HCKSUM_IPHDRCKSUM)) {			\
217 		DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;			\
218 		((ipha_t *)(ihp))->ipha_hdr_checksum = 0;		\
219 	}								\
220 }
220 
221 /*
222  * Macro to inspect the checksum of a fully-reassembled incoming datagram.
 *
 *	hck_flags	HCK_FULLCKSUM / HCK_PARTIALCKSUM select how sum is
 *			interpreted; with neither set the checksum is
 *			computed in software
 *	off		offset handed to IP_CSUM in the software case
 *	pseudo		value added to sum (partial case) or used as the
 *			initial value for IP_CSUM (software case)
 *	sum		lvalue; read in the full case, folded in place in
 *			the partial case, overwritten in the software case
 *	err		lvalue; set to B_TRUE on mismatch, B_FALSE otherwise
 *
 * The software case refers to a variable named mp that is not a parameter
 * of this macro; it must be visible where the macro is expanded.
223  */
224 #define	IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) {		\
225 	(err) = B_FALSE;						\
226 	if ((hck_flags) & HCK_FULLCKSUM) {				\
227 		/*							\
228 		 * The sum of all fragment checksums should		\
229 		 * result in -0 (0xFFFF) or otherwise invalid.		\
230 		 */							\
231 		if ((sum) != 0xFFFF)					\
232 			(err) = B_TRUE;					\
233 	} else if ((hck_flags) & HCK_PARTIALCKSUM) {			\
234 		(sum) += (pseudo);					\
235 		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);		\
236 		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);		\
237 		if (~(sum) & 0xFFFF)					\
238 			(err) = B_TRUE;					\
239 	} else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) {		\
240 		(err) = B_TRUE;						\
241 	}								\
242 }
243 
244 /*
245  * This macro inspects an incoming packet to see if the checksum value
246  * contained in it is valid; if the hardware has provided the information,
247  * the value is verified, otherwise it performs software checksumming.
248  * The checksum value is returned to caller.
 *
 *	hck_flags	hardware checksum flags for the packet
 *	sum		lvalue; on entry the initial value for the partial
 *			and software cases, on exit the resulting checksum
 *	cksum_start	passed to IP_ADJCKSUM_PARTIAL as the start of any
 *			prepended data covered by the hardware sum
 *	ulph_off	compared with DB_CKSUMSTART(mp) and handed to
 *			IP_CSUM in the software case
 *	mp, mp1		first and optional second mblk; mp1 must be an
 *			lvalue because IP_ADJCKSUM_PARTIAL sets it to mp
 *			when it is NULL
 *	err		lvalue; set to B_TRUE on mismatch, B_FALSE otherwise
 *
 * The hardware partial sum is only used when mp1 has no b_cont and the
 * distance from DB_CKSUMSTART(mp) to ulph_off is non-negative and even;
 * otherwise the checksum is computed in software.
249  */
250 #define	IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \
251 	int32_t _len;							\
252 									\
253 	(err) = B_FALSE;						\
254 	if ((hck_flags) & HCK_FULLCKSUM) {				\
255 		/*							\
256 		 * Full checksum has been computed by the hardware	\
257 		 * and has been attached.  If the driver wants us to	\
258 		 * verify the correctness of the attached value, in	\
259 		 * order to protect against faulty hardware, compare	\
260 		 * it against -0 (0xFFFF) to see if it's valid.		\
261 		 */							\
262 		(sum) = DB_CKSUM16(mp);					\
263 		if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \
264 			(err) = B_TRUE;					\
265 	} else if (((hck_flags) & HCK_PARTIALCKSUM) &&			\
266 	    ((mp1) == NULL || (mp1)->b_cont == NULL) &&			\
267 	    (ulph_off) >= DB_CKSUMSTART(mp) &&				\
268 	    ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) {	\
269 		uint32_t _adj;						\
270 		/*							\
271 		 * Partial checksum has been calculated by hardware	\
272 		 * and attached to the packet; in addition, any		\
273 		 * prepended extraneous data is even byte aligned,	\
274 		 * and there are at most two mblks associated with	\
275 		 * the packet.  If any such data exists, we adjust	\
276 		 * the checksum; also handle any postpended data.	\
277 		 */							\
278 		IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj);	\
279 		/*							\
280 		 * One's complement subtract extraneous checksum	\
281 		 */							\
282 		(sum) += DB_CKSUM16(mp);				\
283 		if (_adj >= (sum))					\
284 			(sum) = ~(_adj - (sum)) & 0xFFFF;		\
285 		else							\
286 			(sum) -= _adj;					\
287 		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);		\
288 		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);		\
289 		if (~(sum) & 0xFFFF)					\
290 			(err) = B_TRUE;					\
291 	} else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) {		\
292 		(err) = B_TRUE;						\
293 	}								\
294 }
295 
296 /*
297  * Macro to adjust a given checksum value depending on any prepended
298  * or postpended data on the packet.  It expects the start offset to
299  * begin at an even boundary and that the packet consists of at most
300  * two mblks.
 *
 *	cksum_start	start of the prepended data, summed for len bytes
 *	mp, mp1		first and optional second mblk
 *	len		lvalue; on entry the length of the prepended data,
 *			overwritten with intermediate lengths
 *	adj		lvalue; receives the sum of the extraneous data
 *
 * Side effects: len is clobbered, and mp1 is set to mp when it is NULL.
301  */
302 #define	IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) {		\
303 	/*								\
304 	 * Prepended extraneous data; adjust checksum.			\
305 	 */								\
306 	if ((len) > 0)							\
307 		(adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0);		\
308 	else								\
309 		(adj) = 0;						\
310 	/*								\
311 	 * len is now the total length of mblk(s)			\
312 	 */								\
313 	(len) = MBLKL(mp);						\
314 	if ((mp1) == NULL)						\
315 		(mp1) = (mp);						\
316 	else								\
317 		(len) += MBLKL(mp1);					\
318 	/*								\
319 	 * Postpended extraneous data; adjust checksum.			\
320 	 */								\
321 	if (((len) = (DB_CKSUMEND(mp) - len)) > 0) {			\
322 		uint32_t _pad;						\
323 									\
324 		_pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0);		\
325 		/*							\
326 		 * If the postpended extraneous data was odd		\
327 		 * byte aligned, swap resulting checksum bytes.		\
328 		 */							\
329 		if ((uintptr_t)(mp1)->b_wptr & 1)			\
330 			(adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8);	\
331 		else							\
332 			(adj) += _pad;					\
333 		(adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16);		\
334 	}								\
335 }
336 
/*
 * Non-zero when the ILL_CAPAB_MDT bit is set in ill->ill_capabilities.
 */
337 #define	ILL_MDT_CAPABLE(ill)		\
338 	(0 != ((ill)->ill_capabilities & ILL_CAPAB_MDT))
339 
340 /*
341  * ioctl identifier and structure for the private M_CTL message that IP
342  * sends to the ULP to announce a Multidata Transmit (MDT) update.
 * The structure carries the identifier followed by a copy of the ILL's
 * MDT capability structure, embedded by value.
343  */
344 #define	MDT_IOC_INFO_UPDATE	(('M' << 8) + 1020)
345 
346 typedef struct ip_mdt_info_s {
347 	uint_t	mdt_info_id;	/* MDT_IOC_INFO_UPDATE */
348 	ill_mdt_capab_t	mdt_capab; /* ILL MDT capabilities */
349 } ip_mdt_info_t;
350 
351 /*
352  * Macro that determines whether or not a given ILL is allowed for MDT.
 * The ILL must be MDT capable, have a non-NULL ill_mdt_capab whose
 * version is MDT_VERSION_2, and have ill_mdt_on set.  The argument is
 * parenthesized at every use so that any pointer expression may be passed.
353  */
354 #define	ILL_MDT_USABLE(ill)						\
355 	(ILL_MDT_CAPABLE(ill) &&					\
356 	(ill)->ill_mdt_capab != NULL &&					\
357 	(ill)->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 &&	\
358 	(ill)->ill_mdt_capab->ill_mdt_on != 0)
359 
360 /*
361  * Fast-path eligibility test applied to a CONN before going any further
362  * with Multidata: true only when none of the options noted below is set.
363  */
364 #define	CONN_IS_MD_FASTPATH(connp)	\
365 	(!((connp)->conn_dontroute != 0 ||	/* SO_DONTROUTE */	\
366 	(connp)->conn_nofailover_ill != NULL ||	/* IPIF_NOFAILOVER */	\
367 	(connp)->conn_xmit_if_ill != NULL ||	/* IP_XMIT_IF */	\
368 	(connp)->conn_outgoing_pill != NULL ||	/* IP{V6}_BOUND_PIF */	\
369 	(connp)->conn_outgoing_ill != NULL))	/* IP{V6}_BOUND_IF */
370 
371 /* Definitions for fragmenting IP packets using MDT. */
372 
373 /*
374  * Smaller and private version of pdescinfo_t used specifically for IP,
375  * which allows for only a single payload span per packet.
 *
 * NOTE(review): the type is instantiated with PDESCINFO_STRUCT(2); how the
 * argument 2 relates to "a single payload span" is not visible here --
 * confirm against the PDESCINFO_STRUCT definition.
376  */
377 typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2)	ip_pdescinfo_t;
378 
379 /*
380  * Inline front end to ip_can_frag_mdt().  A message with a continuation
381  * block is handed to the function; a lone mblk is checked right here by
 * requiring at least hdr_len + ip_wput_frag_mdt_min bytes in it (len is
 * not used in that case).
382  */
383 #define	IP_CAN_FRAG_MDT(mp, hdr_len, len)			\
384 	(((mp)->b_cont != NULL) ?				\
385 	ip_can_frag_mdt((mp), (hdr_len), (len)) :		\
386 	(MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)))
387 
388 /*
389  * Macro that determines whether or not a given conn requires outbound
390  * IPsec processing: either policy enforcement is on, or the conn has a
 * latch that carries an outbound policy.
391  */
392 #define	CONN_IPSEC_OUT_ENCAPSULATED(connp)	\
393 	((connp)->conn_out_enforce_policy ||	\
394 	!((connp)->conn_latch == NULL ||	\
395 	(connp)->conn_latch->ipl_out_policy == NULL))
396 
397 /*
398  * These are used by the synchronous streams code in tcp and udp.
 *
 * STR_WAKEUP_CLEAR: with sd_lock held, clear the RSLEEP bit in sd_wakeq.
 * stp is parenthesized at every use so any pointer expression may be
 * passed; it is evaluated more than once.
399  */
400 #define	STR_WAKEUP_CLEAR(stp) {						\
401 	mutex_enter(&(stp)->sd_lock);					\
402 	(stp)->sd_wakeq &= ~RSLEEP;					\
403 	mutex_exit(&(stp)->sd_lock);					\
404 }
405 
/*
 * With sd_lock held: if RSLEEP is set in sd_flag, clear it and broadcast
 * on the q_wait condition variable of the read queue; otherwise set
 * RSLEEP in sd_wakeq.  stp is parenthesized at every use so any pointer
 * expression may be passed; it is evaluated more than once.
 */
406 #define	STR_WAKEUP_SET(stp) {						\
407 	mutex_enter(&(stp)->sd_lock);					\
408 	if ((stp)->sd_flag & RSLEEP) {					\
409 		(stp)->sd_flag &= ~RSLEEP;				\
410 		cv_broadcast(&_RD((stp)->sd_wrq)->q_wait);		\
411 	} else {							\
412 		(stp)->sd_wakeq |= RSLEEP;				\
413 	}								\
414 	mutex_exit(&(stp)->sd_lock);					\
415 }
416 
/*
 * With sd_lock held: send the S_INPUT/S_RDNORM signals that are enabled
 * in sd_sigflags.  If SR_POLLIN is set in sd_rput_opt, clear it, drop the
 * lock and only then call pollwakeup(); otherwise just drop the lock.
 * stp is parenthesized at every use so any pointer expression may be
 * passed; it is evaluated more than once.
 */
417 #define	STR_SENDSIG(stp) {						\
418 	int _events;							\
419 	mutex_enter(&(stp)->sd_lock);					\
420 	if ((_events = (stp)->sd_sigflags & (S_INPUT | S_RDNORM)) != 0)	\
421 		strsendsig((stp)->sd_siglist, _events, 0, 0);		\
422 	if ((stp)->sd_rput_opt & SR_POLLIN) {				\
423 		(stp)->sd_rput_opt &= ~SR_POLLIN;			\
424 		mutex_exit(&(stp)->sd_lock);				\
425 		pollwakeup(&(stp)->sd_pollist, POLLIN | POLLRDNORM);	\
426 	} else {							\
427 		mutex_exit(&(stp)->sd_lock);				\
428 	}								\
429 }
430 
/*
 * Non-zero when the conn is a UDP conn (IPCL_IS_UDP) whose udp_t has
 * udp_direct_sockfs set.  conn_udp is only dereferenced for a UDP conn.
 */
431 #define	CONN_UDP_SYNCSTR(connp)						\
432 	(IPCL_IS_UDP(connp) ? ((connp)->conn_udp->udp_direct_sockfs != 0) : 0)
433 
434 /*
435  * Macro that checks whether or not a particular UDP conn is
436  * flow-controlling on the read-side.  If udp module is directly
437  * above ip, check to see if the drain queue is full; note here
438  * that we check this without any lock protection because this
439  * is a coarse granularity inbound flow-control.  If the module
440  * above ip is not udp, then use canputnext to determine the
441  * flow-control.
442  *
443  * Note that these checks are done after the conn is found in
444  * the UDP fanout table.  A UDP conn in that table may have its
445  * IPCL_UDP bit cleared from the conn_flags when the application
446  * pops the udp module without issuing an unbind; in this case
447  * IP will still receive packets for the conn and deliver it
448  * upstream via putnext.  This is the reason why we have to test
449  * against IPCL_UDP.
 *
 * CONN_UDP_SYNCSTR is evaluated exactly once, so that a concurrent
 * change of the conn's state cannot make the two alternatives disagree
 * about which kind of conn is being tested.
450  */
451 #define	CONN_UDP_FLOWCTLD(connp)					\
452 	(CONN_UDP_SYNCSTR(connp) ?					\
453 	((connp)->conn_udp->udp_drain_qfull != 0) :			\
454 	!canputnext((connp)->conn_rq))
455 
456 /*
457  * Deliver mp upstream for the given conn.  A conn that is not (or no
458  * longer) marked as UDP gets the message through putnext on its read
459  * queue; otherwise it goes straight to udp_conn_recv.
460  */
461 #define	CONN_UDP_RECV(connp, mp) {					\
462 	if (!IPCL_IS_UDP(connp))					\
463 		putnext((connp)->conn_rq, mp);				\
464 	else								\
465 		udp_conn_recv(connp, mp);				\
466 }
467 
/*
 * Non-zero when the ILL_CAPAB_POLL bit is set in ill->ill_capabilities.
 */
468 #define	ILL_POLL_CAPABLE(ill)	\
469 	(0 != ((ill)->ill_capabilities & ILL_CAPAB_POLL))
470 
471 /*
472  * Macro that hands off one or more messages directly to DLD
473  * when the interface is marked with ILL_CAPAB_POLL.
 *
 * Calls the ill_tx entry point of ill->ill_poll_capab with its
 * ill_tx_handle and mp.  The ASSERTs document what the caller must
 * guarantee: the ILL is poll capable and both the function pointer and
 * the handle are set.  The local is prefixed with an underscore, like
 * the locals of the other macros in this file, so that it cannot shadow
 * an identifier used in the mp argument.
474  */
475 #define	IP_POLL_ILL_TX(ill, mp) {					\
476 	ill_poll_capab_t *_ill_poll = (ill)->ill_poll_capab;		\
477 	ASSERT(ILL_POLL_CAPABLE(ill));					\
478 	ASSERT(_ill_poll != NULL);					\
479 	ASSERT(_ill_poll->ill_tx != NULL);				\
480 	ASSERT(_ill_poll->ill_tx_handle != NULL);			\
481 	_ill_poll->ill_tx(_ill_poll->ill_tx_handle, (mp));		\
482 }
483 
/*
 * Tunable and helper referenced by IP_CAN_FRAG_MDT.  ip_wput_frag_mdt_min
 * is added to the header length to form the minimum size accepted for a
 * lone mblk; ip_can_frag_mdt() is called with (mp, hdr_len, len) when the
 * message has a continuation block.
 */
484 extern int	ip_wput_frag_mdt_min;
485 extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);
486 
487 #endif	/* _KERNEL */
488 
489 #ifdef	__cplusplus
490 }
491 #endif
492 
493 #endif	/* _INET_IP_IMPL_H */
494