xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_output.c (revision a94efd7875d6cf7ca97bb9188beaed3bf1282a49)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2018 Joyent, Inc.
25  * Copyright 2024 Oxide Computer Company
26  * Copyright 2026 Bill Sommerfeld <sommerfeld@hamachi.org>
27  */
28 /* Copyright (c) 1990 Mentat Inc. */
29 
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/strsubr.h>
33 #include <sys/dlpi.h>
34 #include <sys/strsun.h>
35 #include <sys/zone.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/atomic.h>
41 
42 #include <sys/systm.h>
43 #include <sys/param.h>
44 #include <sys/kmem.h>
45 #include <sys/sdt.h>
46 #include <sys/socket.h>
47 #include <sys/mac.h>
48 #include <net/if.h>
49 #include <net/if_arp.h>
50 #include <net/route.h>
51 #include <sys/sockio.h>
52 #include <netinet/in.h>
53 #include <net/if_dl.h>
54 
55 #include <inet/common.h>
56 #include <inet/mi.h>
57 #include <inet/mib2.h>
58 #include <inet/nd.h>
59 #include <inet/arp.h>
60 #include <inet/snmpcom.h>
61 #include <inet/kstatcom.h>
62 
63 #include <netinet/igmp_var.h>
64 #include <netinet/ip6.h>
65 #include <netinet/icmp6.h>
66 #include <netinet/sctp.h>
67 
68 #include <inet/ip.h>
69 #include <inet/ip_impl.h>
70 #include <inet/ip6.h>
71 #include <inet/ip6_asp.h>
72 #include <inet/tcp.h>
73 #include <inet/ip_multi.h>
74 #include <inet/ip_if.h>
75 #include <inet/ip_ire.h>
76 #include <inet/ip_ftable.h>
77 #include <inet/ip_rts.h>
78 #include <inet/optcom.h>
79 #include <inet/ip_ndp.h>
80 #include <inet/ip_listutils.h>
81 #include <netinet/igmp.h>
82 #include <netinet/ip_mroute.h>
83 #include <inet/ipp_common.h>
84 
85 #include <net/pfkeyv2.h>
86 #include <inet/sadb.h>
87 #include <inet/ipsec_impl.h>
88 #include <inet/ipdrop.h>
89 #include <inet/ip_netinfo.h>
90 
91 #include <sys/pattr.h>
92 #include <inet/ipclassifier.h>
93 #include <inet/sctp_ip.h>
94 #include <inet/sctp/sctp_impl.h>
95 #include <inet/udp_impl.h>
96 #include <sys/sunddi.h>
97 
98 #include <sys/tsol/label.h>
99 #include <sys/tsol/tnet.h>
100 
101 #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
102 
#ifdef	DEBUG
/* Defined outside this file; the declaration is visible in DEBUG builds only */
extern boolean_t skip_sctp_cksum;
#endif

/*
 * Forward declarations of file-local helpers so that they can be used
 * before their definitions further down in this file.
 */
static int	ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
static int	ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
static void	ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
112 
/*
 * There are two types of output functions for IP used for different
 * purposes:
 *  - ip_output_simple() is used when sending ICMP errors, TCP resets, etc.
 *    when there is no context in the form of a conn_t. However, there is an
 *    ip_xmit_attr_t that the callers use to influence interface selection
 *    (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
 *
 *  - conn_ip_output() is used when sending packets with a conn_t and
 *    ip_set_destination has been called to cache information. In that case
 *    various socket options are recorded in the ip_xmit_attr_t and should
 *    be taken into account.
 */
126 
/*
 * Send a packet using the route, neighbor and destination-cache state that
 * is cached in the ip_xmit_attr_t.
 *
 * The caller *must* have called conn_connect() or ip_attr_connect()
 * before calling conn_ip_output(). The caller needs to redo that each time
 * the destination IP address or port changes, as well as each time there is
 * a change to any socket option that would modify how packets are routed out
 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
 *
 * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
 * We assert for that here (DEBUG builds track the owner in ixa_curthread).
 *
 * Before sending, the cached state is revalidated in this order:
 *  1. the ire, by comparing ire_generation with ixa_ire_generation;
 *  2. the nce, by checking nce_is_condemned;
 *  3. the dce, by comparing dce_generation with ixa_dce_generation (after
 *     first ageing out stale path MTU information);
 *  4. the source address, when IXAF_VERIFY_SOURCE is set and
 *     ips_src_generation has moved.
 *
 * Returns whatever the ire's ire_sendfn returns once the packet has been
 * handed to it. If one of the verification steps fails, the packet is freed
 * here, ipIfStatsHCOutRequests and ipIfStatsOutDiscards are bumped on the
 * per-stack IPv4 or IPv6 MIB, and the error from the failing step is
 * returned.
 */
int
conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	/*
	 * Snapshot of the flags on entry; later flag tests in this function
	 * use this copy rather than re-reading ixa->ixa_flags.
	 */
	iaflags_t	ixaflags = ixa->ixa_flags;
	ire_t		*ire;
	nce_t		*nce;
	dce_t		*dce;
	ill_t		*ill;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	int		error;

	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */

	ASSERT(ixa->ixa_ire != NULL);
	/* Note there is no ixa_nce when reject and blackhole routes */
	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */

#ifdef DEBUG
	/* Claim the ixa; every return path below must release it again */
	ASSERT(ixa->ixa_curthread == NULL);
	ixa->ixa_curthread = curthread;
#endif

	/*
	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
	 * for IGMP/MLD traffic.
	 */

	ire = ixa->ixa_ire;

	/*
	 * If the ULP says the (old) IRE resulted in reachability we
	 * record this before determining whether to use a new IRE.
	 * No locking for performance reasons.
	 */
	if (ixaflags & IXAF_REACH_CONF)
		ire->ire_badcnt = 0;

	/*
	 * Has routing changed since we cached the results of the lookup?
	 *
	 * This check captures all of:
	 *  - the cached ire being deleted (by means of the special
	 *    IRE_GENERATION_CONDEMNED)
	 *  - A potentially better ire being added (ire_generation being
	 *    increased)
	 *  - A deletion of the nexthop ire that was used when we did the
	 *    lookup.
	 *  - An addition of a potentially better nexthop ire.
	 * The last two are handled by walking and increasing the generation
	 * number on all dependent IREs in ire_flush_cache().
	 *
	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
	 * since we ensure that each time we set ixa_ire to such an IRE we
	 * make sure the ixa_ire_generation does not match (by using
	 * IRE_GENERATION_VERIFY).
	 */
	if (ire->ire_generation != ixa->ixa_ire_generation) {
		error = ip_verify_ire(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify ire",
			    mp, NULL);
			goto drop;
		}
		/* ip_verify_ire() may have replaced the cached ire */
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
			/*
			 * No nce for these routes; hand the packet straight
			 * to the ire's send function.
			 */
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
			    &ipst->ips_dce_default->dce_ident));
		}
		/*
		 * If the ncec changed then ip_verify_ire already set
		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		 * so we can recheck the interface mtu.
		 */

		/*
		 * Note that ire->ire_generation could already have changed.
		 * We catch that next time we send a packet.
		 */
	}

	/*
	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
	 * is single threaded.
	 */
	ASSERT(ixa->ixa_nce != NULL);
	nce = ixa->ixa_nce;
	if (nce->nce_is_condemned) {
		error = ip_verify_nce(mp, ixa);
		/*
		 * In case ZEROCOPY capability become not available, we
		 * copy the message and free the original one. We might
		 * be copying more data than needed but it doesn't hurt
		 * since such change rarely happens.
		 *
		 * Note that ip_verify_nce() also returns ENOTSUP when the
		 * LSO verification reports a change, so the copy below is
		 * made in that case as well.
		 */
		switch (error) {
		case 0:
			break;
		case ENOTSUP: { /* ZEROCOPY */
			mblk_t *nmp;

			if ((nmp = copymsg(mp)) != NULL) {
				freemsg(mp);
				mp = nmp;

				break;
			}
			/* copymsg() failed: drop with error == ENOTSUP */
		}
		/* FALLTHROUGH */
		default:
			ip_drop_output("ipIfStatsOutDiscards - verify nce",
			    mp, NULL);
			goto drop;
		}
		/*
		 * ip_verify_nce() can fall back to ip_verify_ire(), so the
		 * cached ire may have changed too.
		 */
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
			    ixa, &ipst->ips_dce_default->dce_ident));
		}
		ASSERT(ixa->ixa_nce != NULL);
		nce = ixa->ixa_nce;

		/*
		 * Note that some other event could already have made
		 * the new nce condemned. We catch that next time we
		 * try to send a packet.
		 */
	}
	/*
	 * If there is no per-destination dce_t then we have a reference to
	 * the default dce_t (which merely contains the dce_ipid).
	 * The generation check captures both the introduction of a
	 * per-destination dce_t (e.g., due to ICMP packet too big) and
	 * any change to the per-destination dce (including it becoming
	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
	 */
	dce = ixa->ixa_dce;

	/*
	 * To avoid a periodic timer to increase the path MTU we
	 * look at dce_last_change_time each time we send a packet.
	 */
	if (dce->dce_flags & DCEF_PMTU) {
		int64_t		now = LBOLT_FASTPATH64;

		if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval)) {
			/*
			 * Older than ips_ip_pathmtu_interval seconds (20
			 * minutes according to the original comment,
			 * presumably the default). Drop the path MTU
			 * information.
			 * Since the path MTU changes as a result of this,
			 * twiddle the dce generation to make us go through
			 * the dce verification code below.
			 */
			mutex_enter(&dce->dce_lock);
			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
			dce->dce_last_change_time = TICK_TO_SEC(now);
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		}
	}

	if (dce->dce_generation != ixa->ixa_dce_generation) {
		error = ip_verify_dce(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify dce",
			    mp, NULL);
			goto drop;
		}
		/* ip_verify_dce() installs the looked-up dce in the ixa */
		dce = ixa->ixa_dce;

		/*
		 * Note that some other event could already have made the
		 * new dce's generation number change.
		 * We catch that next time we try to send a packet.
		 */
	}

	ill = nce->nce_ill;

	/*
	 * An initial ixa_fragsize was set in ip_set_destination
	 * and we update it if any routing changes above.
	 * A change to ill_mtu with ifconfig will increase all dce_generation
	 * so that we will detect that with the generation check. Ditto for
	 * ill_mc_mtu.
	 */

	/*
	 * Caller needs to make sure IXAF_VERIFY_SOURCE is not set if
	 * conn_unspec_src.
	 */
	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
	    ixa->ixa_src_generation != ipst->ips_src_generation) {
		/* Check if the IP source is still assigned to the host. */
		uint_t gen;

		if (!ip_verify_src(mp, ixa, &gen)) {
			/* Don't send a packet with a source that isn't ours */
			error = EADDRNOTAVAIL;
			ip_drop_output("ipIfStatsOutDiscards - invalid src",
			    mp, NULL);
			goto drop;
		}
		/* The source is still valid - update the generation number */
		ixa->ixa_src_generation = gen;
	}

	/*
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_multirt_v* - if RTF_MULTIRT
	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v* - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v* - for the rest.
	 */
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));

drop:
	/*
	 * Common failure exit. No ill is used here, so the request and the
	 * discard are both counted on the per-stack MIB for the IP version.
	 */
	if (ixaflags & IXAF_IS_IPV4) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
	} else {
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
	}
	freemsg(mp);
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return (error);
}
395 
396 /*
397  * Handle both IPv4 and IPv6. Sets the generation number
398  * to allow the caller to know when to call us again.
399  * Returns true if the source address in the packet is a valid source.
400  * We handle callers which try to send with a zero address (since we only
401  * get here if UNSPEC_SRC is not set).
402  */
403 boolean_t
404 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
405 {
406 	ip_stack_t	*ipst = ixa->ixa_ipst;
407 
408 	/*
409 	 * Need to grab the generation number before we check to
410 	 * avoid a race with a change to the set of local addresses.
411 	 * No lock needed since the thread which updates the set of local
412 	 * addresses use ipif/ill locks and exit those (hence a store memory
413 	 * barrier) before doing the atomic increase of ips_src_generation.
414 	 */
415 	if (generationp != NULL)
416 		*generationp = ipst->ips_src_generation;
417 
418 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
419 		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
420 
421 		if (ipha->ipha_src == INADDR_ANY)
422 			return (B_FALSE);
423 
424 		return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
425 		    ipst, B_FALSE) != IPVL_BAD);
426 	} else {
427 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
428 		uint_t	scopeid;
429 
430 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
431 			return (B_FALSE);
432 
433 		if (ixa->ixa_flags & IXAF_SCOPEID_SET)
434 			scopeid = ixa->ixa_scopeid;
435 		else
436 			scopeid = 0;
437 
438 		return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
439 		    ipst, B_FALSE, scopeid) != IPVL_BAD);
440 	}
441 }
442 
443 /*
444  * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
445  */
446 int
447 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
448 {
449 	uint_t		gen;
450 	ire_t		*ire;
451 	nce_t		*nce;
452 	int		error;
453 	boolean_t	multirt = B_FALSE;
454 
455 	/*
456 	 * Redo ip_select_route.
457 	 * Need to grab generation number as part of the lookup to
458 	 * avoid race.
459 	 */
460 	error = 0;
461 	ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
462 	ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
463 	if (error != 0) {
464 		ire_refrele(ire);
465 		return (error);
466 	}
467 
468 	if (ixa->ixa_ire != NULL)
469 		ire_refrele_notr(ixa->ixa_ire);
470 #ifdef DEBUG
471 	ire_refhold_notr(ire);
472 	ire_refrele(ire);
473 #endif
474 	ixa->ixa_ire = ire;
475 	ixa->ixa_ire_generation = gen;
476 	if (multirt) {
477 		if (ixa->ixa_flags & IXAF_IS_IPV4)
478 			ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
479 		else
480 			ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
481 		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
482 	} else {
483 		ixa->ixa_postfragfn = ire->ire_postfragfn;
484 		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
485 	}
486 
487 	/*
488 	 * Don't look for an nce for reject or blackhole.
489 	 * They have ire_generation set to IRE_GENERATION_VERIFY which
490 	 * makes conn_ip_output avoid references to ixa_nce.
491 	 */
492 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
493 		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
494 		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
495 		return (0);
496 	}
497 
498 	/* The NCE could now be different */
499 	nce = ire_to_nce_pkt(ire, mp);
500 	if (nce == NULL) {
501 		/*
502 		 * Allocation failure. Make sure we redo ire/nce selection
503 		 * next time we send.
504 		 */
505 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
506 		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
507 		return (ENOBUFS);
508 	}
509 	if (nce == ixa->ixa_nce) {
510 		/* No change */
511 		nce_refrele(nce);
512 		return (0);
513 	}
514 
515 	/*
516 	 * Since the path MTU might change as a result of this
517 	 * route change, we twiddle ixa_dce_generation to
518 	 * make conn_ip_output go through the ip_verify_dce code.
519 	 */
520 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
521 
522 	if (ixa->ixa_nce != NULL)
523 		nce_refrele(ixa->ixa_nce);
524 	ixa->ixa_nce = nce;
525 	return (0);
526 }
527 
528 /*
529  * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
530  */
531 static int
532 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
533 {
534 	ire_t		*ire = ixa->ixa_ire;
535 	nce_t		*nce;
536 	int		error = 0;
537 	ipha_t		*ipha = NULL;
538 	ip6_t		*ip6h = NULL;
539 
540 	if (ire->ire_ipversion == IPV4_VERSION)
541 		ipha = (ipha_t *)mp->b_rptr;
542 	else
543 		ip6h = (ip6_t *)mp->b_rptr;
544 
545 	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
546 	if (nce == NULL) {
547 		/* Try to find a better ire */
548 		return (ip_verify_ire(mp, ixa));
549 	}
550 
551 	/*
552 	 * The hardware offloading capabilities, for example LSO, of the
553 	 * interface might have changed, so do sanity verification here.
554 	 */
555 	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
556 		if (!ip_verify_lso(nce->nce_ill, ixa)) {
557 			ASSERT(ixa->ixa_notify != NULL);
558 			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
559 			    IXAN_LSO, 0);
560 			error = ENOTSUP;
561 		}
562 	}
563 
564 	/*
565 	 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
566 	 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
567 	 * any more, return error so that conn_ip_output() can take care of
568 	 * the ZEROCOPY message properly. It's safe to continue send the
569 	 * message when ZEROCOPY newly become available.
570 	 */
571 	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
572 		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
573 			ASSERT(ixa->ixa_notify != NULL);
574 			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
575 			    IXAN_ZCOPY, 0);
576 			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
577 				error = ENOTSUP;
578 		}
579 	}
580 
581 	/*
582 	 * Since the path MTU might change as a result of this
583 	 * change, we twiddle ixa_dce_generation to
584 	 * make conn_ip_output go through the ip_verify_dce code.
585 	 */
586 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
587 
588 	nce_refrele(ixa->ixa_nce);
589 	ixa->ixa_nce = nce;
590 	return (error);
591 }
592 
593 /*
594  * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
595  */
596 static int
597 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
598 {
599 	dce_t		*dce;
600 	uint_t		gen;
601 	uint_t		pmtu;
602 
603 	dce = dce_lookup_pkt(mp, ixa, &gen);
604 	ASSERT(dce != NULL);
605 
606 	dce_refrele_notr(ixa->ixa_dce);
607 #ifdef DEBUG
608 	dce_refhold_notr(dce);
609 	dce_refrele(dce);
610 #endif
611 	ixa->ixa_dce = dce;
612 	ixa->ixa_dce_generation = gen;
613 
614 	/* Extract the (path) mtu from the dce, ncec_ill etc */
615 	pmtu = ip_get_pmtu(ixa);
616 
617 	/*
618 	 * Tell ULP about PMTU changes - increase or decrease - by returning
619 	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
620 	 * both ixa_pmtu and ixa_fragsize appropriately.
621 	 *
622 	 * If ULP doesn't set that flag then we need to update ixa_fragsize
623 	 * since routing could have changed the ill after after ixa_fragsize
624 	 * was set previously in the conn_ip_output path or in
625 	 * ip_set_destination.
626 	 *
627 	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
628 	 *
629 	 * In the case of a path MTU increase we send the packet after the
630 	 * notify to the ULP.
631 	 */
632 	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
633 		if (ixa->ixa_pmtu != pmtu) {
634 			uint_t oldmtu = ixa->ixa_pmtu;
635 
636 			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
637 			    uint32_t, ixa->ixa_pmtu);
638 			ASSERT(ixa->ixa_notify != NULL);
639 			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
640 			    IXAN_PMTU, pmtu);
641 			if (pmtu < oldmtu)
642 				return (EMSGSIZE);
643 		}
644 	} else {
645 		ixa->ixa_fragsize = pmtu;
646 	}
647 	return (0);
648 }
649 
650 /*
651  * Verify LSO usability. Keep the return value simple to indicate whether
652  * the LSO capability has changed. Handle both IPv4 and IPv6.
653  */
654 static boolean_t
655 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
656 {
657 	ill_lso_capab_t	*lsoc = &ixa->ixa_lso_capab;
658 	ill_lso_capab_t	*new_lsoc = ill->ill_lso_capab;
659 
660 	if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
661 		/*
662 		 * Not usable any more?
663 		 */
664 		if (!dohwcksum ||
665 		    (ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
666 		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
667 		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
668 		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
669 		    !ILL_LSO_TCP_IPV4_USABLE(ill) :
670 		    !ILL_LSO_TCP_IPV6_USABLE(ill))) {
671 			ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
672 
673 			return (B_FALSE);
674 		}
675 
676 		/*
677 		 * Capability has changed, refresh the copy in ixa.
678 		 */
679 		if (lsoc->ill_lso_max_tcpv4 != new_lsoc->ill_lso_max_tcpv4 ||
680 		    lsoc->ill_lso_max_tcpv6 != new_lsoc->ill_lso_max_tcpv6) {
681 			*lsoc = *new_lsoc;
682 
683 			return (B_FALSE);
684 		}
685 	} else { /* Was not usable */
686 		if (dohwcksum &&
687 		    !(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
688 		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
689 		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
690 		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
691 		    ILL_LSO_TCP_IPV4_USABLE(ill) :
692 		    ILL_LSO_TCP_IPV6_USABLE(ill))) {
693 			*lsoc = *new_lsoc;
694 			ixa->ixa_flags |= IXAF_LSO_CAPAB;
695 
696 			return (B_FALSE);
697 		}
698 	}
699 
700 	return (B_TRUE);
701 }
702 
703 /*
704  * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
705  * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
706  */
707 static boolean_t
708 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
709 {
710 	if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
711 		/*
712 		 * Not unsable any more.
713 		 */
714 		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
715 		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
716 		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
717 		    !ILL_ZCOPY_USABLE(ill)) {
718 			ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
719 
720 			return (B_FALSE);
721 		}
722 	} else { /* Was not usable */
723 		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
724 		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
725 		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
726 		    ILL_ZCOPY_USABLE(ill)) {
727 			ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
728 
729 			return (B_FALSE);
730 		}
731 	}
732 
733 	return (B_TRUE);
734 }
735 
736 
737 /*
738  * When there is no conn_t context, this will send a packet.
739  * The caller must *not* have called conn_connect() or ip_attr_connect()
740  * before calling ip_output_simple().
741  * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
742  * Honors IXAF_SET_SOURCE.
743  *
744  * We acquire the ire and after calling ire_sendfn we release
745  * the hold on the ire. Ditto for the nce and dce.
746  *
747  * This assumes that the caller has set the following in ip_xmit_attr_t:
748  *	ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
749  *	If ixa_ifindex is non-zero it means send out that ill. (If it is
750  *	an upper IPMP ill we load balance across the group; if a lower we send
751  *	on that lower ill without load balancing.)
752  *	IXAF_IS_IPV4 must be set correctly.
753  *	If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
754  *	If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
755  *	If neither of those two are set we do an IPsec policy lookup.
756  *
757  * We handle setting things like
758  *	ixa_pktlen
759  *	ixa_ip_hdr_length
760  *	ixa->ixa_protocol
761  *
762  * The caller may set ixa_xmit_hint, which is used for ECMP selection and
763  * transmit ring selecting in GLD.
764  *
765  * The caller must do an ixa_cleanup() to release any IPsec references
766  * after we return.
767  */
768 int
769 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
770 {
771 	ts_label_t	*effective_tsl = NULL;
772 	int		err;
773 
774 	ASSERT(ixa->ixa_ipst != NULL);
775 
776 	if (is_system_labeled()) {
777 		ip_stack_t *ipst = ixa->ixa_ipst;
778 
779 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
780 			err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
781 			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
782 			    &effective_tsl);
783 		} else {
784 			err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
785 			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
786 			    &effective_tsl);
787 		}
788 		if (err != 0) {
789 			ip2dbg(("tsol_check: label check failed (%d)\n", err));
790 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
791 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
792 			ip_drop_output("tsol_check_label", mp, NULL);
793 			freemsg(mp);
794 			return (err);
795 		}
796 		if (effective_tsl != NULL) {
797 			/* Update the label */
798 			ip_xmit_attr_replace_tsl(ixa, effective_tsl);
799 		}
800 	}
801 
802 	if (ixa->ixa_flags & IXAF_IS_IPV4)
803 		return (ip_output_simple_v4(mp, ixa));
804 	else
805 		return (ip_output_simple_v6(mp, ixa));
806 }
807 
808 int
809 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
810 {
811 	ipha_t		*ipha;
812 	ipaddr_t	firsthop; /* In IP header */
813 	ipaddr_t	dst;	/* End of source route, or ipha_dst if none */
814 	ire_t		*ire;
815 	ipaddr_t	setsrc;	/* RTF_SETSRC */
816 	int		error;
817 	ill_t		*ill = NULL;
818 	dce_t		*dce = NULL;
819 	nce_t		*nce;
820 	iaflags_t	ixaflags = ixa->ixa_flags;
821 	ip_stack_t	*ipst = ixa->ixa_ipst;
822 	boolean_t	repeat = B_FALSE;
823 	boolean_t	multirt = B_FALSE;
824 	int64_t		now;
825 
826 	ipha = (ipha_t *)mp->b_rptr;
827 	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
828 
829 	/*
830 	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
831 	 * for IGMP/MLD traffic.
832 	 */
833 
834 	/* Caller already set flags */
835 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
836 
837 	ASSERT(ixa->ixa_nce == NULL);
838 
839 	ixa->ixa_pktlen = ntohs(ipha->ipha_length);
840 	ASSERT(ixa->ixa_pktlen == msgdsize(mp));
841 	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
842 	ixa->ixa_protocol = ipha->ipha_protocol;
843 
844 	/*
845 	 * Assumes that source routed packets have already been massaged by
846 	 * the ULP (ip_massage_options) and as a result ipha_dst is the next
847 	 * hop in the source route. The final destination is used for IPsec
848 	 * policy and DCE lookup.
849 	 */
850 	firsthop = ipha->ipha_dst;
851 	dst = ip_get_dst(ipha);
852 
853 repeat_ire:
854 	error = 0;
855 	setsrc = INADDR_ANY;
856 	ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
857 	    &setsrc, &error, &multirt);
858 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
859 	if (error != 0) {
860 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
861 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
862 		ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
863 		freemsg(mp);
864 		goto done;
865 	}
866 
867 	if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
868 		/* ire_ill might be NULL hence need to skip some code */
869 		if (ixaflags & IXAF_SET_SOURCE)
870 			ipha->ipha_src = htonl(INADDR_LOOPBACK);
871 		ixa->ixa_fragsize = IP_MAXPACKET;
872 		ill = NULL;
873 		nce = NULL;
874 		ire->ire_ob_pkt_count++;
875 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
876 		/* No dce yet; use default one */
877 		error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
878 		    &ipst->ips_dce_default->dce_ident);
879 		goto done;
880 	}
881 
882 	/* Note that ipha_dst is only used for IRE_MULTICAST */
883 	nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
884 	if (nce == NULL) {
885 		/* Allocation failure? */
886 		ip_drop_output("ire_to_nce", mp, ill);
887 		freemsg(mp);
888 		error = ENOBUFS;
889 		goto done;
890 	}
891 	if (nce->nce_is_condemned) {
892 		nce_t *nce1;
893 
894 		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
895 		nce_refrele(nce);
896 		if (nce1 == NULL) {
897 			if (!repeat) {
898 				/* Try finding a better IRE */
899 				repeat = B_TRUE;
900 				ire_refrele(ire);
901 				goto repeat_ire;
902 			}
903 			/* Tried twice - drop packet */
904 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
905 			ip_drop_output("No nce", mp, ill);
906 			freemsg(mp);
907 			error = ENOBUFS;
908 			goto done;
909 		}
910 		nce = nce1;
911 	}
912 
913 	/*
914 	 * For multicast with multirt we have a flag passed back from
915 	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
916 	 * possible multicast address.
917 	 * We also need a flag for multicast since we can't check
918 	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
919 	 */
920 	if (multirt) {
921 		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
922 		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
923 	} else {
924 		ixa->ixa_postfragfn = ire->ire_postfragfn;
925 		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
926 	}
927 	ASSERT(ixa->ixa_nce == NULL);
928 	ixa->ixa_nce = nce;
929 
930 	/*
931 	 * Check for a dce_t with a path mtu.
932 	 */
933 	dce = dce_lookup_v4(dst, ipst, NULL);
934 	ASSERT(dce != NULL);
935 
936 	if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
937 		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
938 	} else if (dce->dce_flags & DCEF_PMTU) {
939 		/*
940 		 * To avoid a periodic timer to increase the path MTU we
941 		 * look at dce_last_change_time each time we send a packet.
942 		 */
943 		now = ddi_get_lbolt64();
944 		if (TICK_TO_SEC(now) - dce->dce_last_change_time >
945 		    ipst->ips_ip_pathmtu_interval) {
946 			/*
947 			 * Older than 20 minutes. Drop the path MTU information.
948 			 */
949 			mutex_enter(&dce->dce_lock);
950 			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
951 			dce->dce_last_change_time = TICK_TO_SEC(now);
952 			mutex_exit(&dce->dce_lock);
953 			dce_increment_generation(dce);
954 			ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
955 		} else {
956 			uint_t fragsize;
957 
958 			fragsize = ip_get_base_mtu(nce->nce_ill, ire);
959 			if (fragsize > dce->dce_pmtu)
960 				fragsize = dce->dce_pmtu;
961 			ixa->ixa_fragsize = fragsize;
962 		}
963 	} else {
964 		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
965 	}
966 
967 	/*
968 	 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
969 	 * interface for source address selection.
970 	 */
971 	ill = ire_nexthop_ill(ire);
972 
973 	if (ixaflags & IXAF_SET_SOURCE) {
974 		ipaddr_t	src;
975 
976 		/*
977 		 * We use the final destination to get
978 		 * correct selection for source routed packets
979 		 */
980 
981 		/* If unreachable we have no ill but need some source */
982 		if (ill == NULL) {
983 			src = htonl(INADDR_LOOPBACK);
984 			error = 0;
985 		} else {
986 			error = ip_select_source_v4(ill, setsrc, dst,
987 			    ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
988 			    &src, NULL, NULL);
989 		}
990 		if (error != 0) {
991 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
992 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
993 			ip_drop_output("ipIfStatsOutDiscards - no source",
994 			    mp, ill);
995 			freemsg(mp);
996 			goto done;
997 		}
998 		ipha->ipha_src = src;
999 	} else if (ixaflags & IXAF_VERIFY_SOURCE) {
1000 		/* Check if the IP source is assigned to the host. */
1001 		if (!ip_verify_src(mp, ixa, NULL)) {
1002 			/* Don't send a packet with a source that isn't ours */
1003 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1004 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1005 			ip_drop_output("ipIfStatsOutDiscards - invalid source",
1006 			    mp, ill);
1007 			freemsg(mp);
1008 			error = EADDRNOTAVAIL;
1009 			goto done;
1010 		}
1011 	}
1012 
1013 
1014 	/*
1015 	 * Check against global IPsec policy to set the AH/ESP attributes.
1016 	 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
1017 	 */
1018 	if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1019 		ASSERT(ixa->ixa_ipsec_policy == NULL);
1020 		mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
1021 		if (mp == NULL) {
1022 			/* MIB and ip_drop_packet already done */
1023 			return (EHOSTUNREACH);	/* IPsec policy failure */
1024 		}
1025 	}
1026 
1027 	if (ill != NULL) {
1028 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
1029 	} else {
1030 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1031 	}
1032 
1033 	/*
1034 	 * We update the statistics on the most specific IRE i.e., the first
1035 	 * one we found.
1036 	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
1037 	 * can only count the use prior to fragmentation. However the MIB
1038 	 * counters on the ill will be incremented in post fragmentation.
1039 	 */
1040 	ire->ire_ob_pkt_count++;
1041 
1042 	/*
1043 	 * Based on ire_type and ire_flags call one of:
1044 	 *	ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
1045 	 *	ire_send_multirt_v4 - if RTF_MULTIRT
1046 	 *	ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE
1047 	 *	ire_send_multicast_v4 - for IRE_MULTICAST
1048 	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
1049 	 *	ire_send_wire_v4 - for the rest.
1050 	 */
1051 	error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
1052 done:
1053 	ire_refrele(ire);
1054 	if (dce != NULL)
1055 		dce_refrele(dce);
1056 	if (ill != NULL)
1057 		ill_refrele(ill);
1058 	if (ixa->ixa_nce != NULL)
1059 		nce_refrele(ixa->ixa_nce);
1060 	ixa->ixa_nce = NULL;
1061 	return (error);
1062 }
1063 
1064 /*
1065  * ire_sendfn() functions.
1066  * These functions use the following xmit_attr:
1067  *  - ixa_fragsize - read to determine whether or not to fragment
1068  *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
1069  *  - ixa_ipsec_*  are used inside IPsec
1070  *  - IXAF_SET_SOURCE - replace IP source in broadcast case.
1071  *  - IXAF_LOOPBACK_COPY - for multicast and broadcast
1072  */
1073 
1074 
1075 /*
1076  * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
1077  *
1078  * The checks for restrict_interzone_loopback are done in ire_route_recursive.
1079  */
1080 /* ARGSUSED4 */
1081 int
1082 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1083     ip_xmit_attr_t *ixa, uint32_t *identp)
1084 {
1085 	ipha_t		*ipha = (ipha_t *)iph_arg;
1086 	ip_stack_t	*ipst = ixa->ixa_ipst;
1087 	ill_t		*ill = ire->ire_ill;
1088 	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
1089 	uint_t		pktlen = ixa->ixa_pktlen;
1090 
1091 	/*
1092 	 * No fragmentation, no nce, no application of IPsec,
1093 	 * and no ipha_ident assignment.
1094 	 *
1095 	 * Note different order between IP provider and FW_HOOKS than in
1096 	 * send_wire case.
1097 	 */
1098 
1099 	/*
1100 	 * DTrace this as ip:::send.  A packet blocked by FW_HOOKS will fire the
1101 	 * send probe, but not the receive probe.
1102 	 */
1103 	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1104 	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1105 	    int, 1);
1106 
1107 	if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
1108 		int error = 0;
1109 
1110 		DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
1111 		    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
1112 		FW_HOOKS(ipst->ips_ip4_loopback_out_event,
1113 		    ipst->ips_ipv4firewall_loopback_out,
1114 		    NULL, ill, ipha, mp, mp, 0, ipst, error);
1115 		DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
1116 		if (mp == NULL)
1117 			return (error);
1118 
1119 		/*
1120 		 * Even if the destination was changed by the filter we use the
1121 		 * forwarding decision that was made based on the address
1122 		 * in ip_output/ip_set_destination.
1123 		 */
1124 		/* Length could be different */
1125 		ipha = (ipha_t *)mp->b_rptr;
1126 		pktlen = ntohs(ipha->ipha_length);
1127 	}
1128 
1129 	/*
1130 	 * If a callback is enabled then we need to know the
1131 	 * source and destination zoneids for the packet. We already
1132 	 * have those handy.
1133 	 */
1134 	if (ipst->ips_ip4_observe.he_interested) {
1135 		zoneid_t szone, dzone;
1136 		zoneid_t stackzoneid;
1137 
1138 		stackzoneid = netstackid_to_zoneid(
1139 		    ipst->ips_netstack->netstack_stackid);
1140 
1141 		if (stackzoneid == GLOBAL_ZONEID) {
1142 			/* Shared-IP zone */
1143 			dzone = ire->ire_zoneid;
1144 			szone = ixa->ixa_zoneid;
1145 		} else {
1146 			szone = dzone = stackzoneid;
1147 		}
1148 		ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
1149 	}
1150 
1151 	/* Handle lo0 stats */
1152 	ipst->ips_loopback_packets++;
1153 
1154 	/* Map ixa to ira including IPsec policies */
1155 	ipsec_out_to_in(ixa, ill, &iras);
1156 	iras.ira_pktlen = pktlen;
1157 	iras.ira_ttl = ipha->ipha_ttl;
1158 
1159 	if (!IS_SIMPLE_IPH(ipha)) {
1160 		ip_output_local_options(ipha, ipst);
1161 		iras.ira_flags |= IRAF_IPV4_OPTIONS;
1162 	}
1163 
1164 	if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
1165 		int error = 0;
1166 
1167 		DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
1168 		    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
1169 		FW_HOOKS(ipst->ips_ip4_loopback_in_event,
1170 		    ipst->ips_ipv4firewall_loopback_in,
1171 		    ill, NULL, ipha, mp, mp, 0, ipst, error);
1172 
1173 		DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
1174 		if (mp == NULL) {
1175 			ira_cleanup(&iras, B_FALSE);
1176 			return (error);
1177 		}
1178 		/*
1179 		 * Even if the destination was changed by the filter we use the
1180 		 * forwarding decision that was made based on the address
1181 		 * in ip_output/ip_set_destination.
1182 		 */
1183 		/* Length could be different */
1184 		ipha = (ipha_t *)mp->b_rptr;
1185 		pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
1186 	}
1187 
1188 	DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1189 	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1190 	    int, 1);
1191 
1192 	ire->ire_ib_pkt_count++;
1193 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
1194 	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
1195 
1196 	/* Destined to ire_zoneid - use that for fanout */
1197 	iras.ira_zoneid = ire->ire_zoneid;
1198 
1199 	if (is_system_labeled()) {
1200 		iras.ira_flags |= IRAF_SYSTEM_LABELED;
1201 
1202 		/*
1203 		 * This updates ira_cred, ira_tsl and ira_free_flags based
1204 		 * on the label. We don't expect this to ever fail for
1205 		 * loopback packets, so we silently drop the packet should it
1206 		 * fail.
1207 		 */
1208 		if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) {
1209 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1210 			ip_drop_input("tsol_get_pkt_label", mp, ill);
1211 			freemsg(mp);
1212 			return (0);
1213 		}
1214 		ASSERT(iras.ira_tsl != NULL);
1215 
1216 		/* tsol_get_pkt_label sometimes does pullupmsg */
1217 		ipha = (ipha_t *)mp->b_rptr;
1218 	}
1219 
1220 	ip_fanout_v4(mp, ipha, &iras);
1221 
1222 	/* We moved any IPsec refs from ixa to iras */
1223 	ira_cleanup(&iras, B_FALSE);
1224 	return (0);
1225 }
1226 
1227 /*
1228  * ire_sendfn for IRE_BROADCAST
1229  * If the broadcast address is present on multiple ills and ixa_ifindex
1230  * isn't set, then we generate
1231  * a separate datagram (potentially with different source address) for
1232  * those ills. In any case, only one copy is looped back to ip_input_v4.
1233  */
1234 int
1235 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1236     ip_xmit_attr_t *ixa, uint32_t *identp)
1237 {
1238 	ipha_t		*ipha = (ipha_t *)iph_arg;
1239 	ip_stack_t	*ipst = ixa->ixa_ipst;
1240 	irb_t		*irb = ire->ire_bucket;
1241 	ire_t		*ire1;
1242 	mblk_t		*mp1;
1243 	ipha_t		*ipha1;
1244 	iaflags_t	ixaflags = ixa->ixa_flags;
1245 	nce_t		*nce1, *nce_orig;
1246 
1247 	/*
1248 	 * Unless ire_send_multirt_v4 already set a ttl, force the
1249 	 * ttl to a smallish value.
1250 	 */
1251 	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
1252 		/*
1253 		 * To avoid broadcast storms, we usually set the TTL to 1 for
1254 		 * broadcasts.  This can
1255 		 * be overridden stack-wide through the ip_broadcast_ttl
1256 		 * ndd tunable, or on a per-connection basis through the
1257 		 * IP_BROADCAST_TTL socket option.
1258 		 *
1259 		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
1260 		 * will force ttl to one after we've set this.
1261 		 */
1262 		if (ixaflags & IXAF_BROADCAST_TTL_SET)
1263 			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
1264 		else
1265 			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
1266 	}
1267 	/*
1268 	 * Make sure we get a loopback copy (after IPsec and frag)
1269 	 * Skip hardware checksum so that loopback copy is checksumed.
1270 	 */
1271 	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1272 
1273 	/* Do we need to potentially generate multiple copies? */
1274 	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
1275 		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1276 
1277 	/*
1278 	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
1279 	 * Note that everything in the bucket has the same destination address.
1280 	 */
1281 	irb_refhold(irb);
1282 	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1283 		/* We do the main IRE after the end of the loop */
1284 		if (ire1 == ire)
1285 			continue;
1286 
1287 		/*
1288 		 * Only IREs for the same IP address should be in the same
1289 		 * bucket.
1290 		 * But could have IRE_HOSTs in the case of CGTP.
1291 		 * If we find any multirt routes we bail out of the loop
1292 		 * and just do the single packet at the end; ip_postfrag_multirt
1293 		 * will duplicate the packet.
1294 		 */
1295 		ASSERT(ire1->ire_addr == ire->ire_addr);
1296 		if (!(ire1->ire_type & IRE_BROADCAST))
1297 			continue;
1298 
1299 		if (IRE_IS_CONDEMNED(ire1))
1300 			continue;
1301 
1302 		if (ixa->ixa_zoneid != ALL_ZONES &&
1303 		    ire->ire_zoneid != ire1->ire_zoneid)
1304 			continue;
1305 
1306 		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);
1307 
1308 		if (ire1->ire_flags & RTF_MULTIRT)
1309 			break;
1310 
1311 		/*
1312 		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
1313 		 * ensure that this goes out on the cast_ill.
1314 		 */
1315 		if (IS_UNDER_IPMP(ire1->ire_ill))
1316 			continue;
1317 
1318 		mp1 = copymsg(mp);
1319 		if (mp1 == NULL) {
1320 			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1321 			    ipIfStatsOutDiscards);
1322 			ip_drop_output("ipIfStatsOutDiscards",
1323 			    mp, ire1->ire_ill);
1324 			continue;
1325 		}
1326 
1327 		ipha1 = (ipha_t *)mp1->b_rptr;
1328 		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
1329 			/*
1330 			 * Need to pick a different source address for each
1331 			 * interface. If we have a global IPsec policy and
1332 			 * no per-socket policy then we punt to
1333 			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
1334 			 */
1335 			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
1336 				ip_output_simple_broadcast(ixa, mp1);
1337 				continue;
1338 			}
1339 			/* Pick a new source address for each interface */
1340 			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
1341 			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
1342 			    &ipha1->ipha_src, NULL, NULL) != 0) {
1343 				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1344 				    ipIfStatsOutDiscards);
1345 				ip_drop_output("ipIfStatsOutDiscards - select "
1346 				    "broadcast source", mp1, ire1->ire_ill);
1347 				freemsg(mp1);
1348 				continue;
1349 			}
1350 			/*
1351 			 * Check against global IPsec policy to set the AH/ESP
1352 			 * attributes. IPsec will set IXAF_IPSEC_* and
1353 			 * ixa_ipsec_* as appropriate.
1354 			 */
1355 			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1356 				ASSERT(ixa->ixa_ipsec_policy == NULL);
1357 				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
1358 				    NULL, ixa);
1359 				if (mp1 == NULL) {
1360 					/*
1361 					 * MIB and ip_drop_packet already
1362 					 * done
1363 					 */
1364 					continue;
1365 				}
1366 			}
1367 		}
1368 		/* Make sure we have an NCE on this ill */
1369 		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
1370 		    ire1->ire_type);
1371 		if (nce1 == NULL) {
1372 			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1373 			    ipIfStatsOutDiscards);
1374 			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
1375 			    mp1, ire1->ire_ill);
1376 			freemsg(mp1);
1377 			continue;
1378 		}
1379 		nce_orig = ixa->ixa_nce;
1380 		ixa->ixa_nce = nce1;
1381 
1382 		ire_refhold(ire1);
1383 		/*
1384 		 * Ignore any errors here. We just collect the errno for
1385 		 * the main ire below
1386 		 */
1387 		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
1388 		ire_refrele(ire1);
1389 
1390 		ixa->ixa_nce = nce_orig;
1391 		nce_refrele(nce1);
1392 
1393 		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
1394 	}
1395 	irb_refrele(irb);
1396 	/* Finally, the main one */
1397 
1398 	/*
1399 	 * For IPMP we only send broadcasts on the ipmp_ill.
1400 	 */
1401 	if (IS_UNDER_IPMP(ire->ire_ill)) {
1402 		freemsg(mp);
1403 		return (0);
1404 	}
1405 
1406 	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1407 }
1408 
1409 /*
1410  * Send a packet using a different source address and different
1411  * IPsec policy.
1412  */
1413 static void
1414 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
1415 {
1416 	ip_xmit_attr_t ixas;
1417 
1418 	bzero(&ixas, sizeof (ixas));
1419 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1420 	ixas.ixa_zoneid = ixa->ixa_zoneid;
1421 	ixas.ixa_ifindex = 0;
1422 	ixas.ixa_ipst = ixa->ixa_ipst;
1423 	ixas.ixa_cred = ixa->ixa_cred;
1424 	ixas.ixa_cpid = ixa->ixa_cpid;
1425 	ixas.ixa_tsl = ixa->ixa_tsl;
1426 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1427 
1428 	(void) ip_output_simple(mp, &ixas);
1429 	ixa_cleanup(&ixas);
1430 }
1431 
1432 
1433 static void
1434 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
1435 {
1436 	ip_stack_t	*ipst = ixa->ixa_ipst;
1437 
1438 	/* Limit the TTL on multirt packets */
1439 	if (ire->ire_type & IRE_MULTICAST) {
1440 		if (ipha->ipha_ttl > 1) {
1441 			ip2dbg(("ire_send_multirt_v4: forcing multicast "
1442 			    "multirt TTL to 1 (was %d), dst 0x%08x\n",
1443 			    ipha->ipha_ttl, ntohl(ire->ire_addr)));
1444 			ipha->ipha_ttl = 1;
1445 		}
1446 		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1447 	} else if ((ipst->ips_ip_multirt_ttl > 0) &&
1448 	    (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
1449 		ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
1450 		/*
1451 		 * Need to ensure we don't increase the ttl should we go through
1452 		 * ire_send_broadcast or multicast.
1453 		 */
1454 		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1455 	}
1456 }
1457 
1458 /*
1459  * ire_sendfn for IRE_MULTICAST
1460  */
1461 int
1462 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1463     ip_xmit_attr_t *ixa, uint32_t *identp)
1464 {
1465 	ipha_t		*ipha = (ipha_t *)iph_arg;
1466 	ip_stack_t	*ipst = ixa->ixa_ipst;
1467 	ill_t		*ill = ire->ire_ill;
1468 	iaflags_t	ixaflags = ixa->ixa_flags;
1469 
1470 	/*
1471 	 * The IRE_MULTICAST is the same whether or not multirt is in use.
1472 	 * Hence we need special-case code.
1473 	 */
1474 	if (ixaflags & IXAF_MULTIRT_MULTICAST)
1475 		multirt_check_v4(ire, ipha, ixa);
1476 
1477 	/*
1478 	 * Check if anything in ip_input_v4 wants a copy of the transmitted
1479 	 * packet (after IPsec and fragmentation)
1480 	 *
1481 	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
1482 	 *    RSVP and the rsvp daemon is an example of a
1483 	 *    protocol and user level process that
1484 	 *    handles it's own routing. Hence, it uses the
1485 	 *    SO_DONTROUTE option to accomplish this.
1486 	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
1487 	 *    check whether there are any receivers for the group on the ill
1488 	 *    (ignoring the zoneid).
1489 	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
1490 	 *    any members in other shared-IP zones.
1491 	 *    If such members exist, then we indicate that the sending zone
1492 	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
1493 	 *    behavior.
1494 	 *
1495 	 * When we loopback we skip hardware checksum to make sure loopback
1496 	 * copy is checksumed.
1497 	 *
1498 	 * Note that ire_ill is the upper in the case of IPMP.
1499 	 */
1500 	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
1501 	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
1502 	    !(ixaflags & IXAF_DONTROUTE)) {
1503 		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1504 	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
1505 		/*
1506 		 * If this zone or any other zone has members then loopback
1507 		 * a copy.
1508 		 */
1509 		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
1510 			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1511 	} else if (ipst->ips_netstack->netstack_numzones > 1) {
1512 		/*
1513 		 * This zone should not have a copy. But there are some other
1514 		 * zones which might have members.
1515 		 */
1516 		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1517 		    ixa->ixa_zoneid)) {
1518 			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
1519 			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
1520 			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1521 		}
1522 	}
1523 
1524 	/*
1525 	 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
1526 	 * force the ttl to the IP_MULTICAST_TTL value
1527 	 */
1528 	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
1529 		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
1530 	}
1531 
1532 	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1533 }
1534 
1535 /*
1536  * ire_sendfn for IREs with RTF_MULTIRT
1537  */
1538 int
1539 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1540     ip_xmit_attr_t *ixa, uint32_t *identp)
1541 {
1542 	ipha_t		*ipha = (ipha_t *)iph_arg;
1543 
1544 	multirt_check_v4(ire, ipha, ixa);
1545 
1546 	if (ire->ire_type & IRE_MULTICAST)
1547 		return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
1548 	else if (ire->ire_type & IRE_BROADCAST)
1549 		return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
1550 	else
1551 		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1552 }
1553 
1554 /*
1555  * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
1556  */
1557 int
1558 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1559     ip_xmit_attr_t *ixa, uint32_t *identp)
1560 {
1561 	ip_stack_t	*ipst = ixa->ixa_ipst;
1562 	ipha_t		*ipha = (ipha_t *)iph_arg;
1563 	ill_t		*ill;
1564 	ip_recv_attr_t	iras;
1565 	boolean_t	dummy;
1566 
1567 	/* We assign an IP ident for nice errors */
1568 	ipha->ipha_ident = atomic_inc_32_nv(identp);
1569 
1570 	BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
1571 
1572 	if (ire->ire_type & IRE_NOROUTE) {
1573 		/* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
1574 		ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
1575 		    RTA_DST, ipst);
1576 	}
1577 
1578 	if (ire->ire_flags & RTF_BLACKHOLE) {
1579 		ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
1580 		freemsg(mp);
1581 		/* No error even for local senders - silent blackhole */
1582 		return (0);
1583 	}
1584 	ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
1585 
1586 	/*
1587 	 * We need an ill_t for the ip_recv_attr_t even though this packet
1588 	 * was never received and icmp_unreachable doesn't currently use
1589 	 * ira_ill.
1590 	 */
1591 	ill = ill_lookup_on_name("lo0", B_FALSE,
1592 	    !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
1593 	if (ill == NULL) {
1594 		freemsg(mp);
1595 		return (EHOSTUNREACH);
1596 	}
1597 
1598 	bzero(&iras, sizeof (iras));
1599 	/* Map ixa to ira including IPsec policies */
1600 	ipsec_out_to_in(ixa, ill, &iras);
1601 
1602 	if (ip_source_routed(ipha, ipst)) {
1603 		icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
1604 	} else {
1605 		icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
1606 	}
1607 	/* We moved any IPsec refs from ixa to iras */
1608 	ira_cleanup(&iras, B_FALSE);
1609 	ill_refrele(ill);
1610 	return (EHOSTUNREACH);
1611 }
1612 
1613 /*
1614  * Calculate a checksum ignoring any hardware capabilities
1615  *
1616  * Returns B_FALSE if the packet was too short for the checksum. Caller
1617  * should free and do stats.
1618  */
1619 static boolean_t
1620 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
1621 {
1622 	ip_stack_t	*ipst = ixa->ixa_ipst;
1623 	uint_t		pktlen = ixa->ixa_pktlen;
1624 	uint16_t	*cksump;
1625 	uint32_t	cksum;
1626 	uint8_t		protocol = ixa->ixa_protocol;
1627 	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
1628 	ipaddr_t	dst = ipha->ipha_dst;
1629 	ipaddr_t	src = ipha->ipha_src;
1630 
1631 	/* Just in case it contained garbage */
1632 	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1633 
1634 	/*
1635 	 * Calculate ULP checksum
1636 	 */
1637 	if (protocol == IPPROTO_TCP) {
1638 		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1639 		cksum = IP_TCP_CSUM_COMP;
1640 	} else if (protocol == IPPROTO_UDP) {
1641 		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1642 		cksum = IP_UDP_CSUM_COMP;
1643 	} else if (protocol == IPPROTO_SCTP) {
1644 		sctp_hdr_t	*sctph;
1645 
1646 		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1647 		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1648 		/*
1649 		 * Zero out the checksum field to ensure proper
1650 		 * checksum calculation.
1651 		 */
1652 		sctph->sh_chksum = 0;
1653 #ifdef	DEBUG
1654 		if (!skip_sctp_cksum)
1655 #endif
1656 			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1657 		goto ip_hdr_cksum;
1658 	} else {
1659 		goto ip_hdr_cksum;
1660 	}
1661 
1662 	/* ULP puts the checksum field is in the first mblk */
1663 	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1664 
1665 	/*
1666 	 * We accumulate the pseudo header checksum in cksum.
1667 	 * This is pretty hairy code, so watch close.  One
1668 	 * thing to keep in mind is that UDP and TCP have
1669 	 * stored their respective datagram lengths in their
1670 	 * checksum fields.  This lines things up real nice.
1671 	 */
1672 	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
1673 
1674 	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
1675 	/*
1676 	 * For UDP/IPv4 a zero means that the packets wasn't checksummed.
1677 	 * Change to 0xffff
1678 	 */
1679 	if (protocol == IPPROTO_UDP && cksum == 0)
1680 		*cksump = ~cksum;
1681 	else
1682 		*cksump = cksum;
1683 
1684 	IP_STAT(ipst, ip_out_sw_cksum);
1685 	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);
1686 
1687 ip_hdr_cksum:
1688 	/* Calculate IPv4 header checksum */
1689 	ipha->ipha_hdr_checksum = 0;
1690 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1691 	return (B_TRUE);
1692 }
1693 
1694 /*
1695  * Calculate the ULP checksum - try to use hardware.
1696  * In the case of MULTIRT, broadcast or multicast the
1697  * IXAF_NO_HW_CKSUM is set in which case we use software.
1698  *
1699  * If the hardware supports IP header checksum offload; then clear the
1700  * contents of IP header checksum field as expected by NIC.
1701  * Do this only if we offloaded either full or partial sum.
1702  *
1703  * Returns B_FALSE if the packet was too short for the checksum. Caller
1704  * should free and do stats.
1705  */
1706 static boolean_t
1707 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
1708     ip_xmit_attr_t *ixa, ill_t *ill)
1709 {
1710 	uint_t		pktlen = ixa->ixa_pktlen;
1711 	uint16_t	*cksump;
1712 	uint16_t	hck_flags;
1713 	uint32_t	cksum;
1714 	uint8_t		protocol = ixa->ixa_protocol;
1715 	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
1716 
1717 	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
1718 	    !dohwcksum) {
1719 		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1720 	}
1721 
1722 	/*
1723 	 * Calculate ULP checksum. Note that we don't use cksump and cksum
1724 	 * if the ill has FULL support.
1725 	 */
1726 	if (protocol == IPPROTO_TCP) {
1727 		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1728 		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
1729 	} else if (protocol == IPPROTO_UDP) {
1730 		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1731 		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
1732 	} else if (protocol == IPPROTO_SCTP) {
1733 		sctp_hdr_t	*sctph;
1734 
1735 		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1736 		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1737 		/*
1738 		 * Zero out the checksum field to ensure proper
1739 		 * checksum calculation.
1740 		 */
1741 		sctph->sh_chksum = 0;
1742 #ifdef	DEBUG
1743 		if (!skip_sctp_cksum)
1744 #endif
1745 			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1746 		goto ip_hdr_cksum;
1747 	} else if (protocol == IPPROTO_ICMP) {
1748 		/*
1749 		 * Note that we always calculate a SW checksum for ICMP. In the
1750 		 * future, if HW support for ICMP is advertised, we can change
1751 		 * this.
1752 		 */
1753 		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1754 	} else {
1755 	ip_hdr_cksum:
1756 		/* Calculate IPv4 header checksum */
1757 		ipha->ipha_hdr_checksum = 0;
1758 		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1759 		return (B_TRUE);
1760 	}
1761 
1762 	/* ULP puts the checksum field is in the first mblk */
1763 	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1764 
1765 	/*
1766 	 * Underlying interface supports hardware checksum offload for
1767 	 * the payload; leave the payload checksum for the hardware to
1768 	 * calculate.  N.B: We only need to set up checksum info on the
1769 	 * first mblk.
1770 	 */
1771 	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
1772 
1773 	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1774 	if (hck_flags & HCKSUM_INET_FULL_V4) {
1775 		/*
1776 		 * Hardware calculates pseudo-header, header and the
1777 		 * payload checksums, so clear the checksum field in
1778 		 * the protocol header.
1779 		 */
1780 		*cksump = 0;
1781 		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
1782 
1783 		ipha->ipha_hdr_checksum = 0;
1784 		if (hck_flags & HCKSUM_IPHDRCKSUM) {
1785 			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1786 		} else {
1787 			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1788 		}
1789 		return (B_TRUE);
1790 	}
1791 	if ((hck_flags) & HCKSUM_INET_PARTIAL)  {
1792 		ipaddr_t	dst = ipha->ipha_dst;
1793 		ipaddr_t	src = ipha->ipha_src;
1794 		/*
1795 		 * Partial checksum offload has been enabled.  Fill
1796 		 * the checksum field in the protocol header with the
1797 		 * pseudo-header checksum value.
1798 		 *
1799 		 * We accumulate the pseudo header checksum in cksum.
1800 		 * This is pretty hairy code, so watch close.  One
1801 		 * thing to keep in mind is that UDP and TCP have
1802 		 * stored their respective datagram lengths in their
1803 		 * checksum fields.  This lines things up real nice.
1804 		 */
1805 		cksum += (dst >> 16) + (dst & 0xFFFF) +
1806 		    (src >> 16) + (src & 0xFFFF);
1807 		cksum += *(cksump);
1808 		cksum = (cksum & 0xFFFF) + (cksum >> 16);
1809 		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
1810 
1811 		/*
1812 		 * Offsets are relative to beginning of IP header.
1813 		 */
1814 		DB_CKSUMSTART(mp) = ip_hdr_length;
1815 		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
1816 		DB_CKSUMEND(mp) = pktlen;
1817 		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
1818 
1819 		ipha->ipha_hdr_checksum = 0;
1820 		if (hck_flags & HCKSUM_IPHDRCKSUM) {
1821 			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1822 		} else {
1823 			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1824 		}
1825 		return (B_TRUE);
1826 	}
1827 	/* Hardware capabilities include neither full nor partial IPv4 */
1828 	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1829 }
1830 
1831 /*
1832  * ire_sendfn for offlink and onlink destinations.
1833  * Also called from the multicast, broadcast, multirt send functions.
1834  *
1835  * Assumes that the caller has a hold on the ire.
1836  *
1837  * This function doesn't care if the IRE just became condemned since that
1838  * can happen at any time.
1839  */
1840 /* ARGSUSED */
1841 int
1842 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1843     ip_xmit_attr_t *ixa, uint32_t *identp)
1844 {
1845 	ip_stack_t	*ipst = ixa->ixa_ipst;
1846 	ipha_t		*ipha = (ipha_t *)iph_arg;
1847 	iaflags_t	ixaflags = ixa->ixa_flags;
1848 	ill_t		*ill;
1849 
1850 	ASSERT(ixa->ixa_nce != NULL);
1851 	ill = ixa->ixa_nce->nce_ill;
1852 
1853 	if (ixaflags & IXAF_DONTROUTE)
1854 		ipha->ipha_ttl = 1;
1855 
1856 	/*
1857 	 * Assign an ident value for this packet. There could be other
1858 	 * threads targeting the same destination, so we have to arrange
1859 	 * for a atomic increment.  Note that we use a 32-bit atomic add
1860 	 * because it has better performance than its 16-bit sibling.
1861 	 *
1862 	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
1863 	 * be the number of TCP segments  that the driver/hardware will
1864 	 * extraly construct.
1865 	 *
1866 	 * If running in cluster mode and if the source address
1867 	 * belongs to a replicated service then vector through
1868 	 * cl_inet_ipident vector to allocate ip identifier
1869 	 * NOTE: This is a contract private interface with the
1870 	 * clustering group.
1871 	 */
1872 	if (cl_inet_ipident != NULL) {
1873 		ipaddr_t src = ipha->ipha_src;
1874 		ipaddr_t dst = ipha->ipha_dst;
1875 		netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
1876 
1877 		ASSERT(cl_inet_isclusterwide != NULL);
1878 		if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
1879 		    AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
1880 			/*
1881 			 * Note: not correct with LSO since we can't allocate
1882 			 * ixa_extra_ident+1 consecutive values.
1883 			 */
1884 			ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
1885 			    IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
1886 			    (uint8_t *)(uintptr_t)dst, NULL);
1887 		} else {
1888 			ipha->ipha_ident = atomic_add_32_nv(identp,
1889 			    ixa->ixa_extra_ident + 1);
1890 		}
1891 	} else {
1892 		ipha->ipha_ident = atomic_add_32_nv(identp,
1893 		    ixa->ixa_extra_ident + 1);
1894 	}
1895 #ifndef _BIG_ENDIAN
1896 	ipha->ipha_ident = htons(ipha->ipha_ident);
1897 #endif
1898 
1899 	/*
1900 	 * This might set b_band, thus the IPsec and fragmentation
1901 	 * code in IP ensures that b_band is updated in the first mblk.
1902 	 */
1903 	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
1904 		/* ip_process translates an IS_UNDER_IPMP */
1905 		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
1906 		if (mp == NULL) {
1907 			/* ip_drop_packet and MIB done */
1908 			return (0);	/* Might just be delayed */
1909 		}
1910 	}
1911 
1912 	/*
1913 	 * Verify any IPv4 options.
1914 	 *
1915 	 * The presence of IP options also forces the network stack to
1916 	 * calculate the checksum in software.  This is because:
1917 	 *
1918 	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
1919 	 * the size of "start offset" width to 6-bit.  This effectively
1920 	 * sets the largest value of the offset to 64-bytes, starting
1921 	 * from the MAC header.  When the cumulative MAC and IP headers
1922 	 * exceed such limit, the offset will wrap around.  This causes
1923 	 * the checksum to be calculated at the wrong place.
1924 	 *
1925 	 * IPv4 source routing: none of the full-checksum capable NICs
1926 	 * is capable of correctly handling the	IPv4 source-routing
1927 	 * option for purposes of calculating the pseudo-header; the
1928 	 * actual destination is different from the destination in the
1929 	 * header which is that of the next-hop.  (This case may not be
1930 	 * true for NICs which can parse IPv6 extension headers, but
1931 	 * we choose to simplify the implementation by not offloading
1932 	 * checksum when they are present.)
1933 	 */
1934 	if (!IS_SIMPLE_IPH(ipha)) {
1935 		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
1936 		/* An IS_UNDER_IPMP ill is ok here */
1937 		if (ip_output_options(mp, ipha, ixa, ill)) {
1938 			/* Packet has been consumed and ICMP error sent */
1939 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1940 			return (EINVAL);
1941 		}
1942 	}
1943 
1944 	/*
1945 	 * To handle IPsec/iptun's labeling needs we need to tag packets
1946 	 * while we still have ixa_tsl
1947 	 */
1948 	if (is_system_labeled() && ixa->ixa_tsl != NULL &&
1949 	    (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
1950 	    ill->ill_mactype == DL_IPV6)) {
1951 		cred_t *newcr;
1952 
1953 		newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
1954 		    KM_NOSLEEP);
1955 		if (newcr == NULL) {
1956 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1957 			ip_drop_output("ipIfStatsOutDiscards - newcr",
1958 			    mp, ill);
1959 			freemsg(mp);
1960 			return (ENOBUFS);
1961 		}
1962 		mblk_setcred(mp, newcr, NOPID);
1963 		crfree(newcr);	/* mblk_setcred did its own crhold */
1964 	}
1965 
1966 	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
1967 	    (ixaflags & IXAF_IPSEC_SECURE)) {
1968 		uint32_t pktlen;
1969 
1970 		pktlen = ixa->ixa_pktlen;
1971 		if (ixaflags & IXAF_IPSEC_SECURE)
1972 			pktlen += ipsec_out_extra_length(ixa);
1973 
1974 		if (pktlen > IP_MAXPACKET)
1975 			return (EMSGSIZE);
1976 
1977 		if (ixaflags & IXAF_SET_ULP_CKSUM) {
1978 			/*
1979 			 * Compute ULP checksum and IP header checksum
1980 			 * using software
1981 			 */
1982 			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1983 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1984 				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1985 				freemsg(mp);
1986 				return (EINVAL);
1987 			}
1988 		} else {
1989 			/* Calculate IPv4 header checksum */
1990 			ipha->ipha_hdr_checksum = 0;
1991 			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1992 		}
1993 
1994 		/*
1995 		 * If this packet would generate a icmp_frag_needed
1996 		 * message, we need to handle it before we do the IPsec
1997 		 * processing. Otherwise, we need to strip the IPsec
1998 		 * headers before we send up the message to the ULPs
1999 		 * which becomes messy and difficult.
2000 		 *
2001 		 * We check using IXAF_DONTFRAG. The DF bit in the header
2002 		 * is not inspected - it will be copied to any generated
2003 		 * fragments.
2004 		 */
2005 		if ((pktlen > ixa->ixa_fragsize) &&
2006 		    (ixaflags & IXAF_DONTFRAG)) {
2007 			/* Generate ICMP and return error */
2008 			ip_recv_attr_t	iras;
2009 
2010 			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
2011 			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2012 			    uint_t, ixa->ixa_pmtu);
2013 
2014 			bzero(&iras, sizeof (iras));
2015 			/* Map ixa to ira including IPsec policies */
2016 			ipsec_out_to_in(ixa, ill, &iras);
2017 
2018 			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
2019 			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
2020 			/* We moved any IPsec refs from ixa to iras */
2021 			ira_cleanup(&iras, B_FALSE);
2022 			return (EMSGSIZE);
2023 		}
2024 		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
2025 		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2026 		    uint_t, ixa->ixa_pmtu);
2027 
2028 		if (ixaflags & IXAF_IPSEC_SECURE) {
2029 			/*
2030 			 * Pass in sufficient information so that
2031 			 * IPsec can determine whether to fragment, and
2032 			 * which function to call after fragmentation.
2033 			 */
2034 			return (ipsec_out_process(mp, ixa));
2035 		}
2036 		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
2037 		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
2038 		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
2039 		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
2040 	}
2041 	if (ixaflags & IXAF_SET_ULP_CKSUM) {
2042 		/* Compute ULP checksum and IP header checksum */
2043 		/* An IS_UNDER_IPMP ill is ok here */
2044 		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
2045 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2046 			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2047 			freemsg(mp);
2048 			return (EINVAL);
2049 		}
2050 	} else {
2051 		/* Calculate IPv4 header checksum */
2052 		ipha->ipha_hdr_checksum = 0;
2053 		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2054 	}
2055 	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
2056 	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
2057 	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
2058 }
2059 
2060 /*
2061  * Send mp into ip_input
2062  * Common for IPv4 and IPv6
2063  */
2064 void
2065 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2066     uint_t pkt_len, zoneid_t nolzid)
2067 {
2068 	rtc_t		rtc;
2069 	ill_t		*ill = nce->nce_ill;
2070 	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
2071 	ncec_t		*ncec;
2072 
2073 	ncec = nce->nce_common;
2074 	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
2075 	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
2076 	if (ncec->ncec_flags & NCE_F_BCAST)
2077 		iras.ira_flags |= IRAF_L2DST_BROADCAST;
2078 	else if (ncec->ncec_flags & NCE_F_MCAST)
2079 		iras.ira_flags |= IRAF_L2DST_MULTICAST;
2080 
2081 	iras.ira_free_flags = 0;
2082 	iras.ira_cred = NULL;
2083 	iras.ira_cpid = NOPID;
2084 	iras.ira_tsl = NULL;
2085 	iras.ira_zoneid = ALL_ZONES;
2086 	iras.ira_pktlen = pkt_len;
2087 	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
2088 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
2089 
2090 	if (ixaflags & IXAF_IS_IPV4)
2091 		iras.ira_flags |= IRAF_IS_IPV4;
2092 
2093 	iras.ira_ill = iras.ira_rill = ill;
2094 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2095 	iras.ira_rifindex = iras.ira_ruifindex;
2096 	iras.ira_mhip = NULL;
2097 
2098 	iras.ira_flags |= ixaflags & IAF_MASK;
2099 	iras.ira_no_loop_zoneid = nolzid;
2100 
2101 	/* Broadcast and multicast doesn't care about the squeue */
2102 	iras.ira_sqp = NULL;
2103 
2104 	rtc.rtc_ire = NULL;
2105 	if (ixaflags & IXAF_IS_IPV4) {
2106 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2107 
2108 		rtc.rtc_ipaddr = INADDR_ANY;
2109 
2110 		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
2111 		if (rtc.rtc_ire != NULL) {
2112 			ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
2113 			ire_refrele(rtc.rtc_ire);
2114 		}
2115 	} else {
2116 		ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
2117 
2118 		rtc.rtc_ip6addr = ipv6_all_zeros;
2119 
2120 		(*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
2121 		if (rtc.rtc_ire != NULL) {
2122 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
2123 			ire_refrele(rtc.rtc_ire);
2124 		}
2125 	}
2126 	/* Any references to clean up? No hold on ira */
2127 	if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
2128 		ira_cleanup(&iras, B_FALSE);
2129 }
2130 
2131 /*
2132  * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
2133  * looks at the IXAF_LOOPBACK_COPY flag.
2134  * Common for IPv4 and IPv6.
2135  *
2136  * If the loopback copy fails (due to no memory) but we send the packet out
2137  * on the wire we return no failure. Only in the case we supress the wire
2138  * sending do we take the loopback failure into account.
2139  *
2140  * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
2141  * Those operations are performed on this packet in ip_xmit() and it would
2142  * be odd to do it twice for the same packet.
2143  */
2144 int
2145 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2146     uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2147     uintptr_t *ixacookie)
2148 {
2149 	ill_t		*ill = nce->nce_ill;
2150 	int		error = 0;
2151 
2152 	/*
2153 	 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
2154 	 * had looped it back
2155 	 */
2156 	if (ixaflags & IXAF_LOOPBACK_COPY) {
2157 		mblk_t		*mp1;
2158 
2159 		mp1 = copymsg(mp);
2160 		if (mp1 == NULL) {
2161 			/* Failed to deliver the loopback copy. */
2162 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2163 			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2164 			error = ENOBUFS;
2165 		} else {
2166 			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2167 			    nolzid);
2168 		}
2169 	}
2170 
2171 	/*
2172 	 * If TTL = 0 then only do the loopback to this host i.e. we are
2173 	 * done. We are also done if this was the
2174 	 * loopback interface since it is sufficient
2175 	 * to loopback one copy of a multicast packet.
2176 	 */
2177 	if (ixaflags & IXAF_IS_IPV4) {
2178 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
2179 
2180 		if (ipha->ipha_ttl == 0) {
2181 			ip_drop_output("multicast ipha_ttl not sent to wire",
2182 			    mp, ill);
2183 			freemsg(mp);
2184 			return (error);
2185 		}
2186 	} else {
2187 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2188 
2189 		if (ip6h->ip6_hops == 0) {
2190 			ip_drop_output("multicast ipha_ttl not sent to wire",
2191 			    mp, ill);
2192 			freemsg(mp);
2193 			return (error);
2194 		}
2195 	}
2196 	if (nce->nce_ill->ill_wq == NULL) {
2197 		/* Loopback interface */
2198 		ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
2199 		freemsg(mp);
2200 		return (error);
2201 	}
2202 
2203 	return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2204 	    ixacookie));
2205 }
2206 
2207 /*
2208  * Post fragmentation function for RTF_MULTIRT routes.
2209  * Since IRE_BROADCASTs can have RTF_MULTIRT, this function
2210  * checks IXAF_LOOPBACK_COPY.
2211  *
2212  * If no packet is sent due to failures then we return an errno, but if at
2213  * least one succeeded we return zero.
2214  */
2215 int
2216 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2217     uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2218     uintptr_t *ixacookie)
2219 {
2220 	irb_t		*irb;
2221 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2222 	ire_t		*ire;
2223 	ire_t		*ire1;
2224 	mblk_t		*mp1;
2225 	nce_t		*nce1;
2226 	ill_t		*ill = nce->nce_ill;
2227 	ill_t		*ill1;
2228 	ip_stack_t	*ipst = ill->ill_ipst;
2229 	int		error = 0;
2230 	int		num_sent = 0;
2231 	int		err;
2232 	uint_t		ire_type;
2233 	ipaddr_t	nexthop;
2234 
2235 	ASSERT(ixaflags & IXAF_IS_IPV4);
2236 
2237 	/* Check for IXAF_LOOPBACK_COPY */
2238 	if (ixaflags & IXAF_LOOPBACK_COPY) {
2239 		mblk_t *mp1;
2240 
2241 		mp1 = copymsg(mp);
2242 		if (mp1 == NULL) {
2243 			/* Failed to deliver the loopback copy. */
2244 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2245 			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2246 			error = ENOBUFS;
2247 		} else {
2248 			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2249 			    nolzid);
2250 		}
2251 	}
2252 
2253 	/*
2254 	 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
2255 	 * a copy to each one.
2256 	 * Use the nce (nexthop) and ipha_dst to find the ire.
2257 	 *
2258 	 * MULTIRT is not designed to work with shared-IP zones thus we don't
2259 	 * need to pass a zoneid or a label to the IRE lookup.
2260 	 */
2261 	if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
2262 		/* Broadcast and multicast case */
2263 		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
2264 		    NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2265 	} else {
2266 		ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);
2267 
2268 		/* Unicast case */
2269 		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
2270 		    NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
2271 	}
2272 
2273 	if (ire == NULL ||
2274 	    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2275 	    !(ire->ire_flags & RTF_MULTIRT)) {
2276 		/* Drop */
2277 		ip_drop_output("ip_postfrag_multirt didn't find route",
2278 		    mp, nce->nce_ill);
2279 		if (ire != NULL)
2280 			ire_refrele(ire);
2281 		return (ENETUNREACH);
2282 	}
2283 
2284 	irb = ire->ire_bucket;
2285 	irb_refhold(irb);
2286 	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2287 		/*
2288 		 * For broadcast we can have a mixture of IRE_BROADCAST and
2289 		 * IRE_HOST due to the manually added IRE_HOSTs that are used
2290 		 * to trigger the creation of the special CGTP broadcast routes.
2291 		 * Thus we have to skip if ire_type doesn't match the original.
2292 		 */
2293 		if (IRE_IS_CONDEMNED(ire1) ||
2294 		    !(ire1->ire_flags & RTF_MULTIRT) ||
2295 		    ire1->ire_type != ire->ire_type)
2296 			continue;
2297 
2298 		/* Do the ire argument one after the loop */
2299 		if (ire1 == ire)
2300 			continue;
2301 
2302 		ill1 = ire_nexthop_ill(ire1);
2303 		if (ill1 == NULL) {
2304 			/*
2305 			 * This ire might not have been picked by
2306 			 * ire_route_recursive, in which case ire_dep might
2307 			 * not have been setup yet.
2308 			 * We kick ire_route_recursive to try to resolve
2309 			 * starting at ire1.
2310 			 */
2311 			ire_t *ire2;
2312 			uint_t	match_flags = MATCH_IRE_DSTONLY;
2313 
2314 			if (ire1->ire_ill != NULL)
2315 				match_flags |= MATCH_IRE_ILL;
2316 			ire2 = ire_route_recursive_impl_v4(ire1,
2317 			    ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
2318 			    ire1->ire_zoneid, NULL, match_flags,
2319 			    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2320 			if (ire2 != NULL)
2321 				ire_refrele(ire2);
2322 			ill1 = ire_nexthop_ill(ire1);
2323 		}
2324 
2325 		if (ill1 == NULL) {
2326 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2327 			ip_drop_output("ipIfStatsOutDiscards - no ill",
2328 			    mp, ill);
2329 			error = ENETUNREACH;
2330 			continue;
2331 		}
2332 
2333 		/* Pick the addr and type to use for arp_nce_init */
2334 		if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
2335 			ire_type = IRE_BROADCAST;
2336 			nexthop = ire1->ire_gateway_addr;
2337 		} else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
2338 			ire_type = IRE_MULTICAST;
2339 			nexthop = ipha->ipha_dst;
2340 		} else {
2341 			ire_type = ire1->ire_type;	/* Doesn't matter */
2342 			nexthop = ire1->ire_gateway_addr;
2343 		}
2344 
2345 		/* If IPMP meta or under, then we just drop */
2346 		if (ill1->ill_grp != NULL) {
2347 			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2348 			ip_drop_output("ipIfStatsOutDiscards - IPMP",
2349 			    mp, ill1);
2350 			ill_refrele(ill1);
2351 			error = ENETUNREACH;
2352 			continue;
2353 		}
2354 
2355 		nce1 = arp_nce_init(ill1, nexthop, ire_type);
2356 		if (nce1 == NULL) {
2357 			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2358 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2359 			    mp, ill1);
2360 			ill_refrele(ill1);
2361 			error = ENETUNREACH;
2362 			continue;
2363 		}
2364 		mp1 = copymsg(mp);
2365 		if (mp1 == NULL) {
2366 			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2367 			ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
2368 			nce_refrele(nce1);
2369 			ill_refrele(ill1);
2370 			error = ENOBUFS;
2371 			continue;
2372 		}
2373 		/* Preserve HW checksum for this copy */
2374 		DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
2375 		DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
2376 		DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
2377 		DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
2378 		DB_LSOMSS(mp1) = DB_LSOMSS(mp);
2379 
2380 		ire1->ire_ob_pkt_count++;
2381 		err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
2382 		    0, ixacookie);
2383 		if (err == 0)
2384 			num_sent++;
2385 		else
2386 			error = err;
2387 		nce_refrele(nce1);
2388 		ill_refrele(ill1);
2389 	}
2390 	irb_refrele(irb);
2391 	ire_refrele(ire);
2392 	/* Finally, the main one */
2393 	err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2394 	    ixacookie);
2395 	if (err == 0)
2396 		num_sent++;
2397 	else
2398 		error = err;
2399 	if (num_sent > 0)
2400 		return (0);
2401 	else
2402 		return (error);
2403 }
2404 
2405 /*
2406  * Verify local connectivity. This check is called by ULP fusion code.
2407  * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
2408  * the interface is brought down and back up. So we simply fail the local
2409  * process. The caller, TCP Fusion, should unfuse the connection.
2410  */
2411 boolean_t
2412 ip_output_verify_local(ip_xmit_attr_t *ixa)
2413 {
2414 	ire_t		*ire = ixa->ixa_ire;
2415 
2416 	if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
2417 		return (B_FALSE);
2418 
2419 	return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
2420 }
2421 
2422 /*
2423  * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
2424  *
2425  * The caller must call ip_output_verify_local() first. This function handles
2426  * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
2427  */
2428 mblk_t *
2429 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
2430     boolean_t hooks_in, conn_t *peer_connp)
2431 {
2432 	ill_t		*ill = ixa->ixa_ire->ire_ill;
2433 	ipha_t		*ipha = NULL;
2434 	ip6_t		*ip6h = NULL;
2435 	ip_stack_t	*ipst = ixa->ixa_ipst;
2436 	iaflags_t	ixaflags = ixa->ixa_flags;
2437 	ip_recv_attr_t	iras;
2438 	int		error;
2439 
2440 	ASSERT(mp != NULL);
2441 
2442 	if (ixaflags & IXAF_IS_IPV4) {
2443 		ipha = (ipha_t *)mp->b_rptr;
2444 
2445 		/*
2446 		 * If a callback is enabled then we need to know the
2447 		 * source and destination zoneids for the packet. We already
2448 		 * have those handy.
2449 		 */
2450 		if (ipst->ips_ip4_observe.he_interested) {
2451 			zoneid_t szone, dzone;
2452 			zoneid_t stackzoneid;
2453 
2454 			stackzoneid = netstackid_to_zoneid(
2455 			    ipst->ips_netstack->netstack_stackid);
2456 
2457 			if (stackzoneid == GLOBAL_ZONEID) {
2458 				/* Shared-IP zone */
2459 				dzone = ixa->ixa_ire->ire_zoneid;
2460 				szone = ixa->ixa_zoneid;
2461 			} else {
2462 				szone = dzone = stackzoneid;
2463 			}
2464 			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2465 			    ipst);
2466 		}
2467 		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2468 		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2469 		    NULL, int, 1);
2470 
2471 		/* FW_HOOKS: LOOPBACK_OUT */
2472 		if (hooks_out) {
2473 			DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
2474 			    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
2475 			FW_HOOKS(ipst->ips_ip4_loopback_out_event,
2476 			    ipst->ips_ipv4firewall_loopback_out,
2477 			    NULL, ill, ipha, mp, mp, 0, ipst, error);
2478 			DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
2479 		}
2480 		if (mp == NULL)
2481 			return (NULL);
2482 
2483 		/* FW_HOOKS: LOOPBACK_IN */
2484 		if (hooks_in) {
2485 			DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
2486 			    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
2487 			FW_HOOKS(ipst->ips_ip4_loopback_in_event,
2488 			    ipst->ips_ipv4firewall_loopback_in,
2489 			    ill, NULL, ipha, mp, mp, 0, ipst, error);
2490 			DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
2491 		}
2492 		if (mp == NULL)
2493 			return (NULL);
2494 
2495 		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2496 		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2497 		    NULL, int, 1);
2498 
2499 		/* Inbound IPsec polocies */
2500 		if (peer_connp != NULL) {
2501 			/* Map ixa to ira including IPsec policies. */
2502 			ipsec_out_to_in(ixa, ill, &iras);
2503 			mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
2504 			    NULL, &iras);
2505 		}
2506 	} else {
2507 		ip6h = (ip6_t *)mp->b_rptr;
2508 
2509 		/*
2510 		 * If a callback is enabled then we need to know the
2511 		 * source and destination zoneids for the packet. We already
2512 		 * have those handy.
2513 		 */
2514 		if (ipst->ips_ip6_observe.he_interested) {
2515 			zoneid_t szone, dzone;
2516 			zoneid_t stackzoneid;
2517 
2518 			stackzoneid = netstackid_to_zoneid(
2519 			    ipst->ips_netstack->netstack_stackid);
2520 
2521 			if (stackzoneid == GLOBAL_ZONEID) {
2522 				/* Shared-IP zone */
2523 				dzone = ixa->ixa_ire->ire_zoneid;
2524 				szone = ixa->ixa_zoneid;
2525 			} else {
2526 				szone = dzone = stackzoneid;
2527 			}
2528 			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2529 			    ipst);
2530 		}
2531 		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2532 		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2533 		    ip6h, int, 1);
2534 
2535 		/* FW_HOOKS: LOOPBACK_OUT */
2536 		if (hooks_out) {
2537 			DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
2538 			    ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
2539 			FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
2540 			    ipst->ips_ipv6firewall_loopback_out,
2541 			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
2542 			DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
2543 		}
2544 		if (mp == NULL)
2545 			return (NULL);
2546 
2547 		/* FW_HOOKS: LOOPBACK_IN */
2548 		if (hooks_in) {
2549 			DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
2550 			    ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
2551 			FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
2552 			    ipst->ips_ipv6firewall_loopback_in,
2553 			    ill, NULL, ip6h, mp, mp, 0, ipst, error);
2554 			DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
2555 		}
2556 		if (mp == NULL)
2557 			return (NULL);
2558 
2559 		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2560 		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2561 		    ip6h, int, 1);
2562 
2563 		/* Inbound IPsec polocies */
2564 		if (peer_connp != NULL) {
2565 			/* Map ixa to ira including IPsec policies. */
2566 			ipsec_out_to_in(ixa, ill, &iras);
2567 			mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
2568 			    ip6h, &iras);
2569 		}
2570 	}
2571 
2572 	if (mp == NULL) {
2573 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2574 		ip_drop_input("ipIfStatsInDiscards", NULL, ill);
2575 	}
2576 
2577 	return (mp);
2578 }
2579