1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25 /* Copyright (c) 1990 Mentat Inc. */
26
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsubr.h>
30 #include <sys/dlpi.h>
31 #include <sys/strsun.h>
32 #include <sys/zone.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/atomic.h>
38
39 #include <sys/systm.h>
40 #include <sys/param.h>
41 #include <sys/kmem.h>
42 #include <sys/sdt.h>
43 #include <sys/socket.h>
44 #include <sys/mac.h>
45 #include <net/if.h>
46 #include <net/if_arp.h>
47 #include <net/route.h>
48 #include <sys/sockio.h>
49 #include <netinet/in.h>
50 #include <net/if_dl.h>
51
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/arp.h>
57 #include <inet/snmpcom.h>
58 #include <inet/kstatcom.h>
59
60 #include <netinet/igmp_var.h>
61 #include <netinet/ip6.h>
62 #include <netinet/icmp6.h>
63 #include <netinet/sctp.h>
64
65 #include <inet/ip.h>
66 #include <inet/ip_impl.h>
67 #include <inet/ip6.h>
68 #include <inet/ip6_asp.h>
69 #include <inet/tcp.h>
70 #include <inet/ip_multi.h>
71 #include <inet/ip_if.h>
72 #include <inet/ip_ire.h>
73 #include <inet/ip_ftable.h>
74 #include <inet/ip_rts.h>
75 #include <inet/optcom.h>
76 #include <inet/ip_ndp.h>
77 #include <inet/ip_listutils.h>
78 #include <netinet/igmp.h>
79 #include <netinet/ip_mroute.h>
80 #include <inet/ipp_common.h>
81
82 #include <net/pfkeyv2.h>
83 #include <inet/sadb.h>
84 #include <inet/ipsec_impl.h>
85 #include <inet/ipdrop.h>
86 #include <inet/ip_netinfo.h>
87
88 #include <sys/pattr.h>
89 #include <inet/ipclassifier.h>
90 #include <inet/sctp_ip.h>
91 #include <inet/sctp/sctp_impl.h>
92 #include <inet/udp_impl.h>
93 #include <sys/sunddi.h>
94
95 #include <sys/tsol/label.h>
96 #include <sys/tsol/tnet.h>
97
98 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */
99
#ifdef DEBUG
/* Defined outside this file; only referenced in DEBUG builds. */
extern boolean_t skip_sctp_cksum;
#endif

/*
 * Forward declarations for file-local helpers.  The ip_verify_*() routines
 * revalidate state cached in an ip_xmit_attr_t before a packet is sent.
 */
static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
109
/*
 * There are two types of output functions for IP used for different
 * purposes:
 *  - ip_output_simple() is used when sending ICMP errors, TCP resets, etc.
 *    when there is no context in the form of a conn_t. However, there is an
 *    ip_xmit_attr_t that the callers use to influence interface selection
 *    (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
 *
 *  - conn_ip_output() is used when sending packets with a conn_t and
 *    ip_set_destination has been called to cache information. In that case
 *    various socket options are recorded in the ip_xmit_attr_t and should
 *    be taken into account.
 */
123
124 /*
125 * The caller *must* have called conn_connect() or ip_attr_connect()
126 * before calling conn_ip_output(). The caller needs to redo that each time
127 * the destination IP address or port changes, as well as each time there is
128 * a change to any socket option that would modify how packets are routed out
129 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
130 *
131 * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
132 * We assert for that here.
133 */
134 int
conn_ip_output(mblk_t * mp,ip_xmit_attr_t * ixa)135 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
136 {
137 iaflags_t ixaflags = ixa->ixa_flags;
138 ire_t *ire;
139 nce_t *nce;
140 dce_t *dce;
141 ill_t *ill;
142 ip_stack_t *ipst = ixa->ixa_ipst;
143 int error;
144
145 /* We defer ipIfStatsHCOutRequests until an error or we have an ill */
146
147 ASSERT(ixa->ixa_ire != NULL);
148 /* Note there is no ixa_nce when reject and blackhole routes */
149 ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */
150
151 #ifdef DEBUG
152 ASSERT(ixa->ixa_curthread == NULL);
153 ixa->ixa_curthread = curthread;
154 #endif
155
156 /*
157 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
158 * for IGMP/MLD traffic.
159 */
160
161 ire = ixa->ixa_ire;
162
163 /*
164 * If the ULP says the (old) IRE resulted in reachability we
165 * record this before determine whether to use a new IRE.
166 * No locking for performance reasons.
167 */
168 if (ixaflags & IXAF_REACH_CONF)
169 ire->ire_badcnt = 0;
170
171 /*
172 * Has routing changed since we cached the results of the lookup?
173 *
174 * This check captures all of:
175 * - the cached ire being deleted (by means of the special
176 * IRE_GENERATION_CONDEMNED)
177 * - A potentially better ire being added (ire_generation being
178 * increased)
179 * - A deletion of the nexthop ire that was used when we did the
180 * lookup.
181 * - An addition of a potentially better nexthop ire.
182 * The last two are handled by walking and increasing the generation
183 * number on all dependant IREs in ire_flush_cache().
184 *
185 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
186 * since we ensure that each time we set ixa_ire to such an IRE we
187 * make sure the ixa_ire_generation does not match (by using
188 * IRE_GENERATION_VERIFY).
189 */
190 if (ire->ire_generation != ixa->ixa_ire_generation) {
191 error = ip_verify_ire(mp, ixa);
192 if (error != 0) {
193 ip_drop_output("ipIfStatsOutDiscards - verify ire",
194 mp, NULL);
195 goto drop;
196 }
197 ire = ixa->ixa_ire;
198 ASSERT(ire != NULL);
199 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
200 #ifdef DEBUG
201 ASSERT(ixa->ixa_curthread == curthread);
202 ixa->ixa_curthread = NULL;
203 #endif
204 ire->ire_ob_pkt_count++;
205 /* ixa_dce might be condemned; use default one */
206 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
207 &ipst->ips_dce_default->dce_ident));
208 }
209 /*
210 * If the ncec changed then ip_verify_ire already set
211 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
212 * so we can recheck the interface mtu.
213 */
214
215 /*
216 * Note that ire->ire_generation could already have changed.
217 * We catch that next time we send a packet.
218 */
219 }
220
221 /*
222 * No need to lock access to ixa_nce since the ip_xmit_attr usage
223 * is single threaded.
224 */
225 ASSERT(ixa->ixa_nce != NULL);
226 nce = ixa->ixa_nce;
227 if (nce->nce_is_condemned) {
228 error = ip_verify_nce(mp, ixa);
229 /*
230 * In case ZEROCOPY capability become not available, we
231 * copy the message and free the original one. We might
232 * be copying more data than needed but it doesn't hurt
233 * since such change rarely happens.
234 */
235 switch (error) {
236 case 0:
237 break;
238 case ENOTSUP: { /* ZEROCOPY */
239 mblk_t *nmp;
240
241 if ((nmp = copymsg(mp)) != NULL) {
242 freemsg(mp);
243 mp = nmp;
244
245 break;
246 }
247 /* FALLTHROUGH */
248 }
249 default:
250 ip_drop_output("ipIfStatsOutDiscards - verify nce",
251 mp, NULL);
252 goto drop;
253 }
254 ire = ixa->ixa_ire;
255 ASSERT(ire != NULL);
256 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
257 #ifdef DEBUG
258 ASSERT(ixa->ixa_curthread == curthread);
259 ixa->ixa_curthread = NULL;
260 #endif
261 ire->ire_ob_pkt_count++;
262 /* ixa_dce might be condemned; use default one */
263 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
264 ixa, &ipst->ips_dce_default->dce_ident));
265 }
266 ASSERT(ixa->ixa_nce != NULL);
267 nce = ixa->ixa_nce;
268
269 /*
270 * Note that some other event could already have made
271 * the new nce condemned. We catch that next time we
272 * try to send a packet.
273 */
274 }
275 /*
276 * If there is no per-destination dce_t then we have a reference to
277 * the default dce_t (which merely contains the dce_ipid).
278 * The generation check captures both the introduction of a
279 * per-destination dce_t (e.g., due to ICMP packet too big) and
280 * any change to the per-destination dce (including it becoming
281 * condemned by use of the special DCE_GENERATION_CONDEMNED).
282 */
283 dce = ixa->ixa_dce;
284
285 /*
286 * To avoid a periodic timer to increase the path MTU we
287 * look at dce_last_change_time each time we send a packet.
288 */
289 if (dce->dce_flags & DCEF_PMTU) {
290 int64_t now = LBOLT_FASTPATH64;
291
292 if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
293 ipst->ips_ip_pathmtu_interval)) {
294 /*
295 * Older than 20 minutes. Drop the path MTU information.
296 * Since the path MTU changes as a result of this,
297 * twiddle ixa_dce_generation to make us go through the
298 * dce verification code in conn_ip_output.
299 */
300 mutex_enter(&dce->dce_lock);
301 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
302 dce->dce_last_change_time = TICK_TO_SEC(now);
303 mutex_exit(&dce->dce_lock);
304 dce_increment_generation(dce);
305 }
306 }
307
308 if (dce->dce_generation != ixa->ixa_dce_generation) {
309 error = ip_verify_dce(mp, ixa);
310 if (error != 0) {
311 ip_drop_output("ipIfStatsOutDiscards - verify dce",
312 mp, NULL);
313 goto drop;
314 }
315 dce = ixa->ixa_dce;
316
317 /*
318 * Note that some other event could already have made the
319 * new dce's generation number change.
320 * We catch that next time we try to send a packet.
321 */
322 }
323
324 ill = nce->nce_ill;
325
326 /*
327 * An initial ixa_fragsize was set in ip_set_destination
328 * and we update it if any routing changes above.
329 * A change to ill_mtu with ifconfig will increase all dce_generation
330 * so that we will detect that with the generation check. Ditto for
331 * ill_mc_mtu.
332 */
333
334 /*
335 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
336 * conn_unspec_src.
337 */
338 if ((ixaflags & IXAF_VERIFY_SOURCE) &&
339 ixa->ixa_src_generation != ipst->ips_src_generation) {
340 /* Check if the IP source is still assigned to the host. */
341 uint_t gen;
342
343 if (!ip_verify_src(mp, ixa, &gen)) {
344 /* Don't send a packet with a source that isn't ours */
345 error = EADDRNOTAVAIL;
346 ip_drop_output("ipIfStatsOutDiscards - invalid src",
347 mp, NULL);
348 goto drop;
349 }
350 /* The source is still valid - update the generation number */
351 ixa->ixa_src_generation = gen;
352 }
353
354 /*
355 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
356 * can only count the use prior to fragmentation. However the MIB
357 * counters on the ill will be incremented in post fragmentation.
358 */
359 ire->ire_ob_pkt_count++;
360 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
361
362 /*
363 * Based on ire_type and ire_flags call one of:
364 * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
365 * ire_send_multirt_v* - if RTF_MULTIRT
366 * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE
367 * ire_send_multicast_v* - for IRE_MULTICAST
368 * ire_send_broadcast_v4 - for IRE_BROADCAST
369 * ire_send_wire_v* - for the rest.
370 */
371 #ifdef DEBUG
372 ASSERT(ixa->ixa_curthread == curthread);
373 ixa->ixa_curthread = NULL;
374 #endif
375 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));
376
377 drop:
378 if (ixaflags & IXAF_IS_IPV4) {
379 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
380 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
381 } else {
382 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
383 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
384 }
385 freemsg(mp);
386 #ifdef DEBUG
387 ASSERT(ixa->ixa_curthread == curthread);
388 ixa->ixa_curthread = NULL;
389 #endif
390 return (error);
391 }
392
393 /*
394 * Handle both IPv4 and IPv6. Sets the generation number
395 * to allow the caller to know when to call us again.
396 * Returns true if the source address in the packet is a valid source.
397 * We handle callers which try to send with a zero address (since we only
398 * get here if UNSPEC_SRC is not set).
399 */
400 boolean_t
ip_verify_src(mblk_t * mp,ip_xmit_attr_t * ixa,uint_t * generationp)401 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
402 {
403 ip_stack_t *ipst = ixa->ixa_ipst;
404
405 /*
406 * Need to grab the generation number before we check to
407 * avoid a race with a change to the set of local addresses.
408 * No lock needed since the thread which updates the set of local
409 * addresses use ipif/ill locks and exit those (hence a store memory
410 * barrier) before doing the atomic increase of ips_src_generation.
411 */
412 if (generationp != NULL)
413 *generationp = ipst->ips_src_generation;
414
415 if (ixa->ixa_flags & IXAF_IS_IPV4) {
416 ipha_t *ipha = (ipha_t *)mp->b_rptr;
417
418 if (ipha->ipha_src == INADDR_ANY)
419 return (B_FALSE);
420
421 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
422 ipst, B_FALSE) != IPVL_BAD);
423 } else {
424 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
425 uint_t scopeid;
426
427 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
428 return (B_FALSE);
429
430 if (ixa->ixa_flags & IXAF_SCOPEID_SET)
431 scopeid = ixa->ixa_scopeid;
432 else
433 scopeid = 0;
434
435 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
436 ipst, B_FALSE, scopeid) != IPVL_BAD);
437 }
438 }
439
440 /*
441 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
442 */
443 int
ip_verify_ire(mblk_t * mp,ip_xmit_attr_t * ixa)444 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
445 {
446 uint_t gen;
447 ire_t *ire;
448 nce_t *nce;
449 int error;
450 boolean_t multirt = B_FALSE;
451
452 /*
453 * Redo ip_select_route.
454 * Need to grab generation number as part of the lookup to
455 * avoid race.
456 */
457 error = 0;
458 ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
459 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
460 if (error != 0) {
461 ire_refrele(ire);
462 return (error);
463 }
464
465 if (ixa->ixa_ire != NULL)
466 ire_refrele_notr(ixa->ixa_ire);
467 #ifdef DEBUG
468 ire_refhold_notr(ire);
469 ire_refrele(ire);
470 #endif
471 ixa->ixa_ire = ire;
472 ixa->ixa_ire_generation = gen;
473 if (multirt) {
474 if (ixa->ixa_flags & IXAF_IS_IPV4)
475 ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
476 else
477 ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
478 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
479 } else {
480 ixa->ixa_postfragfn = ire->ire_postfragfn;
481 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
482 }
483
484 /*
485 * Don't look for an nce for reject or blackhole.
486 * They have ire_generation set to IRE_GENERATION_VERIFY which
487 * makes conn_ip_output avoid references to ixa_nce.
488 */
489 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
490 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
491 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
492 return (0);
493 }
494
495 /* The NCE could now be different */
496 nce = ire_to_nce_pkt(ire, mp);
497 if (nce == NULL) {
498 /*
499 * Allocation failure. Make sure we redo ire/nce selection
500 * next time we send.
501 */
502 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
503 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
504 return (ENOBUFS);
505 }
506 if (nce == ixa->ixa_nce) {
507 /* No change */
508 nce_refrele(nce);
509 return (0);
510 }
511
512 /*
513 * Since the path MTU might change as a result of this
514 * route change, we twiddle ixa_dce_generation to
515 * make conn_ip_output go through the ip_verify_dce code.
516 */
517 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
518
519 if (ixa->ixa_nce != NULL)
520 nce_refrele(ixa->ixa_nce);
521 ixa->ixa_nce = nce;
522 return (0);
523 }
524
525 /*
526 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
527 */
528 static int
ip_verify_nce(mblk_t * mp,ip_xmit_attr_t * ixa)529 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
530 {
531 ire_t *ire = ixa->ixa_ire;
532 nce_t *nce;
533 int error = 0;
534 ipha_t *ipha = NULL;
535 ip6_t *ip6h = NULL;
536
537 if (ire->ire_ipversion == IPV4_VERSION)
538 ipha = (ipha_t *)mp->b_rptr;
539 else
540 ip6h = (ip6_t *)mp->b_rptr;
541
542 nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
543 if (nce == NULL) {
544 /* Try to find a better ire */
545 return (ip_verify_ire(mp, ixa));
546 }
547
548 /*
549 * The hardware offloading capabilities, for example LSO, of the
550 * interface might have changed, so do sanity verification here.
551 */
552 if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
553 if (!ip_verify_lso(nce->nce_ill, ixa)) {
554 ASSERT(ixa->ixa_notify != NULL);
555 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
556 IXAN_LSO, 0);
557 error = ENOTSUP;
558 }
559 }
560
561 /*
562 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
563 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
564 * any more, return error so that conn_ip_output() can take care of
565 * the ZEROCOPY message properly. It's safe to continue send the
566 * message when ZEROCOPY newly become available.
567 */
568 if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
569 if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
570 ASSERT(ixa->ixa_notify != NULL);
571 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
572 IXAN_ZCOPY, 0);
573 if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
574 error = ENOTSUP;
575 }
576 }
577
578 /*
579 * Since the path MTU might change as a result of this
580 * change, we twiddle ixa_dce_generation to
581 * make conn_ip_output go through the ip_verify_dce code.
582 */
583 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
584
585 nce_refrele(ixa->ixa_nce);
586 ixa->ixa_nce = nce;
587 return (error);
588 }
589
590 /*
591 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
592 */
593 static int
ip_verify_dce(mblk_t * mp,ip_xmit_attr_t * ixa)594 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
595 {
596 dce_t *dce;
597 uint_t gen;
598 uint_t pmtu;
599
600 dce = dce_lookup_pkt(mp, ixa, &gen);
601 ASSERT(dce != NULL);
602
603 dce_refrele_notr(ixa->ixa_dce);
604 #ifdef DEBUG
605 dce_refhold_notr(dce);
606 dce_refrele(dce);
607 #endif
608 ixa->ixa_dce = dce;
609 ixa->ixa_dce_generation = gen;
610
611 /* Extract the (path) mtu from the dce, ncec_ill etc */
612 pmtu = ip_get_pmtu(ixa);
613
614 /*
615 * Tell ULP about PMTU changes - increase or decrease - by returning
616 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
617 * both ixa_pmtu and ixa_fragsize appropriately.
618 *
619 * If ULP doesn't set that flag then we need to update ixa_fragsize
620 * since routing could have changed the ill after after ixa_fragsize
621 * was set previously in the conn_ip_output path or in
622 * ip_set_destination.
623 *
624 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
625 *
626 * In the case of a path MTU increase we send the packet after the
627 * notify to the ULP.
628 */
629 if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
630 if (ixa->ixa_pmtu != pmtu) {
631 uint_t oldmtu = ixa->ixa_pmtu;
632
633 DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
634 uint32_t, ixa->ixa_pmtu);
635 ASSERT(ixa->ixa_notify != NULL);
636 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
637 IXAN_PMTU, pmtu);
638 if (pmtu < oldmtu)
639 return (EMSGSIZE);
640 }
641 } else {
642 ixa->ixa_fragsize = pmtu;
643 }
644 return (0);
645 }
646
647 /*
648 * Verify LSO usability. Keep the return value simple to indicate whether
649 * the LSO capability has changed. Handle both IPv4 and IPv6.
650 */
651 static boolean_t
ip_verify_lso(ill_t * ill,ip_xmit_attr_t * ixa)652 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
653 {
654 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab;
655 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab;
656
657 if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
658 /*
659 * Not unsable any more.
660 */
661 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
662 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
663 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
664 ((ixa->ixa_flags & IXAF_IS_IPV4) ?
665 !ILL_LSO_TCP_IPV4_USABLE(ill) :
666 !ILL_LSO_TCP_IPV6_USABLE(ill))) {
667 ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
668
669 return (B_FALSE);
670 }
671
672 /*
673 * Capability has changed, refresh the copy in ixa.
674 */
675 if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) {
676 *lsoc = *new_lsoc;
677
678 return (B_FALSE);
679 }
680 } else { /* Was not usable */
681 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
682 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
683 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
684 ((ixa->ixa_flags & IXAF_IS_IPV4) ?
685 ILL_LSO_TCP_IPV4_USABLE(ill) :
686 ILL_LSO_TCP_IPV6_USABLE(ill))) {
687 *lsoc = *new_lsoc;
688 ixa->ixa_flags |= IXAF_LSO_CAPAB;
689
690 return (B_FALSE);
691 }
692 }
693
694 return (B_TRUE);
695 }
696
697 /*
698 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
699 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
700 */
701 static boolean_t
ip_verify_zcopy(ill_t * ill,ip_xmit_attr_t * ixa)702 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
703 {
704 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
705 /*
706 * Not unsable any more.
707 */
708 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
709 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
710 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
711 !ILL_ZCOPY_USABLE(ill)) {
712 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
713
714 return (B_FALSE);
715 }
716 } else { /* Was not usable */
717 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
718 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
719 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
720 ILL_ZCOPY_USABLE(ill)) {
721 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
722
723 return (B_FALSE);
724 }
725 }
726
727 return (B_TRUE);
728 }
729
730
731 /*
732 * When there is no conn_t context, this will send a packet.
733 * The caller must *not* have called conn_connect() or ip_attr_connect()
734 * before calling ip_output_simple().
735 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
736 * Honors IXAF_SET_SOURCE.
737 *
738 * We acquire the ire and after calling ire_sendfn we release
739 * the hold on the ire. Ditto for the nce and dce.
740 *
741 * This assumes that the caller has set the following in ip_xmit_attr_t:
742 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
743 * If ixa_ifindex is non-zero it means send out that ill. (If it is
744 * an upper IPMP ill we load balance across the group; if a lower we send
745 * on that lower ill without load balancing.)
746 * IXAF_IS_IPV4 must be set correctly.
747 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
748 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
749 * If neither of those two are set we do an IPsec policy lookup.
750 *
751 * We handle setting things like
752 * ixa_pktlen
753 * ixa_ip_hdr_length
754 * ixa->ixa_protocol
755 *
756 * The caller may set ixa_xmit_hint, which is used for ECMP selection and
757 * transmit ring selecting in GLD.
758 *
759 * The caller must do an ixa_cleanup() to release any IPsec references
760 * after we return.
761 */
762 int
ip_output_simple(mblk_t * mp,ip_xmit_attr_t * ixa)763 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
764 {
765 ts_label_t *effective_tsl = NULL;
766 int err;
767
768 ASSERT(ixa->ixa_ipst != NULL);
769
770 if (is_system_labeled()) {
771 ip_stack_t *ipst = ixa->ixa_ipst;
772
773 if (ixa->ixa_flags & IXAF_IS_IPV4) {
774 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
775 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
776 &effective_tsl);
777 } else {
778 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
779 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
780 &effective_tsl);
781 }
782 if (err != 0) {
783 ip2dbg(("tsol_check: label check failed (%d)\n", err));
784 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
785 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
786 ip_drop_output("tsol_check_label", mp, NULL);
787 freemsg(mp);
788 return (err);
789 }
790 if (effective_tsl != NULL) {
791 /* Update the label */
792 ip_xmit_attr_replace_tsl(ixa, effective_tsl);
793 }
794 }
795
796 if (ixa->ixa_flags & IXAF_IS_IPV4)
797 return (ip_output_simple_v4(mp, ixa));
798 else
799 return (ip_output_simple_v6(mp, ixa));
800 }
801
802 int
ip_output_simple_v4(mblk_t * mp,ip_xmit_attr_t * ixa)803 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
804 {
805 ipha_t *ipha;
806 ipaddr_t firsthop; /* In IP header */
807 ipaddr_t dst; /* End of source route, or ipha_dst if none */
808 ire_t *ire;
809 ipaddr_t setsrc; /* RTF_SETSRC */
810 int error;
811 ill_t *ill = NULL;
812 dce_t *dce = NULL;
813 nce_t *nce;
814 iaflags_t ixaflags = ixa->ixa_flags;
815 ip_stack_t *ipst = ixa->ixa_ipst;
816 boolean_t repeat = B_FALSE;
817 boolean_t multirt = B_FALSE;
818 int64_t now;
819
820 ipha = (ipha_t *)mp->b_rptr;
821 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
822
823 /*
824 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
825 * for IGMP/MLD traffic.
826 */
827
828 /* Caller already set flags */
829 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
830
831 ASSERT(ixa->ixa_nce == NULL);
832
833 ixa->ixa_pktlen = ntohs(ipha->ipha_length);
834 ASSERT(ixa->ixa_pktlen == msgdsize(mp));
835 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
836 ixa->ixa_protocol = ipha->ipha_protocol;
837
838 /*
839 * Assumes that source routed packets have already been massaged by
840 * the ULP (ip_massage_options) and as a result ipha_dst is the next
841 * hop in the source route. The final destination is used for IPsec
842 * policy and DCE lookup.
843 */
844 firsthop = ipha->ipha_dst;
845 dst = ip_get_dst(ipha);
846
847 repeat_ire:
848 error = 0;
849 setsrc = INADDR_ANY;
850 ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
851 &setsrc, &error, &multirt);
852 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
853 if (error != 0) {
854 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
855 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
856 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
857 freemsg(mp);
858 goto done;
859 }
860
861 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
862 /* ire_ill might be NULL hence need to skip some code */
863 if (ixaflags & IXAF_SET_SOURCE)
864 ipha->ipha_src = htonl(INADDR_LOOPBACK);
865 ixa->ixa_fragsize = IP_MAXPACKET;
866 ill = NULL;
867 nce = NULL;
868 ire->ire_ob_pkt_count++;
869 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
870 /* No dce yet; use default one */
871 error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
872 &ipst->ips_dce_default->dce_ident);
873 goto done;
874 }
875
876 /* Note that ipha_dst is only used for IRE_MULTICAST */
877 nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
878 if (nce == NULL) {
879 /* Allocation failure? */
880 ip_drop_output("ire_to_nce", mp, ill);
881 freemsg(mp);
882 error = ENOBUFS;
883 goto done;
884 }
885 if (nce->nce_is_condemned) {
886 nce_t *nce1;
887
888 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
889 nce_refrele(nce);
890 if (nce1 == NULL) {
891 if (!repeat) {
892 /* Try finding a better IRE */
893 repeat = B_TRUE;
894 ire_refrele(ire);
895 goto repeat_ire;
896 }
897 /* Tried twice - drop packet */
898 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
899 ip_drop_output("No nce", mp, ill);
900 freemsg(mp);
901 error = ENOBUFS;
902 goto done;
903 }
904 nce = nce1;
905 }
906
907 /*
908 * For multicast with multirt we have a flag passed back from
909 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
910 * possible multicast address.
911 * We also need a flag for multicast since we can't check
912 * whether RTF_MULTIRT is set in ixa_ire for multicast.
913 */
914 if (multirt) {
915 ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
916 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
917 } else {
918 ixa->ixa_postfragfn = ire->ire_postfragfn;
919 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
920 }
921 ASSERT(ixa->ixa_nce == NULL);
922 ixa->ixa_nce = nce;
923
924 /*
925 * Check for a dce_t with a path mtu.
926 */
927 dce = dce_lookup_v4(dst, ipst, NULL);
928 ASSERT(dce != NULL);
929
930 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
931 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
932 } else if (dce->dce_flags & DCEF_PMTU) {
933 /*
934 * To avoid a periodic timer to increase the path MTU we
935 * look at dce_last_change_time each time we send a packet.
936 */
937 now = ddi_get_lbolt64();
938 if (TICK_TO_SEC(now) - dce->dce_last_change_time >
939 ipst->ips_ip_pathmtu_interval) {
940 /*
941 * Older than 20 minutes. Drop the path MTU information.
942 */
943 mutex_enter(&dce->dce_lock);
944 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
945 dce->dce_last_change_time = TICK_TO_SEC(now);
946 mutex_exit(&dce->dce_lock);
947 dce_increment_generation(dce);
948 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
949 } else {
950 uint_t fragsize;
951
952 fragsize = ip_get_base_mtu(nce->nce_ill, ire);
953 if (fragsize > dce->dce_pmtu)
954 fragsize = dce->dce_pmtu;
955 ixa->ixa_fragsize = fragsize;
956 }
957 } else {
958 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
959 }
960
961 /*
962 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
963 * interface for source address selection.
964 */
965 ill = ire_nexthop_ill(ire);
966
967 if (ixaflags & IXAF_SET_SOURCE) {
968 ipaddr_t src;
969
970 /*
971 * We use the final destination to get
972 * correct selection for source routed packets
973 */
974
975 /* If unreachable we have no ill but need some source */
976 if (ill == NULL) {
977 src = htonl(INADDR_LOOPBACK);
978 error = 0;
979 } else {
980 error = ip_select_source_v4(ill, setsrc, dst,
981 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
982 &src, NULL, NULL);
983 }
984 if (error != 0) {
985 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
986 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
987 ip_drop_output("ipIfStatsOutDiscards - no source",
988 mp, ill);
989 freemsg(mp);
990 goto done;
991 }
992 ipha->ipha_src = src;
993 } else if (ixaflags & IXAF_VERIFY_SOURCE) {
994 /* Check if the IP source is assigned to the host. */
995 if (!ip_verify_src(mp, ixa, NULL)) {
996 /* Don't send a packet with a source that isn't ours */
997 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
998 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
999 ip_drop_output("ipIfStatsOutDiscards - invalid source",
1000 mp, ill);
1001 freemsg(mp);
1002 error = EADDRNOTAVAIL;
1003 goto done;
1004 }
1005 }
1006
1007
1008 /*
1009 * Check against global IPsec policy to set the AH/ESP attributes.
1010 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
1011 */
1012 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1013 ASSERT(ixa->ixa_ipsec_policy == NULL);
1014 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
1015 if (mp == NULL) {
1016 /* MIB and ip_drop_packet already done */
1017 return (EHOSTUNREACH); /* IPsec policy failure */
1018 }
1019 }
1020
1021 if (ill != NULL) {
1022 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
1023 } else {
1024 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1025 }
1026
1027 /*
1028 * We update the statistics on the most specific IRE i.e., the first
1029 * one we found.
1030 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
1031 * can only count the use prior to fragmentation. However the MIB
1032 * counters on the ill will be incremented in post fragmentation.
1033 */
1034 ire->ire_ob_pkt_count++;
1035
1036 /*
1037 * Based on ire_type and ire_flags call one of:
1038 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
1039 * ire_send_multirt_v4 - if RTF_MULTIRT
1040 * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE
1041 * ire_send_multicast_v4 - for IRE_MULTICAST
1042 * ire_send_broadcast_v4 - for IRE_BROADCAST
1043 * ire_send_wire_v4 - for the rest.
1044 */
1045 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
1046 done:
1047 ire_refrele(ire);
1048 if (dce != NULL)
1049 dce_refrele(dce);
1050 if (ill != NULL)
1051 ill_refrele(ill);
1052 if (ixa->ixa_nce != NULL)
1053 nce_refrele(ixa->ixa_nce);
1054 ixa->ixa_nce = NULL;
1055 return (error);
1056 }
1057
1058 /*
1059 * ire_sendfn() functions.
1060 * These functions use the following xmit_attr:
1061 * - ixa_fragsize - read to determine whether or not to fragment
1062 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
1063 * - ixa_ipsec_* are used inside IPsec
1064 * - IXAF_SET_SOURCE - replace IP source in broadcast case.
1065 * - IXAF_LOOPBACK_COPY - for multicast and broadcast
1066 */
1067
1068
1069 /*
1070 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
1071 *
1072 * The checks for restrict_interzone_loopback are done in ire_route_recursive.
1073 */
1074 /* ARGSUSED4 */
1075 int
ire_send_local_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1076 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1077 ip_xmit_attr_t *ixa, uint32_t *identp)
1078 {
1079 ipha_t *ipha = (ipha_t *)iph_arg;
1080 ip_stack_t *ipst = ixa->ixa_ipst;
1081 ill_t *ill = ire->ire_ill;
1082 ip_recv_attr_t iras; /* NOTE: No bzero for performance */
1083 uint_t pktlen = ixa->ixa_pktlen;
1084
1085 /*
1086 * No fragmentation, no nce, no application of IPsec,
1087 * and no ipha_ident assignment.
1088 *
1089 * Note different order between IP provider and FW_HOOKS than in
1090 * send_wire case.
1091 */
1092
1093 /*
1094 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the
1095 * send probe, but not the receive probe.
1096 */
1097 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1098 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1099 int, 1);
1100
1101 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
1102 int error;
1103
1104 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
1105 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
1106 FW_HOOKS(ipst->ips_ip4_loopback_out_event,
1107 ipst->ips_ipv4firewall_loopback_out,
1108 NULL, ill, ipha, mp, mp, 0, ipst, error);
1109 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
1110 if (mp == NULL)
1111 return (error);
1112
1113 /*
1114 * Even if the destination was changed by the filter we use the
1115 * forwarding decision that was made based on the address
1116 * in ip_output/ip_set_destination.
1117 */
1118 /* Length could be different */
1119 ipha = (ipha_t *)mp->b_rptr;
1120 pktlen = ntohs(ipha->ipha_length);
1121 }
1122
1123 /*
1124 * If a callback is enabled then we need to know the
1125 * source and destination zoneids for the packet. We already
1126 * have those handy.
1127 */
1128 if (ipst->ips_ip4_observe.he_interested) {
1129 zoneid_t szone, dzone;
1130 zoneid_t stackzoneid;
1131
1132 stackzoneid = netstackid_to_zoneid(
1133 ipst->ips_netstack->netstack_stackid);
1134
1135 if (stackzoneid == GLOBAL_ZONEID) {
1136 /* Shared-IP zone */
1137 dzone = ire->ire_zoneid;
1138 szone = ixa->ixa_zoneid;
1139 } else {
1140 szone = dzone = stackzoneid;
1141 }
1142 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
1143 }
1144
1145 /* Handle lo0 stats */
1146 ipst->ips_loopback_packets++;
1147
1148 /* Map ixa to ira including IPsec policies */
1149 ipsec_out_to_in(ixa, ill, &iras);
1150 iras.ira_pktlen = pktlen;
1151
1152 if (!IS_SIMPLE_IPH(ipha)) {
1153 ip_output_local_options(ipha, ipst);
1154 iras.ira_flags |= IRAF_IPV4_OPTIONS;
1155 }
1156
1157 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
1158 int error;
1159
1160 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
1161 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
1162 FW_HOOKS(ipst->ips_ip4_loopback_in_event,
1163 ipst->ips_ipv4firewall_loopback_in,
1164 ill, NULL, ipha, mp, mp, 0, ipst, error);
1165
1166 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
1167 if (mp == NULL) {
1168 ira_cleanup(&iras, B_FALSE);
1169 return (error);
1170 }
1171 /*
1172 * Even if the destination was changed by the filter we use the
1173 * forwarding decision that was made based on the address
1174 * in ip_output/ip_set_destination.
1175 */
1176 /* Length could be different */
1177 ipha = (ipha_t *)mp->b_rptr;
1178 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
1179 }
1180
1181 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1182 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1183 int, 1);
1184
1185 ire->ire_ib_pkt_count++;
1186 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
1187 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
1188
1189 /* Destined to ire_zoneid - use that for fanout */
1190 iras.ira_zoneid = ire->ire_zoneid;
1191
1192 if (is_system_labeled()) {
1193 iras.ira_flags |= IRAF_SYSTEM_LABELED;
1194
1195 /*
1196 * This updates ira_cred, ira_tsl and ira_free_flags based
1197 * on the label. We don't expect this to ever fail for
1198 * loopback packets, so we silently drop the packet should it
1199 * fail.
1200 */
1201 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) {
1202 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1203 ip_drop_input("tsol_get_pkt_label", mp, ill);
1204 freemsg(mp);
1205 return (0);
1206 }
1207 ASSERT(iras.ira_tsl != NULL);
1208
1209 /* tsol_get_pkt_label sometimes does pullupmsg */
1210 ipha = (ipha_t *)mp->b_rptr;
1211 }
1212
1213 ip_fanout_v4(mp, ipha, &iras);
1214
1215 /* We moved any IPsec refs from ixa to iras */
1216 ira_cleanup(&iras, B_FALSE);
1217 return (0);
1218 }
1219
1220 /*
1221 * ire_sendfn for IRE_BROADCAST
1222 * If the broadcast address is present on multiple ills and ixa_ifindex
1223 * isn't set, then we generate
1224 * a separate datagram (potentially with different source address) for
1225 * those ills. In any case, only one copy is looped back to ip_input_v4.
1226 */
1227 int
ire_send_broadcast_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1228 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1229 ip_xmit_attr_t *ixa, uint32_t *identp)
1230 {
1231 ipha_t *ipha = (ipha_t *)iph_arg;
1232 ip_stack_t *ipst = ixa->ixa_ipst;
1233 irb_t *irb = ire->ire_bucket;
1234 ire_t *ire1;
1235 mblk_t *mp1;
1236 ipha_t *ipha1;
1237 iaflags_t ixaflags = ixa->ixa_flags;
1238 nce_t *nce1, *nce_orig;
1239
1240 /*
1241 * Unless ire_send_multirt_v4 already set a ttl, force the
1242 * ttl to a smallish value.
1243 */
1244 if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
1245 /*
1246 * To avoid broadcast storms, we usually set the TTL to 1 for
1247 * broadcasts. This can
1248 * be overridden stack-wide through the ip_broadcast_ttl
1249 * ndd tunable, or on a per-connection basis through the
1250 * IP_BROADCAST_TTL socket option.
1251 *
1252 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
1253 * will force ttl to one after we've set this.
1254 */
1255 if (ixaflags & IXAF_BROADCAST_TTL_SET)
1256 ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
1257 else
1258 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
1259 }
1260 /*
1261 * Make sure we get a loopback copy (after IPsec and frag)
1262 * Skip hardware checksum so that loopback copy is checksumed.
1263 */
1264 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1265
1266 /* Do we need to potentially generate multiple copies? */
1267 if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
1268 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1269
1270 /*
1271 * Loop over all IRE_BROADCAST in the bucket (might only be one).
1272 * Note that everything in the bucket has the same destination address.
1273 */
1274 irb_refhold(irb);
1275 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1276 /* We do the main IRE after the end of the loop */
1277 if (ire1 == ire)
1278 continue;
1279
1280 /*
1281 * Only IREs for the same IP address should be in the same
1282 * bucket.
1283 * But could have IRE_HOSTs in the case of CGTP.
1284 * If we find any multirt routes we bail out of the loop
1285 * and just do the single packet at the end; ip_postfrag_multirt
1286 * will duplicate the packet.
1287 */
1288 ASSERT(ire1->ire_addr == ire->ire_addr);
1289 if (!(ire1->ire_type & IRE_BROADCAST))
1290 continue;
1291
1292 if (IRE_IS_CONDEMNED(ire1))
1293 continue;
1294
1295 if (ixa->ixa_zoneid != ALL_ZONES &&
1296 ire->ire_zoneid != ire1->ire_zoneid)
1297 continue;
1298
1299 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);
1300
1301 if (ire1->ire_flags & RTF_MULTIRT)
1302 break;
1303
1304 /*
1305 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
1306 * ensure that this goes out on the cast_ill.
1307 */
1308 if (IS_UNDER_IPMP(ire1->ire_ill))
1309 continue;
1310
1311 mp1 = copymsg(mp);
1312 if (mp1 == NULL) {
1313 BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1314 ipIfStatsOutDiscards);
1315 ip_drop_output("ipIfStatsOutDiscards",
1316 mp, ire1->ire_ill);
1317 continue;
1318 }
1319
1320 ipha1 = (ipha_t *)mp1->b_rptr;
1321 if (ixa->ixa_flags & IXAF_SET_SOURCE) {
1322 /*
1323 * Need to pick a different source address for each
1324 * interface. If we have a global IPsec policy and
1325 * no per-socket policy then we punt to
1326 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
1327 */
1328 if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
1329 ip_output_simple_broadcast(ixa, mp1);
1330 continue;
1331 }
1332 /* Pick a new source address for each interface */
1333 if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
1334 ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
1335 &ipha1->ipha_src, NULL, NULL) != 0) {
1336 BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1337 ipIfStatsOutDiscards);
1338 ip_drop_output("ipIfStatsOutDiscards - select "
1339 "broadcast source", mp1, ire1->ire_ill);
1340 freemsg(mp1);
1341 continue;
1342 }
1343 /*
1344 * Check against global IPsec policy to set the AH/ESP
1345 * attributes. IPsec will set IXAF_IPSEC_* and
1346 * ixa_ipsec_* as appropriate.
1347 */
1348 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1349 ASSERT(ixa->ixa_ipsec_policy == NULL);
1350 mp1 = ip_output_attach_policy(mp1, ipha, NULL,
1351 NULL, ixa);
1352 if (mp1 == NULL) {
1353 /*
1354 * MIB and ip_drop_packet already
1355 * done
1356 */
1357 continue;
1358 }
1359 }
1360 }
1361 /* Make sure we have an NCE on this ill */
1362 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
1363 ire1->ire_type);
1364 if (nce1 == NULL) {
1365 BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1366 ipIfStatsOutDiscards);
1367 ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
1368 mp1, ire1->ire_ill);
1369 freemsg(mp1);
1370 continue;
1371 }
1372 nce_orig = ixa->ixa_nce;
1373 ixa->ixa_nce = nce1;
1374
1375 ire_refhold(ire1);
1376 /*
1377 * Ignore any errors here. We just collect the errno for
1378 * the main ire below
1379 */
1380 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
1381 ire_refrele(ire1);
1382
1383 ixa->ixa_nce = nce_orig;
1384 nce_refrele(nce1);
1385
1386 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
1387 }
1388 irb_refrele(irb);
1389 /* Finally, the main one */
1390
1391 /*
1392 * For IPMP we only send broadcasts on the ipmp_ill.
1393 */
1394 if (IS_UNDER_IPMP(ire->ire_ill)) {
1395 freemsg(mp);
1396 return (0);
1397 }
1398
1399 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1400 }
1401
1402 /*
1403 * Send a packet using a different source address and different
1404 * IPsec policy.
1405 */
1406 static void
ip_output_simple_broadcast(ip_xmit_attr_t * ixa,mblk_t * mp)1407 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
1408 {
1409 ip_xmit_attr_t ixas;
1410
1411 bzero(&ixas, sizeof (ixas));
1412 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1413 ixas.ixa_zoneid = ixa->ixa_zoneid;
1414 ixas.ixa_ifindex = 0;
1415 ixas.ixa_ipst = ixa->ixa_ipst;
1416 ixas.ixa_cred = ixa->ixa_cred;
1417 ixas.ixa_cpid = ixa->ixa_cpid;
1418 ixas.ixa_tsl = ixa->ixa_tsl;
1419 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1420
1421 (void) ip_output_simple(mp, &ixas);
1422 ixa_cleanup(&ixas);
1423 }
1424
1425
1426 static void
multirt_check_v4(ire_t * ire,ipha_t * ipha,ip_xmit_attr_t * ixa)1427 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
1428 {
1429 ip_stack_t *ipst = ixa->ixa_ipst;
1430
1431 /* Limit the TTL on multirt packets */
1432 if (ire->ire_type & IRE_MULTICAST) {
1433 if (ipha->ipha_ttl > 1) {
1434 ip2dbg(("ire_send_multirt_v4: forcing multicast "
1435 "multirt TTL to 1 (was %d), dst 0x%08x\n",
1436 ipha->ipha_ttl, ntohl(ire->ire_addr)));
1437 ipha->ipha_ttl = 1;
1438 }
1439 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1440 } else if ((ipst->ips_ip_multirt_ttl > 0) &&
1441 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
1442 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
1443 /*
1444 * Need to ensure we don't increase the ttl should we go through
1445 * ire_send_broadcast or multicast.
1446 */
1447 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1448 }
1449 }
1450
1451 /*
1452 * ire_sendfn for IRE_MULTICAST
1453 */
1454 int
ire_send_multicast_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1455 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1456 ip_xmit_attr_t *ixa, uint32_t *identp)
1457 {
1458 ipha_t *ipha = (ipha_t *)iph_arg;
1459 ip_stack_t *ipst = ixa->ixa_ipst;
1460 ill_t *ill = ire->ire_ill;
1461 iaflags_t ixaflags = ixa->ixa_flags;
1462
1463 /*
1464 * The IRE_MULTICAST is the same whether or not multirt is in use.
1465 * Hence we need special-case code.
1466 */
1467 if (ixaflags & IXAF_MULTIRT_MULTICAST)
1468 multirt_check_v4(ire, ipha, ixa);
1469
1470 /*
1471 * Check if anything in ip_input_v4 wants a copy of the transmitted
1472 * packet (after IPsec and fragmentation)
1473 *
1474 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
1475 * RSVP and the rsvp daemon is an example of a
1476 * protocol and user level process that
1477 * handles it's own routing. Hence, it uses the
1478 * SO_DONTROUTE option to accomplish this.
1479 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
1480 * check whether there are any receivers for the group on the ill
1481 * (ignoring the zoneid).
1482 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
1483 * any members in other shared-IP zones.
1484 * If such members exist, then we indicate that the sending zone
1485 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
1486 * behavior.
1487 *
1488 * When we loopback we skip hardware checksum to make sure loopback
1489 * copy is checksumed.
1490 *
1491 * Note that ire_ill is the upper in the case of IPMP.
1492 */
1493 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
1494 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
1495 !(ixaflags & IXAF_DONTROUTE)) {
1496 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1497 } else if (ixaflags & IXAF_MULTICAST_LOOP) {
1498 /*
1499 * If this zone or any other zone has members then loopback
1500 * a copy.
1501 */
1502 if (ill_hasmembers_v4(ill, ipha->ipha_dst))
1503 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1504 } else if (ipst->ips_netstack->netstack_numzones > 1) {
1505 /*
1506 * This zone should not have a copy. But there are some other
1507 * zones which might have members.
1508 */
1509 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1510 ixa->ixa_zoneid)) {
1511 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
1512 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
1513 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1514 }
1515 }
1516
1517 /*
1518 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
1519 * force the ttl to the IP_MULTICAST_TTL value
1520 */
1521 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
1522 ipha->ipha_ttl = ixa->ixa_multicast_ttl;
1523 }
1524
1525 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1526 }
1527
1528 /*
1529 * ire_sendfn for IREs with RTF_MULTIRT
1530 */
1531 int
ire_send_multirt_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1532 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1533 ip_xmit_attr_t *ixa, uint32_t *identp)
1534 {
1535 ipha_t *ipha = (ipha_t *)iph_arg;
1536
1537 multirt_check_v4(ire, ipha, ixa);
1538
1539 if (ire->ire_type & IRE_MULTICAST)
1540 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
1541 else if (ire->ire_type & IRE_BROADCAST)
1542 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
1543 else
1544 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1545 }
1546
1547 /*
1548 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
1549 */
1550 int
ire_send_noroute_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1551 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1552 ip_xmit_attr_t *ixa, uint32_t *identp)
1553 {
1554 ip_stack_t *ipst = ixa->ixa_ipst;
1555 ipha_t *ipha = (ipha_t *)iph_arg;
1556 ill_t *ill;
1557 ip_recv_attr_t iras;
1558 boolean_t dummy;
1559
1560 /* We assign an IP ident for nice errors */
1561 ipha->ipha_ident = atomic_inc_32_nv(identp);
1562
1563 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
1564
1565 if (ire->ire_type & IRE_NOROUTE) {
1566 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
1567 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
1568 RTA_DST, ipst);
1569 }
1570
1571 if (ire->ire_flags & RTF_BLACKHOLE) {
1572 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
1573 freemsg(mp);
1574 /* No error even for local senders - silent blackhole */
1575 return (0);
1576 }
1577 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
1578
1579 /*
1580 * We need an ill_t for the ip_recv_attr_t even though this packet
1581 * was never received and icmp_unreachable doesn't currently use
1582 * ira_ill.
1583 */
1584 ill = ill_lookup_on_name("lo0", B_FALSE,
1585 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
1586 if (ill == NULL) {
1587 freemsg(mp);
1588 return (EHOSTUNREACH);
1589 }
1590
1591 bzero(&iras, sizeof (iras));
1592 /* Map ixa to ira including IPsec policies */
1593 ipsec_out_to_in(ixa, ill, &iras);
1594
1595 if (ip_source_routed(ipha, ipst)) {
1596 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
1597 } else {
1598 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
1599 }
1600 /* We moved any IPsec refs from ixa to iras */
1601 ira_cleanup(&iras, B_FALSE);
1602 ill_refrele(ill);
1603 return (EHOSTUNREACH);
1604 }
1605
1606 /*
1607 * Calculate a checksum ignoring any hardware capabilities
1608 *
1609 * Returns B_FALSE if the packet was too short for the checksum. Caller
1610 * should free and do stats.
1611 */
1612 static boolean_t
ip_output_sw_cksum_v4(mblk_t * mp,ipha_t * ipha,ip_xmit_attr_t * ixa)1613 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
1614 {
1615 ip_stack_t *ipst = ixa->ixa_ipst;
1616 uint_t pktlen = ixa->ixa_pktlen;
1617 uint16_t *cksump;
1618 uint32_t cksum;
1619 uint8_t protocol = ixa->ixa_protocol;
1620 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
1621 ipaddr_t dst = ipha->ipha_dst;
1622 ipaddr_t src = ipha->ipha_src;
1623
1624 /* Just in case it contained garbage */
1625 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1626
1627 /*
1628 * Calculate ULP checksum
1629 */
1630 if (protocol == IPPROTO_TCP) {
1631 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1632 cksum = IP_TCP_CSUM_COMP;
1633 } else if (protocol == IPPROTO_UDP) {
1634 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1635 cksum = IP_UDP_CSUM_COMP;
1636 } else if (protocol == IPPROTO_SCTP) {
1637 sctp_hdr_t *sctph;
1638
1639 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1640 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1641 /*
1642 * Zero out the checksum field to ensure proper
1643 * checksum calculation.
1644 */
1645 sctph->sh_chksum = 0;
1646 #ifdef DEBUG
1647 if (!skip_sctp_cksum)
1648 #endif
1649 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1650 goto ip_hdr_cksum;
1651 } else {
1652 goto ip_hdr_cksum;
1653 }
1654
1655 /* ULP puts the checksum field is in the first mblk */
1656 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1657
1658 /*
1659 * We accumulate the pseudo header checksum in cksum.
1660 * This is pretty hairy code, so watch close. One
1661 * thing to keep in mind is that UDP and TCP have
1662 * stored their respective datagram lengths in their
1663 * checksum fields. This lines things up real nice.
1664 */
1665 cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
1666
1667 cksum = IP_CSUM(mp, ip_hdr_length, cksum);
1668 /*
1669 * For UDP/IPv4 a zero means that the packets wasn't checksummed.
1670 * Change to 0xffff
1671 */
1672 if (protocol == IPPROTO_UDP && cksum == 0)
1673 *cksump = ~cksum;
1674 else
1675 *cksump = cksum;
1676
1677 IP_STAT(ipst, ip_out_sw_cksum);
1678 IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);
1679
1680 ip_hdr_cksum:
1681 /* Calculate IPv4 header checksum */
1682 ipha->ipha_hdr_checksum = 0;
1683 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1684 return (B_TRUE);
1685 }
1686
1687 /*
1688 * Calculate the ULP checksum - try to use hardware.
1689 * In the case of MULTIRT, broadcast or multicast the
1690 * IXAF_NO_HW_CKSUM is set in which case we use software.
1691 *
1692 * If the hardware supports IP header checksum offload; then clear the
1693 * contents of IP header checksum field as expected by NIC.
1694 * Do this only if we offloaded either full or partial sum.
1695 *
1696 * Returns B_FALSE if the packet was too short for the checksum. Caller
1697 * should free and do stats.
1698 */
1699 static boolean_t
ip_output_cksum_v4(iaflags_t ixaflags,mblk_t * mp,ipha_t * ipha,ip_xmit_attr_t * ixa,ill_t * ill)1700 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
1701 ip_xmit_attr_t *ixa, ill_t *ill)
1702 {
1703 uint_t pktlen = ixa->ixa_pktlen;
1704 uint16_t *cksump;
1705 uint16_t hck_flags;
1706 uint32_t cksum;
1707 uint8_t protocol = ixa->ixa_protocol;
1708 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
1709
1710 if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
1711 !dohwcksum) {
1712 return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1713 }
1714
1715 /*
1716 * Calculate ULP checksum. Note that we don't use cksump and cksum
1717 * if the ill has FULL support.
1718 */
1719 if (protocol == IPPROTO_TCP) {
1720 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1721 cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */
1722 } else if (protocol == IPPROTO_UDP) {
1723 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1724 cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */
1725 } else if (protocol == IPPROTO_SCTP) {
1726 sctp_hdr_t *sctph;
1727
1728 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1729 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1730 /*
1731 * Zero out the checksum field to ensure proper
1732 * checksum calculation.
1733 */
1734 sctph->sh_chksum = 0;
1735 #ifdef DEBUG
1736 if (!skip_sctp_cksum)
1737 #endif
1738 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1739 goto ip_hdr_cksum;
1740 } else {
1741 ip_hdr_cksum:
1742 /* Calculate IPv4 header checksum */
1743 ipha->ipha_hdr_checksum = 0;
1744 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1745 return (B_TRUE);
1746 }
1747
1748 /* ULP puts the checksum field is in the first mblk */
1749 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1750
1751 /*
1752 * Underlying interface supports hardware checksum offload for
1753 * the payload; leave the payload checksum for the hardware to
1754 * calculate. N.B: We only need to set up checksum info on the
1755 * first mblk.
1756 */
1757 hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
1758
1759 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1760 if (hck_flags & HCKSUM_INET_FULL_V4) {
1761 /*
1762 * Hardware calculates pseudo-header, header and the
1763 * payload checksums, so clear the checksum field in
1764 * the protocol header.
1765 */
1766 *cksump = 0;
1767 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
1768
1769 ipha->ipha_hdr_checksum = 0;
1770 if (hck_flags & HCKSUM_IPHDRCKSUM) {
1771 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1772 } else {
1773 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1774 }
1775 return (B_TRUE);
1776 }
1777 if ((hck_flags) & HCKSUM_INET_PARTIAL) {
1778 ipaddr_t dst = ipha->ipha_dst;
1779 ipaddr_t src = ipha->ipha_src;
1780 /*
1781 * Partial checksum offload has been enabled. Fill
1782 * the checksum field in the protocol header with the
1783 * pseudo-header checksum value.
1784 *
1785 * We accumulate the pseudo header checksum in cksum.
1786 * This is pretty hairy code, so watch close. One
1787 * thing to keep in mind is that UDP and TCP have
1788 * stored their respective datagram lengths in their
1789 * checksum fields. This lines things up real nice.
1790 */
1791 cksum += (dst >> 16) + (dst & 0xFFFF) +
1792 (src >> 16) + (src & 0xFFFF);
1793 cksum += *(cksump);
1794 cksum = (cksum & 0xFFFF) + (cksum >> 16);
1795 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
1796
1797 /*
1798 * Offsets are relative to beginning of IP header.
1799 */
1800 DB_CKSUMSTART(mp) = ip_hdr_length;
1801 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
1802 DB_CKSUMEND(mp) = pktlen;
1803 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
1804
1805 ipha->ipha_hdr_checksum = 0;
1806 if (hck_flags & HCKSUM_IPHDRCKSUM) {
1807 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1808 } else {
1809 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1810 }
1811 return (B_TRUE);
1812 }
1813 /* Hardware capabilities include neither full nor partial IPv4 */
1814 return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1815 }
1816
1817 /*
1818 * ire_sendfn for offlink and onlink destinations.
1819 * Also called from the multicast, broadcast, multirt send functions.
1820 *
1821 * Assumes that the caller has a hold on the ire.
1822 *
1823 * This function doesn't care if the IRE just became condemned since that
1824 * can happen at any time.
1825 */
1826 /* ARGSUSED */
1827 int
ire_send_wire_v4(ire_t * ire,mblk_t * mp,void * iph_arg,ip_xmit_attr_t * ixa,uint32_t * identp)1828 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1829 ip_xmit_attr_t *ixa, uint32_t *identp)
1830 {
1831 ip_stack_t *ipst = ixa->ixa_ipst;
1832 ipha_t *ipha = (ipha_t *)iph_arg;
1833 iaflags_t ixaflags = ixa->ixa_flags;
1834 ill_t *ill;
1835
1836 ASSERT(ixa->ixa_nce != NULL);
1837 ill = ixa->ixa_nce->nce_ill;
1838
1839 if (ixaflags & IXAF_DONTROUTE)
1840 ipha->ipha_ttl = 1;
1841
1842 /*
1843 * Assign an ident value for this packet. There could be other
1844 * threads targeting the same destination, so we have to arrange
1845 * for a atomic increment. Note that we use a 32-bit atomic add
1846 * because it has better performance than its 16-bit sibling.
1847 *
1848 * Normally ixa_extra_ident is 0, but in the case of LSO it will
1849 * be the number of TCP segments that the driver/hardware will
1850 * extraly construct.
1851 *
1852 * If running in cluster mode and if the source address
1853 * belongs to a replicated service then vector through
1854 * cl_inet_ipident vector to allocate ip identifier
1855 * NOTE: This is a contract private interface with the
1856 * clustering group.
1857 */
1858 if (cl_inet_ipident != NULL) {
1859 ipaddr_t src = ipha->ipha_src;
1860 ipaddr_t dst = ipha->ipha_dst;
1861 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
1862
1863 ASSERT(cl_inet_isclusterwide != NULL);
1864 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
1865 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
1866 /*
1867 * Note: not correct with LSO since we can't allocate
1868 * ixa_extra_ident+1 consecutive values.
1869 */
1870 ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
1871 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
1872 (uint8_t *)(uintptr_t)dst, NULL);
1873 } else {
1874 ipha->ipha_ident = atomic_add_32_nv(identp,
1875 ixa->ixa_extra_ident + 1);
1876 }
1877 } else {
1878 ipha->ipha_ident = atomic_add_32_nv(identp,
1879 ixa->ixa_extra_ident + 1);
1880 }
1881 #ifndef _BIG_ENDIAN
1882 ipha->ipha_ident = htons(ipha->ipha_ident);
1883 #endif
1884
1885 /*
1886 * This might set b_band, thus the IPsec and fragmentation
1887 * code in IP ensures that b_band is updated in the first mblk.
1888 */
1889 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
1890 /* ip_process translates an IS_UNDER_IPMP */
1891 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
1892 if (mp == NULL) {
1893 /* ip_drop_packet and MIB done */
1894 return (0); /* Might just be delayed */
1895 }
1896 }
1897
1898 /*
1899 * Verify any IPv4 options.
1900 *
1901 * The presense of IP options also forces the network stack to
1902 * calculate the checksum in software. This is because:
1903 *
1904 * Wrap around: certain partial-checksum NICs (eri, ce) limit
1905 * the size of "start offset" width to 6-bit. This effectively
1906 * sets the largest value of the offset to 64-bytes, starting
1907 * from the MAC header. When the cumulative MAC and IP headers
1908 * exceed such limit, the offset will wrap around. This causes
1909 * the checksum to be calculated at the wrong place.
1910 *
1911 * IPv4 source routing: none of the full-checksum capable NICs
1912 * is capable of correctly handling the IPv4 source-routing
1913 * option for purposes of calculating the pseudo-header; the
1914 * actual destination is different from the destination in the
1915 * header which is that of the next-hop. (This case may not be
1916 * true for NICs which can parse IPv6 extension headers, but
1917 * we choose to simplify the implementation by not offloading
1918 * checksum when they are present.)
1919 */
1920 if (!IS_SIMPLE_IPH(ipha)) {
1921 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
1922 /* An IS_UNDER_IPMP ill is ok here */
1923 if (ip_output_options(mp, ipha, ixa, ill)) {
1924 /* Packet has been consumed and ICMP error sent */
1925 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1926 return (EINVAL);
1927 }
1928 }
1929
1930 /*
1931 * To handle IPsec/iptun's labeling needs we need to tag packets
1932 * while we still have ixa_tsl
1933 */
1934 if (is_system_labeled() && ixa->ixa_tsl != NULL &&
1935 (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
1936 ill->ill_mactype == DL_IPV6)) {
1937 cred_t *newcr;
1938
1939 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
1940 KM_NOSLEEP);
1941 if (newcr == NULL) {
1942 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1943 ip_drop_output("ipIfStatsOutDiscards - newcr",
1944 mp, ill);
1945 freemsg(mp);
1946 return (ENOBUFS);
1947 }
1948 mblk_setcred(mp, newcr, NOPID);
1949 crfree(newcr); /* mblk_setcred did its own crhold */
1950 }
1951
1952 if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
1953 (ixaflags & IXAF_IPSEC_SECURE)) {
1954 uint32_t pktlen;
1955
1956 pktlen = ixa->ixa_pktlen;
1957 if (ixaflags & IXAF_IPSEC_SECURE)
1958 pktlen += ipsec_out_extra_length(ixa);
1959
1960 if (pktlen > IP_MAXPACKET)
1961 return (EMSGSIZE);
1962
1963 if (ixaflags & IXAF_SET_ULP_CKSUM) {
1964 /*
1965 * Compute ULP checksum and IP header checksum
1966 * using software
1967 */
1968 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1969 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1970 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1971 freemsg(mp);
1972 return (EINVAL);
1973 }
1974 } else {
1975 /* Calculate IPv4 header checksum */
1976 ipha->ipha_hdr_checksum = 0;
1977 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1978 }
1979
1980 /*
1981 * If this packet would generate a icmp_frag_needed
1982 * message, we need to handle it before we do the IPsec
1983 * processing. Otherwise, we need to strip the IPsec
1984 * headers before we send up the message to the ULPs
1985 * which becomes messy and difficult.
1986 *
1987 * We check using IXAF_DONTFRAG. The DF bit in the header
1988 * is not inspected - it will be copied to any generated
1989 * fragments.
1990 */
1991 if ((pktlen > ixa->ixa_fragsize) &&
1992 (ixaflags & IXAF_DONTFRAG)) {
1993 /* Generate ICMP and return error */
1994 ip_recv_attr_t iras;
1995
1996 DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
1997 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
1998 uint_t, ixa->ixa_pmtu);
1999
2000 bzero(&iras, sizeof (iras));
2001 /* Map ixa to ira including IPsec policies */
2002 ipsec_out_to_in(ixa, ill, &iras);
2003
2004 ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
2005 icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
2006 /* We moved any IPsec refs from ixa to iras */
2007 ira_cleanup(&iras, B_FALSE);
2008 return (EMSGSIZE);
2009 }
2010 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
2011 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2012 uint_t, ixa->ixa_pmtu);
2013
2014 if (ixaflags & IXAF_IPSEC_SECURE) {
2015 /*
2016 * Pass in sufficient information so that
2017 * IPsec can determine whether to fragment, and
2018 * which function to call after fragmentation.
2019 */
2020 return (ipsec_out_process(mp, ixa));
2021 }
2022 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
2023 ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
2024 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
2025 ixa->ixa_postfragfn, &ixa->ixa_cookie));
2026 }
2027 if (ixaflags & IXAF_SET_ULP_CKSUM) {
2028 /* Compute ULP checksum and IP header checksum */
2029 /* An IS_UNDER_IPMP ill is ok here */
2030 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
2031 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2032 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2033 freemsg(mp);
2034 return (EINVAL);
2035 }
2036 } else {
2037 /* Calculate IPv4 header checksum */
2038 ipha->ipha_hdr_checksum = 0;
2039 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2040 }
2041 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
2042 ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
2043 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
2044 }
2045
2046 /*
2047 * Send mp into ip_input
2048 * Common for IPv4 and IPv6
2049 */
2050 void
ip_postfrag_loopback(mblk_t * mp,nce_t * nce,iaflags_t ixaflags,uint_t pkt_len,zoneid_t nolzid)2051 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2052 uint_t pkt_len, zoneid_t nolzid)
2053 {
2054 rtc_t rtc;
2055 ill_t *ill = nce->nce_ill;
2056 ip_recv_attr_t iras; /* NOTE: No bzero for performance */
2057 ncec_t *ncec;
2058
2059 ncec = nce->nce_common;
2060 iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
2061 IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
2062 if (ncec->ncec_flags & NCE_F_BCAST)
2063 iras.ira_flags |= IRAF_L2DST_BROADCAST;
2064 else if (ncec->ncec_flags & NCE_F_MCAST)
2065 iras.ira_flags |= IRAF_L2DST_MULTICAST;
2066
2067 iras.ira_free_flags = 0;
2068 iras.ira_cred = NULL;
2069 iras.ira_cpid = NOPID;
2070 iras.ira_tsl = NULL;
2071 iras.ira_zoneid = ALL_ZONES;
2072 iras.ira_pktlen = pkt_len;
2073 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
2074 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
2075
2076 if (ixaflags & IXAF_IS_IPV4)
2077 iras.ira_flags |= IRAF_IS_IPV4;
2078
2079 iras.ira_ill = iras.ira_rill = ill;
2080 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2081 iras.ira_rifindex = iras.ira_ruifindex;
2082 iras.ira_mhip = NULL;
2083
2084 iras.ira_flags |= ixaflags & IAF_MASK;
2085 iras.ira_no_loop_zoneid = nolzid;
2086
2087 /* Broadcast and multicast doesn't care about the squeue */
2088 iras.ira_sqp = NULL;
2089
2090 rtc.rtc_ire = NULL;
2091 if (ixaflags & IXAF_IS_IPV4) {
2092 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2093
2094 rtc.rtc_ipaddr = INADDR_ANY;
2095
2096 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
2097 if (rtc.rtc_ire != NULL) {
2098 ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
2099 ire_refrele(rtc.rtc_ire);
2100 }
2101 } else {
2102 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2103
2104 rtc.rtc_ip6addr = ipv6_all_zeros;
2105
2106 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
2107 if (rtc.rtc_ire != NULL) {
2108 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
2109 ire_refrele(rtc.rtc_ire);
2110 }
2111 }
2112 /* Any references to clean up? No hold on ira */
2113 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
2114 ira_cleanup(&iras, B_FALSE);
2115 }
2116
2117 /*
2118 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
2119 * looks at the IXAF_LOOPBACK_COPY flag.
2120 * Common for IPv4 and IPv6.
2121 *
2122 * If the loopback copy fails (due to no memory) but we send the packet out
2123 * on the wire we return no failure. Only in the case we supress the wire
2124 * sending do we take the loopback failure into account.
2125 *
2126 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
2127 * Those operations are performed on this packet in ip_xmit() and it would
2128 * be odd to do it twice for the same packet.
2129 */
2130 int
ip_postfrag_loopcheck(mblk_t * mp,nce_t * nce,iaflags_t ixaflags,uint_t pkt_len,uint32_t xmit_hint,zoneid_t szone,zoneid_t nolzid,uintptr_t * ixacookie)2131 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2132 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2133 uintptr_t *ixacookie)
2134 {
2135 ill_t *ill = nce->nce_ill;
2136 int error = 0;
2137
2138 /*
2139 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
2140 * had looped it back
2141 */
2142 if (ixaflags & IXAF_LOOPBACK_COPY) {
2143 mblk_t *mp1;
2144
2145 mp1 = copymsg(mp);
2146 if (mp1 == NULL) {
2147 /* Failed to deliver the loopback copy. */
2148 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2149 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2150 error = ENOBUFS;
2151 } else {
2152 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2153 nolzid);
2154 }
2155 }
2156
2157 /*
2158 * If TTL = 0 then only do the loopback to this host i.e. we are
2159 * done. We are also done if this was the
2160 * loopback interface since it is sufficient
2161 * to loopback one copy of a multicast packet.
2162 */
2163 if (ixaflags & IXAF_IS_IPV4) {
2164 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2165
2166 if (ipha->ipha_ttl == 0) {
2167 ip_drop_output("multicast ipha_ttl not sent to wire",
2168 mp, ill);
2169 freemsg(mp);
2170 return (error);
2171 }
2172 } else {
2173 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2174
2175 if (ip6h->ip6_hops == 0) {
2176 ip_drop_output("multicast ipha_ttl not sent to wire",
2177 mp, ill);
2178 freemsg(mp);
2179 return (error);
2180 }
2181 }
2182 if (nce->nce_ill->ill_wq == NULL) {
2183 /* Loopback interface */
2184 ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
2185 freemsg(mp);
2186 return (error);
2187 }
2188
2189 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2190 ixacookie));
2191 }
2192
2193 /*
2194 * Post fragmentation function for RTF_MULTIRT routes.
2195 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function
2196 * checks IXAF_LOOPBACK_COPY.
2197 *
2198 * If no packet is sent due to failures then we return an errno, but if at
2199 * least one succeeded we return zero.
2200 */
2201 int
ip_postfrag_multirt_v4(mblk_t * mp,nce_t * nce,iaflags_t ixaflags,uint_t pkt_len,uint32_t xmit_hint,zoneid_t szone,zoneid_t nolzid,uintptr_t * ixacookie)2202 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2203 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2204 uintptr_t *ixacookie)
2205 {
2206 irb_t *irb;
2207 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2208 ire_t *ire;
2209 ire_t *ire1;
2210 mblk_t *mp1;
2211 nce_t *nce1;
2212 ill_t *ill = nce->nce_ill;
2213 ill_t *ill1;
2214 ip_stack_t *ipst = ill->ill_ipst;
2215 int error = 0;
2216 int num_sent = 0;
2217 int err;
2218 uint_t ire_type;
2219 ipaddr_t nexthop;
2220
2221 ASSERT(ixaflags & IXAF_IS_IPV4);
2222
2223 /* Check for IXAF_LOOPBACK_COPY */
2224 if (ixaflags & IXAF_LOOPBACK_COPY) {
2225 mblk_t *mp1;
2226
2227 mp1 = copymsg(mp);
2228 if (mp1 == NULL) {
2229 /* Failed to deliver the loopback copy. */
2230 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2231 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2232 error = ENOBUFS;
2233 } else {
2234 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2235 nolzid);
2236 }
2237 }
2238
2239 /*
2240 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
2241 * a copy to each one.
2242 * Use the nce (nexthop) and ipha_dst to find the ire.
2243 *
2244 * MULTIRT is not designed to work with shared-IP zones thus we don't
2245 * need to pass a zoneid or a label to the IRE lookup.
2246 */
2247 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
2248 /* Broadcast and multicast case */
2249 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
2250 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2251 } else {
2252 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);
2253
2254 /* Unicast case */
2255 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
2256 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
2257 }
2258
2259 if (ire == NULL ||
2260 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2261 !(ire->ire_flags & RTF_MULTIRT)) {
2262 /* Drop */
2263 ip_drop_output("ip_postfrag_multirt didn't find route",
2264 mp, nce->nce_ill);
2265 if (ire != NULL)
2266 ire_refrele(ire);
2267 return (ENETUNREACH);
2268 }
2269
2270 irb = ire->ire_bucket;
2271 irb_refhold(irb);
2272 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2273 /*
2274 * For broadcast we can have a mixture of IRE_BROADCAST and
2275 * IRE_HOST due to the manually added IRE_HOSTs that are used
2276 * to trigger the creation of the special CGTP broadcast routes.
2277 * Thus we have to skip if ire_type doesn't match the original.
2278 */
2279 if (IRE_IS_CONDEMNED(ire1) ||
2280 !(ire1->ire_flags & RTF_MULTIRT) ||
2281 ire1->ire_type != ire->ire_type)
2282 continue;
2283
2284 /* Do the ire argument one after the loop */
2285 if (ire1 == ire)
2286 continue;
2287
2288 ill1 = ire_nexthop_ill(ire1);
2289 if (ill1 == NULL) {
2290 /*
2291 * This ire might not have been picked by
2292 * ire_route_recursive, in which case ire_dep might
2293 * not have been setup yet.
2294 * We kick ire_route_recursive to try to resolve
2295 * starting at ire1.
2296 */
2297 ire_t *ire2;
2298 uint_t match_flags = MATCH_IRE_DSTONLY;
2299
2300 if (ire1->ire_ill != NULL)
2301 match_flags |= MATCH_IRE_ILL;
2302 ire2 = ire_route_recursive_impl_v4(ire1,
2303 ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
2304 ire1->ire_zoneid, NULL, match_flags,
2305 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2306 if (ire2 != NULL)
2307 ire_refrele(ire2);
2308 ill1 = ire_nexthop_ill(ire1);
2309 }
2310
2311 if (ill1 == NULL) {
2312 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2313 ip_drop_output("ipIfStatsOutDiscards - no ill",
2314 mp, ill);
2315 error = ENETUNREACH;
2316 continue;
2317 }
2318
2319 /* Pick the addr and type to use for arp_nce_init */
2320 if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
2321 ire_type = IRE_BROADCAST;
2322 nexthop = ire1->ire_gateway_addr;
2323 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
2324 ire_type = IRE_MULTICAST;
2325 nexthop = ipha->ipha_dst;
2326 } else {
2327 ire_type = ire1->ire_type; /* Doesn't matter */
2328 nexthop = ire1->ire_gateway_addr;
2329 }
2330
2331 /* If IPMP meta or under, then we just drop */
2332 if (ill1->ill_grp != NULL) {
2333 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2334 ip_drop_output("ipIfStatsOutDiscards - IPMP",
2335 mp, ill1);
2336 ill_refrele(ill1);
2337 error = ENETUNREACH;
2338 continue;
2339 }
2340
2341 nce1 = arp_nce_init(ill1, nexthop, ire_type);
2342 if (nce1 == NULL) {
2343 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2344 ip_drop_output("ipIfStatsOutDiscards - no nce",
2345 mp, ill1);
2346 ill_refrele(ill1);
2347 error = ENETUNREACH;
2348 continue;
2349 }
2350 mp1 = copymsg(mp);
2351 if (mp1 == NULL) {
2352 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2353 ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
2354 nce_refrele(nce1);
2355 ill_refrele(ill1);
2356 error = ENOBUFS;
2357 continue;
2358 }
2359 /* Preserve HW checksum for this copy */
2360 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
2361 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
2362 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
2363 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
2364 DB_LSOMSS(mp1) = DB_LSOMSS(mp);
2365
2366 ire1->ire_ob_pkt_count++;
2367 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
2368 0, ixacookie);
2369 if (err == 0)
2370 num_sent++;
2371 else
2372 error = err;
2373 nce_refrele(nce1);
2374 ill_refrele(ill1);
2375 }
2376 irb_refrele(irb);
2377 ire_refrele(ire);
2378 /* Finally, the main one */
2379 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2380 ixacookie);
2381 if (err == 0)
2382 num_sent++;
2383 else
2384 error = err;
2385 if (num_sent > 0)
2386 return (0);
2387 else
2388 return (error);
2389 }
2390
2391 /*
2392 * Verify local connectivity. This check is called by ULP fusion code.
2393 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
2394 * the interface is brought down and back up. So we simply fail the local
2395 * process. The caller, TCP Fusion, should unfuse the connection.
2396 */
2397 boolean_t
ip_output_verify_local(ip_xmit_attr_t * ixa)2398 ip_output_verify_local(ip_xmit_attr_t *ixa)
2399 {
2400 ire_t *ire = ixa->ixa_ire;
2401
2402 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
2403 return (B_FALSE);
2404
2405 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
2406 }
2407
2408 /*
2409 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
2410 *
2411 * The caller must call ip_output_verify_local() first. This function handles
2412 * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
2413 */
2414 mblk_t *
ip_output_process_local(mblk_t * mp,ip_xmit_attr_t * ixa,boolean_t hooks_out,boolean_t hooks_in,conn_t * peer_connp)2415 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
2416 boolean_t hooks_in, conn_t *peer_connp)
2417 {
2418 ill_t *ill = ixa->ixa_ire->ire_ill;
2419 ipha_t *ipha = NULL;
2420 ip6_t *ip6h = NULL;
2421 ip_stack_t *ipst = ixa->ixa_ipst;
2422 iaflags_t ixaflags = ixa->ixa_flags;
2423 ip_recv_attr_t iras;
2424 int error;
2425
2426 ASSERT(mp != NULL);
2427
2428 if (ixaflags & IXAF_IS_IPV4) {
2429 ipha = (ipha_t *)mp->b_rptr;
2430
2431 /*
2432 * If a callback is enabled then we need to know the
2433 * source and destination zoneids for the packet. We already
2434 * have those handy.
2435 */
2436 if (ipst->ips_ip4_observe.he_interested) {
2437 zoneid_t szone, dzone;
2438 zoneid_t stackzoneid;
2439
2440 stackzoneid = netstackid_to_zoneid(
2441 ipst->ips_netstack->netstack_stackid);
2442
2443 if (stackzoneid == GLOBAL_ZONEID) {
2444 /* Shared-IP zone */
2445 dzone = ixa->ixa_ire->ire_zoneid;
2446 szone = ixa->ixa_zoneid;
2447 } else {
2448 szone = dzone = stackzoneid;
2449 }
2450 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2451 ipst);
2452 }
2453 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2454 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2455 NULL, int, 1);
2456
2457 /* FW_HOOKS: LOOPBACK_OUT */
2458 if (hooks_out) {
2459 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
2460 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
2461 FW_HOOKS(ipst->ips_ip4_loopback_out_event,
2462 ipst->ips_ipv4firewall_loopback_out,
2463 NULL, ill, ipha, mp, mp, 0, ipst, error);
2464 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
2465 }
2466 if (mp == NULL)
2467 return (NULL);
2468
2469 /* FW_HOOKS: LOOPBACK_IN */
2470 if (hooks_in) {
2471 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
2472 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
2473 FW_HOOKS(ipst->ips_ip4_loopback_in_event,
2474 ipst->ips_ipv4firewall_loopback_in,
2475 ill, NULL, ipha, mp, mp, 0, ipst, error);
2476 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
2477 }
2478 if (mp == NULL)
2479 return (NULL);
2480
2481 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2482 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2483 NULL, int, 1);
2484
2485 /* Inbound IPsec polocies */
2486 if (peer_connp != NULL) {
2487 /* Map ixa to ira including IPsec policies. */
2488 ipsec_out_to_in(ixa, ill, &iras);
2489 mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
2490 NULL, &iras);
2491 }
2492 } else {
2493 ip6h = (ip6_t *)mp->b_rptr;
2494
2495 /*
2496 * If a callback is enabled then we need to know the
2497 * source and destination zoneids for the packet. We already
2498 * have those handy.
2499 */
2500 if (ipst->ips_ip6_observe.he_interested) {
2501 zoneid_t szone, dzone;
2502 zoneid_t stackzoneid;
2503
2504 stackzoneid = netstackid_to_zoneid(
2505 ipst->ips_netstack->netstack_stackid);
2506
2507 if (stackzoneid == GLOBAL_ZONEID) {
2508 /* Shared-IP zone */
2509 dzone = ixa->ixa_ire->ire_zoneid;
2510 szone = ixa->ixa_zoneid;
2511 } else {
2512 szone = dzone = stackzoneid;
2513 }
2514 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2515 ipst);
2516 }
2517 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2518 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2519 ip6h, int, 1);
2520
2521 /* FW_HOOKS: LOOPBACK_OUT */
2522 if (hooks_out) {
2523 DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
2524 ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
2525 FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
2526 ipst->ips_ipv6firewall_loopback_out,
2527 NULL, ill, ip6h, mp, mp, 0, ipst, error);
2528 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
2529 }
2530 if (mp == NULL)
2531 return (NULL);
2532
2533 /* FW_HOOKS: LOOPBACK_IN */
2534 if (hooks_in) {
2535 DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
2536 ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
2537 FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
2538 ipst->ips_ipv6firewall_loopback_in,
2539 ill, NULL, ip6h, mp, mp, 0, ipst, error);
2540 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
2541 }
2542 if (mp == NULL)
2543 return (NULL);
2544
2545 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2546 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2547 ip6h, int, 1);
2548
2549 /* Inbound IPsec polocies */
2550 if (peer_connp != NULL) {
2551 /* Map ixa to ira including IPsec policies. */
2552 ipsec_out_to_in(ixa, ill, &iras);
2553 mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
2554 ip6h, &iras);
2555 }
2556 }
2557
2558 if (mp == NULL) {
2559 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2560 ip_drop_input("ipIfStatsInDiscards", NULL, ill);
2561 }
2562
2563 return (mp);
2564 }
2565