1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. 
*/ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsubr.h> 31 #include <sys/dlpi.h> 32 #include <sys/strsun.h> 33 #include <sys/zone.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/debug.h> 38 #include <sys/atomic.h> 39 40 #include <sys/systm.h> 41 #include <sys/param.h> 42 #include <sys/kmem.h> 43 #include <sys/sdt.h> 44 #include <sys/socket.h> 45 #include <sys/mac.h> 46 #include <net/if.h> 47 #include <net/if_arp.h> 48 #include <net/route.h> 49 #include <sys/sockio.h> 50 #include <netinet/in.h> 51 #include <net/if_dl.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/nd.h> 57 #include <inet/arp.h> 58 #include <inet/snmpcom.h> 59 #include <inet/kstatcom.h> 60 61 #include <netinet/igmp_var.h> 62 #include <netinet/ip6.h> 63 #include <netinet/icmp6.h> 64 #include <netinet/sctp.h> 65 66 #include <inet/ip.h> 67 #include <inet/ip_impl.h> 68 #include <inet/ip6.h> 69 #include <inet/ip6_asp.h> 70 #include <inet/tcp.h> 71 #include <inet/ip_multi.h> 72 #include <inet/ip_if.h> 73 #include <inet/ip_ire.h> 74 #include <inet/ip_ftable.h> 75 #include <inet/ip_rts.h> 76 #include <inet/optcom.h> 77 #include <inet/ip_ndp.h> 78 #include <inet/ip_listutils.h> 79 #include <netinet/igmp.h> 80 #include <netinet/ip_mroute.h> 81 #include <inet/ipp_common.h> 82 83 #include <net/pfkeyv2.h> 84 #include <inet/sadb.h> 85 #include <inet/ipsec_impl.h> 86 #include <inet/ipdrop.h> 87 #include <inet/ip_netinfo.h> 88 89 #include <sys/pattr.h> 90 #include <inet/ipclassifier.h> 91 #include <inet/sctp_ip.h> 92 #include <inet/sctp/sctp_impl.h> 93 #include <inet/udp_impl.h> 94 #include <sys/sunddi.h> 95 96 #include <sys/tsol/label.h> 97 #include <sys/tsol/tnet.h> 98 99 #ifdef DEBUG 100 extern boolean_t skip_sctp_cksum; 101 #endif 102 103 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 104 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 105 static boolean_t 
ip_verify_lso(ill_t *, ip_xmit_attr_t *); 106 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 107 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 108 109 /* 110 * There are two types of output functions for IP used for different 111 * purposes: 112 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 113 * is no context in the form of a conn_t. However, there is a 114 * ip_xmit_attr_t that the callers use to influence interface selection 115 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 116 * 117 * - conn_ip_output() is used when sending packets with a conn_t and 118 * ip_set_destination has been called to cache information. In that case 119 * various socket options are recorded in the ip_xmit_attr_t and should 120 * be taken into account. 121 */ 122 123 /* 124 * The caller *must* have called conn_connect() or ip_attr_connect() 125 * before calling conn_ip_output(). The caller needs to redo that each time 126 * the destination IP address or port changes, as well as each time there is 127 * a change to any socket option that would modify how packets are routed out 128 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). 129 * 130 * The ULP caller has to serialize the use of a single ip_xmit_attr_t. 131 * We assert for that here. 
 */
int
conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	iaflags_t	ixaflags = ixa->ixa_flags;
	ire_t		*ire;
	nce_t		*nce;
	dce_t		*dce;
	ill_t		*ill;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	int		error;

	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */

	ASSERT(ixa->ixa_ire != NULL);
	/* Note there is no ixa_nce when reject and blackhole routes */
	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */

#ifdef DEBUG
	/* Catch concurrent use of a single ip_xmit_attr_t by the ULP */
	ASSERT(ixa->ixa_curthread == NULL);
	ixa->ixa_curthread = curthread;
#endif

	/*
	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
	 * for IGMP/MLD traffic.
	 */

	ire = ixa->ixa_ire;

	/*
	 * If the ULP says the (old) IRE resulted in reachability we
	 * record this before determining whether to use a new IRE.
	 * No locking for performance reasons.
	 */
	if (ixaflags & IXAF_REACH_CONF)
		ire->ire_badcnt = 0;

	/*
	 * Has routing changed since we cached the results of the lookup?
	 *
	 * This check captures all of:
	 * - the cached ire being deleted (by means of the special
	 *   IRE_GENERATION_CONDEMNED)
	 * - A potentially better ire being added (ire_generation being
	 *   increased)
	 * - A deletion of the nexthop ire that was used when we did the
	 *   lookup.
	 * - An addition of a potentially better nexthop ire.
	 * The last two are handled by walking and increasing the generation
	 * number on all dependent IREs in ire_flush_cache().
	 *
	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
	 * since we ensure that each time we set ixa_ire to such an IRE we
	 * make sure the ixa_ire_generation does not match (by using
	 * IRE_GENERATION_VERIFY).
	 */
	if (ire->ire_generation != ixa->ixa_ire_generation) {
		error = ip_verify_ire(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify ire",
			    mp, NULL);
			goto drop;
		}
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
			    &ipst->ips_dce_default->dce_ident));
		}
		/*
		 * If the ncec changed then ip_verify_ire already set
		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		 * so we can recheck the interface mtu.
		 */

		/*
		 * Note that ire->ire_generation could already have changed.
		 * We catch that next time we send a packet.
		 */
	}

	/*
	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
	 * is single threaded.
	 */
	ASSERT(ixa->ixa_nce != NULL);
	nce = ixa->ixa_nce;
	if (nce->nce_is_condemned) {
		error = ip_verify_nce(mp, ixa);
		/*
		 * In case the ZEROCOPY capability has become unavailable, we
		 * copy the message and free the original one. We might
		 * be copying more data than needed but it doesn't hurt
		 * since such a change rarely happens.
		 */
		switch (error) {
		case 0:
			break;
		case ENOTSUP: { /* ZEROCOPY */
			mblk_t *nmp;

			if ((nmp = copymsg(mp)) != NULL) {
				freemsg(mp);
				mp = nmp;

				break;
			}
			/* FALLTHROUGH */
		}
		default:
			ip_drop_output("ipIfStatsOutDiscards - verify nce",
			    mp, NULL);
			goto drop;
		}
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
			    ixa, &ipst->ips_dce_default->dce_ident));
		}
		ASSERT(ixa->ixa_nce != NULL);
		nce = ixa->ixa_nce;

		/*
		 * Note that some other event could already have made
		 * the new nce condemned. We catch that next time we
		 * try to send a packet.
		 */
	}
	/*
	 * If there is no per-destination dce_t then we have a reference to
	 * the default dce_t (which merely contains the dce_ipid).
	 * The generation check captures both the introduction of a
	 * per-destination dce_t (e.g., due to ICMP packet too big) and
	 * any change to the per-destination dce (including it becoming
	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
	 */
	dce = ixa->ixa_dce;

	/*
	 * To avoid needing a periodic timer to increase the path MTU we
	 * look at dce_last_change_time each time we send a packet.
	 */
	if ((dce->dce_flags & DCEF_PMTU) &&
	    (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
	    ipst->ips_ip_pathmtu_interval)) {
		/*
		 * Older than 20 minutes. Drop the path MTU information.
		 * Since the path MTU changes as a result of this, twiddle
		 * ixa_dce_generation to make us go through the dce
		 * verification code in conn_ip_output.
		 */
		mutex_enter(&dce->dce_lock);
		dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
		mutex_exit(&dce->dce_lock);
		dce_increment_generation(dce);
	}

	if (dce->dce_generation != ixa->ixa_dce_generation) {
		error = ip_verify_dce(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify dce",
			    mp, NULL);
			goto drop;
		}
		dce = ixa->ixa_dce;

		/*
		 * Note that some other event could already have made the
		 * new dce's generation number change.
		 * We catch that next time we try to send a packet.
		 */
	}

	ill = nce->nce_ill;

	/*
	 * An initial ixa_fragsize was set in ip_set_destination
	 * and we update it if any routing changes above.
	 * A change to ill_mtu with ifconfig will increase all dce_generation
	 * so that we will detect that with the generation check.
	 */

	/*
	 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
	 * conn_unspec_src.
	 */
	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
	    ixa->ixa_src_generation != ipst->ips_src_generation) {
		/* Check if the IP source is still assigned to the host. */
		uint_t gen;

		if (!ip_verify_src(mp, ixa, &gen)) {
			/* Don't send a packet with a source that isn't ours */
			error = EADDRNOTAVAIL;
			ip_drop_output("ipIfStatsOutDiscards - invalid src",
			    mp, NULL);
			goto drop;
		}
		/* The source is still valid - update the generation number */
		ixa->ixa_src_generation = gen;
	}

	/*
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_multirt_v* - if RTF_MULTIRT
	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v* - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v* - for the rest.
	 */
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));

drop:
	/* Common error exit: count the discard on the right (v4/v6) MIB */
	if (ixaflags & IXAF_IS_IPV4) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
	} else {
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
	}
	freemsg(mp);
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return (error);
}

/*
 * Handle both IPv4 and IPv6. Sets the generation number
 * to allow the caller to know when to call us again.
 * Returns true if the source address in the packet is a valid source.
 * We handle callers which try to send with a zero address (since we only
 * get here if UNSPEC_SRC is not set).
 */
boolean_t
ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;

	/*
	 * Need to grab the generation number before we check to
	 * avoid a race with a change to the set of local addresses.
	 * No lock needed since the thread which updates the set of local
	 * addresses use ipif/ill locks and exit those (hence a store memory
	 * barrier) before doing the atomic increase of ips_src_generation.
	 */
	if (generationp != NULL)
		*generationp = ipst->ips_src_generation;

	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)mp->b_rptr;

		if (ipha->ipha_src == INADDR_ANY)
			return (B_FALSE);

		return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
		    ipst, B_FALSE) != IPVL_BAD);
	} else {
		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
		uint_t	scopeid;

		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
			return (B_FALSE);

		if (ixa->ixa_flags & IXAF_SCOPEID_SET)
			scopeid = ixa->ixa_scopeid;
		else
			scopeid = 0;

		return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
		    ipst, B_FALSE, scopeid) != IPVL_BAD);
	}
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
 */
int
ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	uint_t		gen;
	ire_t		*ire;
	nce_t		*nce;
	int		error;
	boolean_t	multirt = B_FALSE;

	/*
	 * Redo ip_select_route.
	 * Need to grab generation number as part of the lookup to
	 * avoid race.
	 */
	error = 0;
	ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0) {
		ire_refrele(ire);
		return (error);
	}

	if (ixa->ixa_ire != NULL)
		ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
	/* Convert the tracked hold into an untracked one for ixa_ire */
	ire_refhold_notr(ire);
	ire_refrele(ire);
#endif
	ixa->ixa_ire = ire;
	ixa->ixa_ire_generation = gen;
	if (multirt) {
		if (ixa->ixa_flags & IXAF_IS_IPV4)
			ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
		else
			ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}

	/*
	 * Don't look for an nce for reject or blackhole.
	 * They have ire_generation set to IRE_GENERATION_VERIFY which
	 * makes conn_ip_output avoid references to ixa_nce.
	 */
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (0);
	}

	/* The NCE could now be different */
	nce = ire_to_nce_pkt(ire, mp);
	if (nce == NULL) {
		/*
		 * Allocation failure. Make sure we redo ire/nce selection
		 * next time we send.
		 */
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (ENOBUFS);
	}
	if (nce == ixa->ixa_nce) {
		/* No change */
		nce_refrele(nce);
		return (0);
	}

	/*
	 * Since the path MTU might change as a result of this
	 * route change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	if (ixa->ixa_nce != NULL)
		nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (0);
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
 */
static int
ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ire_t		*ire = ixa->ixa_ire;
	nce_t		*nce;
	int		error = 0;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;

	if (ire->ire_ipversion == IPV4_VERSION)
		ipha = (ipha_t *)mp->b_rptr;
	else
		ip6h = (ip6_t *)mp->b_rptr;

	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
	if (nce == NULL) {
		/* Try to find a better ire */
		return (ip_verify_ire(mp, ixa));
	}

	/*
	 * The hardware offloading capabilities, for example LSO, of the
	 * interface might have changed, so do sanity verification here.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
		if (!ip_verify_lso(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_LSO, 0);
			error = ENOTSUP;
		}
	}

	/*
	 * Verify the ZEROCOPY capability of the underlying ill. Notify the
	 * ULP of any ZEROCOPY changes. In case the ZEROCOPY capability is
	 * no longer available, return an error so that conn_ip_output() can
	 * take care of the ZEROCOPY message properly. It's safe to continue
	 * sending the message when ZEROCOPY newly becomes available.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_ZCOPY, 0);
			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
				error = ENOTSUP;
		}
	}

	/*
	 * Since the path MTU might change as a result of this
	 * change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (error);
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
 */
static int
ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	dce_t		*dce;
	uint_t		gen;
	uint_t		pmtu;

	dce = dce_lookup_pkt(mp, ixa, &gen);
	ASSERT(dce != NULL);

	dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
	/* Convert the tracked hold into an untracked one for ixa_dce */
	dce_refhold_notr(dce);
	dce_refrele(dce);
#endif
	ixa->ixa_dce = dce;
	ixa->ixa_dce_generation = gen;

	/* Extract the (path) mtu from the dce, ncec_ill etc */
	pmtu = ip_get_pmtu(ixa);

	/*
	 * Tell ULP about PMTU changes - increase or decrease - by returning
	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
	 * both ixa_pmtu and ixa_fragsize appropriately.
	 *
	 * If ULP doesn't set that flag then we need to update ixa_fragsize
	 * since routing could have changed the ill after ixa_fragsize
	 * was set previously in the conn_ip_output path or in
	 * ip_set_destination.
	 *
	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
	 *
	 * In the case of a path MTU increase we send the packet after the
	 * notify to the ULP.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
		if (ixa->ixa_pmtu != pmtu) {
			uint_t oldmtu = ixa->ixa_pmtu;

			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
			    uint32_t, ixa->ixa_pmtu);
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_PMTU, pmtu);
			if (pmtu < oldmtu)
				return (EMSGSIZE);
		}
	} else {
		ixa->ixa_fragsize = pmtu;
	}
	return (0);
}

/*
 * Verify LSO usability. Keep the return value simple to indicate whether
 * the LSO capability has changed. Handle both IPv4 and IPv6.
 */
static boolean_t
ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
{
	ill_lso_capab_t	*lsoc = &ixa->ixa_lso_capab;
	ill_lso_capab_t	*new_lsoc = ill->ill_lso_capab;

	if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
		/*
		 * Check whether LSO is no longer usable.
		 */
		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
		    !ILL_LSO_TCP_IPV4_USABLE(ill) :
		    !ILL_LSO_TCP_IPV6_USABLE(ill))) {
			ixa->ixa_flags &= ~IXAF_LSO_CAPAB;

			return (B_FALSE);
		}

		/*
		 * Capability has changed, refresh the copy in ixa.
		 */
		if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) {
			*lsoc = *new_lsoc;

			return (B_FALSE);
		}
	} else { /* Was not usable */
		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
		    ILL_LSO_TCP_IPV4_USABLE(ill) :
		    ILL_LSO_TCP_IPV6_USABLE(ill))) {
			*lsoc = *new_lsoc;
			ixa->ixa_flags |= IXAF_LSO_CAPAB;

			return (B_FALSE);
		}
	}

	return (B_TRUE);
}

/*
 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
 */
static boolean_t
ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
{
	if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
		/*
		 * Check whether ZEROCOPY is no longer usable.
		 */
		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
		    !ILL_ZCOPY_USABLE(ill)) {
			ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;

			return (B_FALSE);
		}
	} else { /* Was not usable */
		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
		    ILL_ZCOPY_USABLE(ill)) {
			ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;

			return (B_FALSE);
		}
	}

	return (B_TRUE);
}


/*
 * When there is no conn_t context, this will send a packet.
 * The caller must *not* have called conn_connect() or ip_attr_connect()
 * before calling ip_output_simple().
 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
 * Honors IXAF_SET_SOURCE.
 *
 * We acquire the ire and after calling ire_sendfn we release
 * the hold on the ire. Ditto for the nce and dce.
 *
 * This assumes that the caller has set the following in ip_xmit_attr_t:
 *	ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
 *	If ixa_ifindex is non-zero it means send out that ill. (If it is
 *	an upper IPMP ill we load balance across the group; if a lower we send
 *	on that lower ill without load balancing.)
 *	IXAF_IS_IPV4 must be set correctly.
 *	If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
 *	If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
 *	If neither of those two are set we do an IPsec policy lookup.
 *
 * We handle setting things like
 *	ixa_pktlen
 *	ixa_ip_hdr_length
 *	ixa->ixa_protocol
 *
 * The caller may set ixa_xmit_hint, which is used for ECMP selection and
 * transmit ring selecting in GLD.
 *
 * The caller must do an ixa_cleanup() to release any IPsec references
 * after we return.
 */
int
ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ts_label_t	*effective_tsl = NULL;
	int		err;

	ASSERT(ixa->ixa_ipst != NULL);

	if (is_system_labeled()) {
		ip_stack_t *ipst = ixa->ixa_ipst;

		/* May modify the label on the packet and replace mp */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
			    &effective_tsl);
		} else {
			err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
			    &effective_tsl);
		}
		if (err != 0) {
			ip2dbg(("tsol_check: label check failed (%d)\n", err));
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("tsol_check_label", mp, NULL);
			freemsg(mp);
			return (err);
		}
		if (effective_tsl != NULL) {
			/* Update the label */
			ip_xmit_attr_replace_tsl(ixa, effective_tsl);
		}
	}

	if (ixa->ixa_flags & IXAF_IS_IPV4)
		return (ip_output_simple_v4(mp, ixa));
	else
		return
		    (ip_output_simple_v6(mp, ixa));
}

int
ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ipha_t		*ipha;
	ipaddr_t	firsthop; /* In IP header */
	ipaddr_t	dst;	/* End of source route, or ipha_dst if none */
	ire_t		*ire;
	ipaddr_t	setsrc;	/* RTF_SETSRC */
	int		error;
	ill_t		*ill = NULL;
	dce_t		*dce = NULL;
	nce_t		*nce;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	boolean_t	repeat = B_FALSE;
	boolean_t	multirt = B_FALSE;

	ipha = (ipha_t *)mp->b_rptr;
	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);

	/*
	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
	 * for IGMP/MLD traffic.
	 */

	/* Caller already set flags */
	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);

	ASSERT(ixa->ixa_nce == NULL);

	ixa->ixa_pktlen = ntohs(ipha->ipha_length);
	ASSERT(ixa->ixa_pktlen == msgdsize(mp));
	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
	ixa->ixa_protocol = ipha->ipha_protocol;

	/*
	 * Assumes that source routed packets have already been massaged by
	 * the ULP (ip_massage_options) and as a result ipha_dst is the next
	 * hop in the source route. The final destination is used for IPsec
	 * policy and DCE lookup.
	 */
	firsthop = ipha->ipha_dst;
	dst = ip_get_dst(ipha);

repeat_ire:
	error = 0;
	setsrc = INADDR_ANY;
	ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error,
	    &multirt);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
		freemsg(mp);
		goto done;
	}

	if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
		/* ire_ill might be NULL hence need to skip some code */
		if (ixaflags & IXAF_SET_SOURCE)
			ipha->ipha_src = htonl(INADDR_LOOPBACK);
		ixa->ixa_fragsize = IP_MAXPACKET;
		ill = NULL;
		nce = NULL;
		ire->ire_ob_pkt_count++;
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		/* No dce yet; use default one */
		error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
		    &ipst->ips_dce_default->dce_ident);
		goto done;
	}

	/* Note that ipha_dst is only used for IRE_MULTICAST */
	nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
	if (nce == NULL) {
		/* Allocation failure? */
		ip_drop_output("ire_to_nce", mp, ill);
		freemsg(mp);
		error = ENOBUFS;
		goto done;
	}
	if (nce->nce_is_condemned) {
		nce_t *nce1;

		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
		nce_refrele(nce);
		if (nce1 == NULL) {
			if (!repeat) {
				/* Try finding a better IRE */
				repeat = B_TRUE;
				ire_refrele(ire);
				goto repeat_ire;
			}
			/* Tried twice - drop packet */
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("No nce", mp, ill);
			freemsg(mp);
			error = ENOBUFS;
			goto done;
		}
		nce = nce1;
	}

	/*
	 * For multicast with multirt we have a flag passed back from
	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
	 * possible multicast address.
	 * We also need a flag for multicast since we can't check
	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
	 */
	if (multirt) {
		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}
	ASSERT(ixa->ixa_nce == NULL);
	ixa->ixa_nce = nce;

	/*
	 * Check for a dce_t with a path mtu.
	 */
	dce = dce_lookup_v4(dst, ipst, NULL);
	ASSERT(dce != NULL);

	if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
	} else if (dce->dce_flags & DCEF_PMTU) {
		/*
		 * To avoid needing a periodic timer to increase the path MTU
		 * we look at dce_last_change_time each time we send a packet.
		 */
		if (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			/*
			 * Older than 20 minutes. Drop the path MTU information.
			 */
			mutex_enter(&dce->dce_lock);
			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
			dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
			ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
		} else {
			uint_t fragsize;

			/* Path MTU is still fresh; clamp to it */
			fragsize = ip_get_base_mtu(nce->nce_ill, ire);
			if (fragsize > dce->dce_pmtu)
				fragsize = dce->dce_pmtu;
			ixa->ixa_fragsize = fragsize;
		}
	} else {
		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
	}

	/*
	 * We use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
	 * interface for source address selection.
	 */
	ill = ire_nexthop_ill(ire);

	if (ixaflags & IXAF_SET_SOURCE) {
		ipaddr_t	src;

		/*
		 * We use the final destination to get
		 * correct selection for source routed packets
		 */

		/* If unreachable we have no ill but need some source */
		if (ill == NULL) {
			src = htonl(INADDR_LOOPBACK);
			error = 0;
		} else {
			error = ip_select_source_v4(ill, setsrc, dst,
			    ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
			    &src, NULL, NULL);
		}
		/*
		 * NOTE(review): error != 0 implies ill != NULL here (the
		 * ill == NULL branch sets error = 0), so the ill_ip_mib
		 * dereference below looks safe — confirm no other path.
		 */
		if (error != 0) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - no source",
			    mp, ill);
			freemsg(mp);
			goto done;
		}
		ipha->ipha_src = src;
	} else if (ixaflags & IXAF_VERIFY_SOURCE) {
		/* Check if the IP source is assigned to the host. */
		if (!ip_verify_src(mp, ixa, NULL)) {
			/* Don't send a packet with a source that isn't ours */
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - invalid source",
			    mp, ill);
			freemsg(mp);
			error = EADDRNOTAVAIL;
			goto done;
		}
	}


	/*
	 * Check against global IPsec policy to set the AH/ESP attributes.
	 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
	 */
	if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
		ASSERT(ixa->ixa_ipsec_policy == NULL);
		mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
		if (mp == NULL) {
			/* MIB and ip_drop_packet already done */
			return (EHOSTUNREACH);	/* IPsec policy failure */
		}
	}

	if (ill != NULL) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
	} else {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
	}

	/*
	 * We update the statistics on the most specific IRE i.e., the first
	 * one we found.
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_multirt_v4 - if RTF_MULTIRT
	 *	ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v4 - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v4 - for the rest.
	 */
	error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
done:
	/* Release every reference acquired above; ixa_nce is transient here */
	ire_refrele(ire);
	if (dce != NULL)
		dce_refrele(dce);
	if (ill != NULL)
		ill_refrele(ill);
	if (ixa->ixa_nce != NULL)
		nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = NULL;
	return (error);
}

/*
 * ire_sendfn() functions.
 * These functions use the following xmit_attr:
 *	- ixa_fragsize - read to determine whether or not to fragment
 *	- IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
 *	- ixa_ipsec_*  are used inside IPsec
 *	- IXAF_SET_SOURCE - replace IP source in broadcast case.
 *	- IXAF_LOOPBACK_COPY - for multicast and broadcast
 */


/*
 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
 *
 * The checks for restrict_interzone_loopback are done in ire_route_recursive.
 */
/* ARGSUSED4 */
int
ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	uint_t		pktlen = ixa->ixa_pktlen;

	/*
	 * No fragmentation, no nce, no application of IPsec,
	 * and no ipha_ident assignment.
1081 * 1082 * Note different order between IP provider and FW_HOOKS than in 1083 * send_wire case. 1084 */ 1085 1086 /* 1087 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1088 * send probe, but not the receive probe. 1089 */ 1090 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1091 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1092 int, 1); 1093 1094 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1095 int error; 1096 1097 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1098 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1099 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1100 ipst->ips_ipv4firewall_loopback_out, 1101 NULL, ill, ipha, mp, mp, 0, ipst, error); 1102 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1103 if (mp == NULL) 1104 return (error); 1105 1106 /* 1107 * Even if the destination was changed by the filter we use the 1108 * forwarding decision that was made based on the address 1109 * in ip_output/ip_set_destination. 1110 */ 1111 /* Length could be different */ 1112 ipha = (ipha_t *)mp->b_rptr; 1113 pktlen = ntohs(ipha->ipha_length); 1114 } 1115 1116 /* 1117 * If a callback is enabled then we need to know the 1118 * source and destination zoneids for the packet. We already 1119 * have those handy. 
1120 */ 1121 if (ipst->ips_ip4_observe.he_interested) { 1122 zoneid_t szone, dzone; 1123 zoneid_t stackzoneid; 1124 1125 stackzoneid = netstackid_to_zoneid( 1126 ipst->ips_netstack->netstack_stackid); 1127 1128 if (stackzoneid == GLOBAL_ZONEID) { 1129 /* Shared-IP zone */ 1130 dzone = ire->ire_zoneid; 1131 szone = ixa->ixa_zoneid; 1132 } else { 1133 szone = dzone = stackzoneid; 1134 } 1135 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1136 } 1137 1138 /* Handle lo0 stats */ 1139 ipst->ips_loopback_packets++; 1140 1141 /* Map ixa to ira including IPsec policies */ 1142 ipsec_out_to_in(ixa, ill, &iras); 1143 iras.ira_pktlen = pktlen; 1144 1145 if (!IS_SIMPLE_IPH(ipha)) { 1146 ip_output_local_options(ipha, ipst); 1147 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1148 } 1149 1150 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1151 int error; 1152 1153 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1154 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1155 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1156 ipst->ips_ipv4firewall_loopback_in, 1157 ill, NULL, ipha, mp, mp, 0, ipst, error); 1158 1159 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1160 if (mp == NULL) { 1161 ira_cleanup(&iras, B_FALSE); 1162 return (error); 1163 } 1164 /* 1165 * Even if the destination was changed by the filter we use the 1166 * forwarding decision that was made based on the address 1167 * in ip_output/ip_set_destination. 
1168 */ 1169 /* Length could be different */ 1170 ipha = (ipha_t *)mp->b_rptr; 1171 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1172 } 1173 1174 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1175 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1176 int, 1); 1177 1178 ire->ire_ib_pkt_count++; 1179 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1180 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1181 1182 /* Destined to ire_zoneid - use that for fanout */ 1183 iras.ira_zoneid = ire->ire_zoneid; 1184 1185 if (is_system_labeled()) { 1186 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1187 1188 /* 1189 * This updates ira_cred, ira_tsl and ira_free_flags based 1190 * on the label. We don't expect this to ever fail for 1191 * loopback packets, so we silently drop the packet should it 1192 * fail. 1193 */ 1194 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1195 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1196 ip_drop_input("tsol_get_pkt_label", mp, ill); 1197 freemsg(mp); 1198 return (0); 1199 } 1200 ASSERT(iras.ira_tsl != NULL); 1201 1202 /* tsol_get_pkt_label sometimes does pullupmsg */ 1203 ipha = (ipha_t *)mp->b_rptr; 1204 } 1205 1206 ip_fanout_v4(mp, ipha, &iras); 1207 1208 /* We moved any IPsec refs from ixa to iras */ 1209 ira_cleanup(&iras, B_FALSE); 1210 return (0); 1211 } 1212 1213 /* 1214 * ire_sendfn for IRE_BROADCAST 1215 * If the broadcast address is present on multiple ills and ixa_ifindex 1216 * isn't set, then we generate 1217 * a separate datagram (potentially with different source address) for 1218 * those ills. In any case, only one copy is looped back to ip_input_v4. 
 */
int
ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	irb_t		*irb = ire->ire_bucket;
	ire_t		*ire1;
	mblk_t		*mp1;
	ipha_t		*ipha1;
	iaflags_t	ixaflags = ixa->ixa_flags;
	nce_t		*nce1, *nce_orig;

	/*
	 * Unless ire_send_multirt_v4 already set a ttl, force the
	 * ttl to a smallish value.
	 */
	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
		/*
		 * To avoid broadcast storms, we usually set the TTL to 1 for
		 * broadcasts. This can
		 * be overridden stack-wide through the ip_broadcast_ttl
		 * ndd tunable, or on a per-connection basis through the
		 * IP_BROADCAST_TTL socket option.
		 *
		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
		 * will force ttl to one after we've set this.
		 */
		if (ixaflags & IXAF_BROADCAST_TTL_SET)
			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
		else
			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
	}
	/*
	 * Make sure we get a loopback copy (after IPsec and frag)
	 * Skip hardware checksum so that loopback copy is checksummed.
	 */
	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;

	/* Do we need to potentially generate multiple copies? */
	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));

	/*
	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
	 * Note that everything in the bucket has the same destination address.
	 */
	irb_refhold(irb);
	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		/* We do the main IRE after the end of the loop */
		if (ire1 == ire)
			continue;

		/*
		 * Only IREs for the same IP address should be in the same
		 * bucket.
		 * But could have IRE_HOSTs in the case of CGTP.
		 * If we find any multirt routes we bail out of the loop
		 * and just do the single packet at the end; ip_postfrag_multirt
		 * will duplicate the packet.
		 */
		ASSERT(ire1->ire_addr == ire->ire_addr);
		if (!(ire1->ire_type & IRE_BROADCAST))
			continue;

		if (IRE_IS_CONDEMNED(ire1))
			continue;

		if (ixa->ixa_zoneid != ALL_ZONES &&
		    ire->ire_zoneid != ire1->ire_zoneid)
			continue;

		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);

		if (ire1->ire_flags & RTF_MULTIRT)
			break;

		/*
		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
		 * ensure that this goes out on the cast_ill.
		 */
		if (IS_UNDER_IPMP(ire1->ire_ill))
			continue;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards",
			    mp, ire1->ire_ill);
			continue;
		}

		ipha1 = (ipha_t *)mp1->b_rptr;
		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
			/*
			 * Need to pick a different source address for each
			 * interface. If we have a global IPsec policy and
			 * no per-socket policy then we punt to
			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
			 */
			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
				ip_output_simple_broadcast(ixa, mp1);
				continue;
			}
			/* Pick a new source address for each interface */
			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
			    &ipha1->ipha_src, NULL, NULL) != 0) {
				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
				    ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards - select "
				    "broadcast source", mp1, ire1->ire_ill);
				freemsg(mp1);
				continue;
			}
			/*
			 * Check against global IPsec policy to set the AH/ESP
			 * attributes. IPsec will set IXAF_IPSEC_* and
			 * ixa_ipsec_* as appropriate.
			 *
			 * NOTE(review): 'ipha' (the original packet's header)
			 * is passed here rather than 'ipha1' (the header of
			 * the copy mp1 being processed) — verify this is
			 * intended.
			 */
			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
				ASSERT(ixa->ixa_ipsec_policy == NULL);
				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
				    NULL, ixa);
				if (mp1 == NULL) {
					/*
					 * MIB and ip_drop_packet already
					 * done
					 */
					continue;
				}
			}
		}
		/* Make sure we have an NCE on this ill */
		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
		    ire1->ire_type);
		if (nce1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
			    mp1, ire1->ire_ill);
			freemsg(mp1);
			continue;
		}
		/* Temporarily swap this ill's nce into the shared ixa */
		nce_orig = ixa->ixa_nce;
		ixa->ixa_nce = nce1;

		ire_refhold(ire1);
		/*
		 * Ignore any errors here. We just collect the errno for
		 * the main ire below
		 */
		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
		ire_refrele(ire1);

		ixa->ixa_nce = nce_orig;
		nce_refrele(nce1);

		/* Only one copy is ever looped back */
		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
	}
	irb_refrele(irb);
	/* Finally, the main one */

	/*
	 * For IPMP we only send broadcasts on the ipmp_ill.
	 */
	if (IS_UNDER_IPMP(ire->ire_ill)) {
		freemsg(mp);
		return (0);
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}

/*
 * Send a packet using a different source address and different
 * IPsec policy.
 */
static void
ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
{
	ip_xmit_attr_t ixas;

	/*
	 * Build a fresh, simple transmit attribute so that the packet is
	 * re-evaluated (source selection, global IPsec policy) from scratch.
	 */
	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
	ixas.ixa_zoneid = ixa->ixa_zoneid;
	ixas.ixa_ifindex = 0;
	ixas.ixa_ipst = ixa->ixa_ipst;
	ixas.ixa_cred = ixa->ixa_cred;
	ixas.ixa_cpid = ixa->ixa_cpid;
	ixas.ixa_tsl = ixa->ixa_tsl;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
}


/*
 * Clamp the TTL for an RTF_MULTIRT route and mark ixa so later send
 * functions do not raise it again (IXAF_NO_TTL_CHANGE).
 */
static void
multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;

	/* Limit the TTL on multirt packets */
	if (ire->ire_type & IRE_MULTICAST) {
		if (ipha->ipha_ttl > 1) {
			ip2dbg(("ire_send_multirt_v4: forcing multicast "
			    "multirt TTL to 1 (was %d), dst 0x%08x\n",
			    ipha->ipha_ttl, ntohl(ire->ire_addr)));
			ipha->ipha_ttl = 1;
		}
		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
	} else if ((ipst->ips_ip_multirt_ttl > 0) &&
	    (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
		ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
		/*
		 * Need to ensure we don't increase the ttl should we go through
		 * ire_send_broadcast or multicast.
		 */
		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
	}
}

/*
 * ire_sendfn for IRE_MULTICAST
 *
 * Decides whether a loopback copy of the multicast packet is needed
 * (for multicast routing daemons, IP_MULTICAST_LOOP, or receivers in
 * other shared-IP zones), forces the multicast TTL when permitted, and
 * hands off to ire_send_wire_v4().
 */
int
ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	iaflags_t	ixaflags = ixa->ixa_flags;

	/*
	 * The IRE_MULTICAST is the same whether or not multirt is in use.
	 * Hence we need special-case code.
	 */
	if (ixaflags & IXAF_MULTIRT_MULTICAST)
		multirt_check_v4(ire, ipha, ixa);

	/*
	 * Check if anything in ip_input_v4 wants a copy of the transmitted
	 * packet (after IPsec and fragmentation)
	 *
	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
	 *    RSVP and the rsvp daemon is an example of a
	 *    protocol and user level process that
	 *    handles its own routing. Hence, it uses the
	 *    SO_DONTROUTE option to accomplish this.
	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
	 *    check whether there are any receivers for the group on the ill
	 *    (ignoring the zoneid).
	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
	 *    any members in other shared-IP zones.
	 *    If such members exist, then we indicate that the sending zone
	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
	 *    behavior.
	 *
	 * When we loopback we skip hardware checksum to make sure loopback
	 * copy is checksummed.
	 *
	 * Note that ire_ill is the upper in the case of IPMP.
	 */
	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
	    !(ixaflags & IXAF_DONTROUTE)) {
		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
		/*
		 * If this zone or any other zone has members then loopback
		 * a copy.
		 */
		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ipst->ips_netstack->netstack_numzones > 1) {
		/*
		 * This zone should not have a copy. But there are some other
		 * zones which might have members.
		 */
		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
		    ixa->ixa_zoneid)) {
			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
		}
	}

	/*
	 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
	 * force the ttl to the IP_MULTICAST_TTL value
	 */
	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}

/*
 * ire_sendfn for IREs with RTF_MULTIRT
 *
 * Clamps the TTL, then dispatches to the multicast, broadcast or wire
 * send function based on ire_type.
 */
int
ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;

	multirt_check_v4(ire, ipha, ixa);

	if (ire->ire_type & IRE_MULTICAST)
		return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
	else if (ire->ire_type & IRE_BROADCAST)
		return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
	else
		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}

/*
 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
 *
 * RTF_BLACKHOLE drops silently; otherwise the packet is dropped and an
 * ICMP unreachable is generated back to the sender.
 */
int
ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ill_t		*ill;
	ip_recv_attr_t	iras;
	boolean_t	dummy;

	/* We assign an IP ident for nice errors */
	ipha->ipha_ident = atomic_add_32_nv(identp, 1);

	BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);

	if (ire->ire_type & IRE_NOROUTE) {
		/* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
		ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
		    RTA_DST, ipst);
	}

	if (ire->ire_flags & RTF_BLACKHOLE) {
		ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp,
NULL); 1566 freemsg(mp); 1567 /* No error even for local senders - silent blackhole */ 1568 return (0); 1569 } 1570 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); 1571 1572 /* 1573 * We need an ill_t for the ip_recv_attr_t even though this packet 1574 * was never received and icmp_unreachable doesn't currently use 1575 * ira_ill. 1576 */ 1577 ill = ill_lookup_on_name("lo0", B_FALSE, 1578 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); 1579 if (ill == NULL) { 1580 freemsg(mp); 1581 return (EHOSTUNREACH); 1582 } 1583 1584 bzero(&iras, sizeof (iras)); 1585 /* Map ixa to ira including IPsec policies */ 1586 ipsec_out_to_in(ixa, ill, &iras); 1587 1588 if (ip_source_routed(ipha, ipst)) { 1589 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); 1590 } else { 1591 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 1592 } 1593 /* We moved any IPsec refs from ixa to iras */ 1594 ira_cleanup(&iras, B_FALSE); 1595 ill_refrele(ill); 1596 return (EHOSTUNREACH); 1597 } 1598 1599 /* 1600 * Calculate a checksum ignoring any hardware capabilities 1601 * 1602 * Returns B_FALSE if the packet was too short for the checksum. Caller 1603 * should free and do stats. 
 */
static boolean_t
ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
	ipaddr_t	dst = ipha->ipha_dst;
	ipaddr_t	src = ipha->ipha_src;

	/* Just in case it contained garbage */
	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;

	/*
	 * Calculate ULP checksum
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum seed */
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum seed */
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		/* SCTP uses its own CRC32c, not the pseudo-header sum */
		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
		/* Other protocols: only the IP header checksum is needed */
		goto ip_hdr_cksum;
	}

	/* ULP puts the checksum field in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * We accumulate the pseudo header checksum in cksum.
	 * This is pretty hairy code, so watch close. One
	 * thing to keep in mind is that UDP and TCP have
	 * stored their respective datagram lengths in their
	 * checksum fields. This lines things up real nice.
	 */
	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);

	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
	/*
	 * For UDP/IPv4 a zero means that the packet wasn't checksummed.
	 * Change to 0xffff
	 */
	if (protocol == IPPROTO_UDP && cksum == 0)
		*cksump = ~cksum;
	else
		*cksump = cksum;

	IP_STAT(ipst, ip_out_sw_cksum);
	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);

ip_hdr_cksum:
	/* Calculate IPv4 header checksum */
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (B_TRUE);
}

/*
 * Calculate the ULP checksum - try to use hardware.
 * In the case of MULTIRT, broadcast or multicast the
 * IXAF_NO_HW_CKSUM is set in which case we use software.
 *
 * If the hardware supports IP header checksum offload; then clear the
 * contents of IP header checksum field as expected by NIC.
 * Do this only if we offloaded either full or partial sum.
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 */
static boolean_t
ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
    ip_xmit_attr_t *ixa, ill_t *ill)
{
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint16_t	hck_flags;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;

	/* Fall back to software when offload is unusable or disabled */
	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
	    !dohwcksum) {
		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
	}

	/*
	 * Calculate ULP checksum. Note that we don't use cksump and cksum
	 * if the ill has FULL support.
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		/* SCTP CRC32c is always computed in software here */
		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
	ip_hdr_cksum:
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		return (B_TRUE);
	}

	/* ULP puts the checksum field in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * Underlying interface supports hardware checksum offload for
	 * the payload; leave the payload checksum for the hardware to
	 * calculate. N.B: We only need to set up checksum info on the
	 * first mblk.
	 */
	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;

	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
	if (hck_flags & HCKSUM_INET_FULL_V4) {
		/*
		 * Hardware calculates pseudo-header, header and the
		 * payload checksums, so clear the checksum field in
		 * the protocol header.
		 */
		*cksump = 0;
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	if ((hck_flags) & HCKSUM_INET_PARTIAL) {
		ipaddr_t	dst = ipha->ipha_dst;
		ipaddr_t	src = ipha->ipha_src;
		/*
		 * Partial checksum offload has been enabled. Fill
		 * the checksum field in the protocol header with the
		 * pseudo-header checksum value.
		 *
		 * We accumulate the pseudo header checksum in cksum.
		 * This is pretty hairy code, so watch close. One
		 * thing to keep in mind is that UDP and TCP have
		 * stored their respective datagram lengths in their
		 * checksum fields. This lines things up real nice.
		 */
		cksum += (dst >> 16) + (dst & 0xFFFF) +
		    (src >> 16) + (src & 0xFFFF);
		cksum += *(cksump);
		/* Fold twice to reduce the 32-bit sum into 16 bits */
		cksum = (cksum & 0xFFFF) + (cksum >> 16);
		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);

		/*
		 * Offsets are relative to beginning of IP header.
		 */
		DB_CKSUMSTART(mp) = ip_hdr_length;
		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
		DB_CKSUMEND(mp) = pktlen;
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	/* Hardware capabilities include neither full nor partial IPv4 */
	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
}

/*
 * ire_sendfn for offlink and onlink destinations.
 * Also called from the multicast, broadcast, multirt send functions.
 *
 * Assumes that the caller has a hold on the ire.
 *
 * This function doesn't care if the IRE just became condemned since that
 * can happen at any time.
 */
/* ARGSUSED */
int
ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipha_t		*ipha = (ipha_t *)iph_arg;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ill_t		*ill;

	ASSERT(ixa->ixa_nce != NULL);
	ill = ixa->ixa_nce->nce_ill;

	if (ixaflags & IXAF_DONTROUTE)
		ipha->ipha_ttl = 1;

	/*
	 * Assign an ident value for this packet. There could be other
	 * threads targeting the same destination, so we have to arrange
	 * for a atomic increment. Note that we use a 32-bit atomic add
	 * because it has better performance than its 16-bit sibling.
	 *
	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
	 * be the number of TCP segments that the driver/hardware will
	 * extraly construct.
	 *
	 * If running in cluster mode and if the source address
	 * belongs to a replicated service then vector through
	 * cl_inet_ipident vector to allocate ip identifier
	 * NOTE: This is a contract private interface with the
	 * clustering group.
	 */
	if (cl_inet_ipident != NULL) {
		ipaddr_t src = ipha->ipha_src;
		ipaddr_t dst = ipha->ipha_dst;
		netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;

		ASSERT(cl_inet_isclusterwide != NULL);
		if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
		    AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
			/*
			 * Note: not correct with LSO since we can't allocate
			 * ixa_extra_ident+1 consecutive values.
			 */
			ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
			    IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
			    (uint8_t *)(uintptr_t)dst, NULL);
		} else {
			ipha->ipha_ident = atomic_add_32_nv(identp,
			    ixa->ixa_extra_ident + 1);
		}
	} else {
		ipha->ipha_ident = atomic_add_32_nv(identp,
		    ixa->ixa_extra_ident + 1);
	}
#ifndef _BIG_ENDIAN
	ipha->ipha_ident = htons(ipha->ipha_ident);
#endif

	/*
	 * This might set b_band, thus the IPsec and fragmentation
	 * code in IP ensures that b_band is updated in the first mblk.
	 */
	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
		/* ip_process translates an IS_UNDER_IPMP */
		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
		if (mp == NULL) {
			/* ip_drop_packet and MIB done */
			return (0);	/* Might just be delayed */
		}
	}

	/*
	 * Verify any IPv4 options.
	 *
	 * The presence of IP options also forces the network stack to
	 * calculate the checksum in software. This is because:
	 *
	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
	 * the size of "start offset" width to 6-bit. This effectively
	 * sets the largest value of the offset to 64-bytes, starting
	 * from the MAC header. When the cumulative MAC and IP headers
	 * exceed such limit, the offset will wrap around. This causes
	 * the checksum to be calculated at the wrong place.
	 *
	 * IPv4 source routing: none of the full-checksum capable NICs
	 * is capable of correctly handling the IPv4 source-routing
	 * option for purposes of calculating the pseudo-header; the
	 * actual destination is different from the destination in the
	 * header which is that of the next-hop. (This case may not be
	 * true for NICs which can parse IPv6 extension headers, but
	 * we choose to simplify the implementation by not offloading
	 * checksum when they are present.)
	 */
	if (!IS_SIMPLE_IPH(ipha)) {
		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
		/* An IS_UNDER_IPMP ill is ok here */
		if (ip_output_options(mp, ipha, ixa, ill)) {
			/* Packet has been consumed and ICMP error sent */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			return (EINVAL);
		}
	}

	/*
	 * To handle IPsec/iptun's labeling needs we need to tag packets
	 * while we still have ixa_tsl
	 */
	if (is_system_labeled() && ixa->ixa_tsl != NULL &&
	    (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
	    ill->ill_mactype == DL_IPV6)) {
		cred_t	*newcr;

		newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
		    KM_NOSLEEP);
		if (newcr == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - newcr",
			    mp, ill);
			freemsg(mp);
			return (ENOBUFS);
		}
		mblk_setcred(mp, newcr, NOPID);
		crfree(newcr);	/* mblk_setcred did its own crhold */
	}

	/* Fragmentation and/or IPsec required? */
	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
	    (ixaflags & IXAF_IPSEC_SECURE)) {
		uint32_t pktlen;

		pktlen = ixa->ixa_pktlen;
		if (ixaflags & IXAF_IPSEC_SECURE)
			pktlen += ipsec_out_extra_length(ixa);

		/*
		 * NOTE(review): this early return does not freemsg(mp) —
		 * verify the caller owns the mblk on EMSGSIZE.
		 */
		if (pktlen > IP_MAXPACKET)
			return (EMSGSIZE);

		if (ixaflags & IXAF_SET_ULP_CKSUM) {
			/*
			 * Compute ULP checksum and IP header checksum
			 * using software
			 */
			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
				freemsg(mp);
				return (EINVAL);
			}
		} else {
			/* Calculate IPv4 header checksum */
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}

		/*
		 * If this packet would generate a icmp_frag_needed
		 * message, we need to handle it before we do the IPsec
		 * processing. Otherwise, we need to strip the IPsec
		 * headers before we send up the message to the ULPs
		 * which becomes messy and difficult.
		 *
		 * We check using IXAF_DONTFRAG. The DF bit in the header
		 * is not inspected - it will be copied to any generated
		 * fragments.
		 */
		if ((pktlen > ixa->ixa_fragsize) &&
		    (ixaflags & IXAF_DONTFRAG)) {
			/* Generate ICMP and return error */
			ip_recv_attr_t iras;

			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
			    uint_t, ixa->ixa_pmtu);

			bzero(&iras, sizeof (iras));
			/* Map ixa to ira including IPsec policies */
			ipsec_out_to_in(ixa, ill, &iras);

			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
			/* We moved any IPsec refs from ixa to iras */
			ira_cleanup(&iras, B_FALSE);
			return (EMSGSIZE);
		}
		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
		    uint_t, ixa->ixa_pmtu);

		if (ixaflags & IXAF_IPSEC_SECURE) {
			/*
			 * Pass in sufficient information so that
			 * IPsec can determine whether to fragment, and
			 * which function to call after fragmentation.
			 */
			return (ipsec_out_process(mp, ixa));
		}
		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
	}
	if (ixaflags & IXAF_SET_ULP_CKSUM) {
		/* Compute ULP checksum and IP header checksum */
		/* An IS_UNDER_IPMP ill is ok here */
		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
			freemsg(mp);
			return (EINVAL);
		}
	} else {
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	}
	/* No fragmentation needed: go straight to the post-frag function */
	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
}

/*
 * Send mp into ip_input
 * Common for IPv4 and IPv6
 */
void
ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
    uint_t pkt_len, zoneid_t nolzid)
{
	rtc_t		rtc;
	ill_t		*ill = nce->nce_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	ncec_t		*ncec;

	/* Build receive attributes as if the driver had looped it back */
	ncec = nce->nce_common;
	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
	if (ncec->ncec_flags & NCE_F_BCAST)
		iras.ira_flags |= IRAF_L2DST_BROADCAST;
	else if (ncec->ncec_flags & NCE_F_MCAST)
		iras.ira_flags |= IRAF_L2DST_MULTICAST;

	iras.ira_free_flags = 0;
	iras.ira_cred = NULL;
	iras.ira_cpid = NOPID;
	iras.ira_tsl = NULL;
	iras.ira_zoneid = ALL_ZONES;
	iras.ira_pktlen = pkt_len;
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);

	if (ixaflags & IXAF_IS_IPV4)
		iras.ira_flags |=
IRAF_IS_IPV4; 2071 2072 iras.ira_ill = iras.ira_rill = ill; 2073 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2074 iras.ira_rifindex = iras.ira_ruifindex; 2075 iras.ira_mhip = NULL; 2076 2077 iras.ira_flags |= ixaflags & IAF_MASK; 2078 iras.ira_no_loop_zoneid = nolzid; 2079 2080 /* Broadcast and multicast doesn't care about the squeue */ 2081 iras.ira_sqp = NULL; 2082 2083 rtc.rtc_ire = NULL; 2084 if (ixaflags & IXAF_IS_IPV4) { 2085 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2086 2087 rtc.rtc_ipaddr = INADDR_ANY; 2088 2089 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 2090 if (rtc.rtc_ire != NULL) { 2091 ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 2092 ire_refrele(rtc.rtc_ire); 2093 } 2094 } else { 2095 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2096 2097 rtc.rtc_ip6addr = ipv6_all_zeros; 2098 2099 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); 2100 if (rtc.rtc_ire != NULL) { 2101 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); 2102 ire_refrele(rtc.rtc_ire); 2103 } 2104 } 2105 /* Any references to clean up? No hold on ira */ 2106 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 2107 ira_cleanup(&iras, B_FALSE); 2108 } 2109 2110 /* 2111 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which 2112 * looks at the IXAF_LOOPBACK_COPY flag. 2113 * Common for IPv4 and IPv6. 2114 * 2115 * If the loopback copy fails (due to no memory) but we send the packet out 2116 * on the wire we return no failure. Only in the case we supress the wire 2117 * sending do we take the loopback failure into account. 2118 * 2119 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. 2120 * Those operations are performed on this packet in ip_xmit() and it would 2121 * be odd to do it twice for the same packet. 
2122 */ 2123 int 2124 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2125 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2126 uintptr_t *ixacookie) 2127 { 2128 ill_t *ill = nce->nce_ill; 2129 int error = 0; 2130 2131 /* 2132 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2133 * had looped it back 2134 */ 2135 if (ixaflags & IXAF_LOOPBACK_COPY) { 2136 mblk_t *mp1; 2137 2138 mp1 = copymsg(mp); 2139 if (mp1 == NULL) { 2140 /* Failed to deliver the loopback copy. */ 2141 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2142 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2143 error = ENOBUFS; 2144 } else { 2145 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2146 nolzid); 2147 } 2148 } 2149 2150 /* 2151 * If TTL = 0 then only do the loopback to this host i.e. we are 2152 * done. We are also done if this was the 2153 * loopback interface since it is sufficient 2154 * to loopback one copy of a multicast packet. 2155 */ 2156 if (ixaflags & IXAF_IS_IPV4) { 2157 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2158 2159 if (ipha->ipha_ttl == 0) { 2160 ip_drop_output("multicast ipha_ttl not sent to wire", 2161 mp, ill); 2162 freemsg(mp); 2163 return (error); 2164 } 2165 } else { 2166 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2167 2168 if (ip6h->ip6_hops == 0) { 2169 ip_drop_output("multicast ipha_ttl not sent to wire", 2170 mp, ill); 2171 freemsg(mp); 2172 return (error); 2173 } 2174 } 2175 if (nce->nce_ill->ill_wq == NULL) { 2176 /* Loopback interface */ 2177 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2178 freemsg(mp); 2179 return (error); 2180 } 2181 2182 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2183 ixacookie)); 2184 } 2185 2186 /* 2187 * Post fragmentation function for RTF_MULTIRT routes. 2188 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2189 * checks IXAF_LOOPBACK_COPY. 
 *
 * If no packet is sent due to failures then we return an errno, but if at
 * least one succeeded we return zero.
 */
int
ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
    uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
    uintptr_t *ixacookie)
{
	irb_t		*irb;
	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
	ire_t		*ire;
	ire_t		*ire1;
	mblk_t		*mp1;
	nce_t		*nce1;
	ill_t		*ill = nce->nce_ill;
	ill_t		*ill1;
	ip_stack_t	*ipst = ill->ill_ipst;
	int		error = 0;
	int		num_sent = 0;
	int		err;
	uint_t		ire_type;
	ipaddr_t	nexthop;

	ASSERT(ixaflags & IXAF_IS_IPV4);

	/* Check for IXAF_LOOPBACK_COPY */
	if (ixaflags & IXAF_LOOPBACK_COPY) {
		/* NOTE: this mp1 intentionally shadows the outer mp1 */
		mblk_t *mp1;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			/* Failed to deliver the loopback copy. */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
			error = ENOBUFS;
		} else {
			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
			    nolzid);
		}
	}

	/*
	 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
	 * a copy to each one.
	 * Use the nce (nexthop) and ipha_dst to find the ire.
	 *
	 * MULTIRT is not designed to work with shared-IP zones thus we don't
	 * need to pass a zoneid or a label to the IRE lookup.
	 */
	if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
		/* Broadcast and multicast case */
		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
		    NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
	} else {
		ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);

		/* Unicast case */
		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
		    NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
	}

	if (ire == NULL ||
	    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    !(ire->ire_flags & RTF_MULTIRT)) {
		/* Drop */
		ip_drop_output("ip_postfrag_multirt didn't find route",
		    mp, nce->nce_ill);
		if (ire != NULL)
			ire_refrele(ire);
		return (ENETUNREACH);
	}

	/* Hold the bucket so the ire chain can't change under us */
	irb = ire->ire_bucket;
	irb_refhold(irb);
	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		/*
		 * For broadcast we can have a mixture of IRE_BROADCAST and
		 * IRE_HOST due to the manually added IRE_HOSTs that are used
		 * to trigger the creation of the special CGTP broadcast routes.
		 * Thus we have to skip if ire_type doesn't match the original.
		 */
		if (IRE_IS_CONDEMNED(ire1) ||
		    !(ire1->ire_flags & RTF_MULTIRT) ||
		    ire1->ire_type != ire->ire_type)
			continue;

		/* Do the ire argument one after the loop */
		if (ire1 == ire)
			continue;

		ill1 = ire_nexthop_ill(ire1);
		if (ill1 == NULL) {
			/*
			 * This ire might not have been picked by
			 * ire_route_recursive, in which case ire_dep might
			 * not have been setup yet.
			 * We kick ire_route_recursive to try to resolve
			 * starting at ire1.
			 */
			ire_t *ire2;

			ire2 = ire_route_recursive_impl_v4(ire1,
			    ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
			    ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY,
			    B_TRUE, 0, ipst, NULL, NULL, NULL);
			if (ire2 != NULL)
				ire_refrele(ire2);
			ill1 = ire_nexthop_ill(ire1);
		}

		if (ill1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - no ill",
			    mp, ill);
			error = ENETUNREACH;
			continue;
		}

		/* Pick the addr and type to use for arp_nce_init */
		if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
			ire_type = IRE_BROADCAST;
			nexthop = ire1->ire_gateway_addr;
		} else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
			ire_type = IRE_MULTICAST;
			nexthop = ipha->ipha_dst;
		} else {
			ire_type = ire1->ire_type;	/* Doesn't matter */
			nexthop = ire1->ire_gateway_addr;
		}

		/* If IPMP meta or under, then we just drop */
		if (ill1->ill_grp != NULL) {
			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - IPMP",
			    mp, ill1);
			ill_refrele(ill1);
			error = ENETUNREACH;
			continue;
		}

		nce1 = arp_nce_init(ill1, nexthop, ire_type);
		if (nce1 == NULL) {
			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - no nce",
			    mp, ill1);
			ill_refrele(ill1);
			error = ENETUNREACH;
			continue;
		}
		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
			nce_refrele(nce1);
			ill_refrele(ill1);
			error = ENOBUFS;
			continue;
		}
		/* Preserve HW checksum for this copy */
		DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
		DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
		DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
		DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
		DB_LSOMSS(mp1) = DB_LSOMSS(mp);

		ire1->ire_ob_pkt_count++;
		err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
		    0, ixacookie);
		if (err == 0)
			num_sent++;
		else
			error = err;
		/* Release the per-iteration nce and ill holds */
		nce_refrele(nce1);
		ill_refrele(ill1);
	}
	irb_refrele(irb);
	ire_refrele(ire);
	/* Finally, the main one */
	err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
	    ixacookie);
	if (err == 0)
		num_sent++;
	else
		error = err;
	/* Success if at least one copy made it out; see block comment */
	if (num_sent > 0)
		return (0);
	else
		return (error);
}

/*
 * Verify local connectivity. This check is called by ULP fusion code.
 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
 * the interface is brought down and back up. So we simply fail the local
 * process. The caller, TCP Fusion, should unfuse the connection.
 */
boolean_t
ip_output_verify_local(ip_xmit_attr_t *ixa)
{
	ire_t		*ire = ixa->ixa_ire;

	if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
		return (B_FALSE);

	/* Still fused only if the cached ire generation is current */
	return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
}

/*
 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
 *
 * The caller must call ip_output_verify_local() first. This function handles
 * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
 */
mblk_t *
ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
    boolean_t hooks_in, conn_t *peer_connp)
{
	ill_t		*ill = ixa->ixa_ire->ire_ill;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ip_recv_attr_t	iras;
	int		error;

	ASSERT(mp != NULL);

	if (ixaflags & IXAF_IS_IPV4) {
		ipha = (ipha_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet.
We already
		 * have those handy.
		 */
		if (ipst->ips_ip4_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT */
		if (hooks_out) {
			DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_out_event,
			    ipst->ips_ipv4firewall_loopback_out,
			    NULL, ill, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
		}
		/* The hook may have consumed the packet */
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN */
		if (hooks_in) {
			DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_in_event,
			    ipst->ips_ipv4firewall_loopback_in,
			    ill, NULL, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/*
			 * Map ixa to ira including IPsec policies.
			 * NOTE(review): iras is not bzeroed here; presumably
			 * ipsec_out_to_in() initializes every field the
			 * policy check reads, and any IPsec references are
			 * released by ipsec_check_inbound_policy() - confirm.
			 */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
			    NULL, &iras);
		}
	} else {
		ip6h = (ip6_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip6_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT */
		if (hooks_out) {
			DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
			    ipst->ips_ipv6firewall_loopback_out,
			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
		}
		/* The hook may have consumed the packet */
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN */
		if (hooks_in) {
			DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
			    ipst->ips_ipv6firewall_loopback_in,
			    ill, NULL, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/* Map ixa to ira including IPsec policies. */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
			    ip6h, &iras);
		}
	}

	/* The policy check (or a hook above) dropped the packet */
	if (mp == NULL) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards", NULL, ill);
	}

	return (mp);
}