1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2018 Joyent, Inc. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsubr.h> 31 #include <sys/dlpi.h> 32 #include <sys/strsun.h> 33 #include <sys/zone.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/debug.h> 38 #include <sys/atomic.h> 39 40 #include <sys/systm.h> 41 #include <sys/param.h> 42 #include <sys/kmem.h> 43 #include <sys/sdt.h> 44 #include <sys/socket.h> 45 #include <sys/mac.h> 46 #include <net/if.h> 47 #include <net/if_arp.h> 48 #include <net/route.h> 49 #include <sys/sockio.h> 50 #include <netinet/in.h> 51 #include <net/if_dl.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/nd.h> 57 #include <inet/arp.h> 58 #include <inet/snmpcom.h> 59 #include <inet/kstatcom.h> 60 61 #include <netinet/igmp_var.h> 62 #include <netinet/ip6.h> 63 #include <netinet/icmp6.h> 64 #include <netinet/sctp.h> 65 66 #include <inet/ip.h> 67 #include <inet/ip_impl.h> 68 #include <inet/ip6.h> 69 #include <inet/ip6_asp.h> 70 #include <inet/tcp.h> 71 #include <inet/ip_multi.h> 72 #include <inet/ip_if.h> 73 #include <inet/ip_ire.h> 74 #include <inet/ip_ftable.h> 75 #include <inet/ip_rts.h> 76 #include <inet/optcom.h> 77 #include <inet/ip_ndp.h> 78 #include <inet/ip_listutils.h> 79 #include <netinet/igmp.h> 80 #include <netinet/ip_mroute.h> 81 #include <inet/ipp_common.h> 82 83 #include <net/pfkeyv2.h> 84 #include <inet/sadb.h> 85 #include <inet/ipsec_impl.h> 86 #include <inet/ipdrop.h> 87 #include <inet/ip_netinfo.h> 88 89 #include <sys/pattr.h> 90 #include <inet/ipclassifier.h> 91 #include <inet/sctp_ip.h> 92 #include <inet/sctp/sctp_impl.h> 93 #include <inet/udp_impl.h> 94 #include <sys/sunddi.h> 95 96 #include <sys/tsol/label.h> 97 #include <sys/tsol/tnet.h> 98 99 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 100 101 #ifdef DEBUG 102 extern boolean_t skip_sctp_cksum; 103 #endif 104 105 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 106 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 107 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); 108 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 109 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 110 111 /* 112 * There are two types of output functions for IP used for different 113 * purposes: 114 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 115 * is no context in the form of a conn_t. However, there is a 116 * ip_xmit_attr_t that the callers use to influence interface selection 117 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 118 * 119 * - conn_ip_output() is used when sending packets with a conn_t and 120 * ip_set_destination has been called to cache information. In that case 121 * various socket options are recorded in the ip_xmit_attr_t and should 122 * be taken into account. 123 */ 124 125 /* 126 * The caller *must* have called conn_connect() or ip_attr_connect() 127 * before calling conn_ip_output(). The caller needs to redo that each time 128 * the destination IP address or port changes, as well as each time there is 129 * a change to any socket option that would modify how packets are routed out 130 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). 131 * 132 * The ULP caller has to serialize the use of a single ip_xmit_attr_t. 133 * We assert for that here. 134 */ 135 int 136 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa) 137 { 138 iaflags_t ixaflags = ixa->ixa_flags; 139 ire_t *ire; 140 nce_t *nce; 141 dce_t *dce; 142 ill_t *ill; 143 ip_stack_t *ipst = ixa->ixa_ipst; 144 int error; 145 146 /* We defer ipIfStatsHCOutRequests until an error or we have an ill */ 147 148 ASSERT(ixa->ixa_ire != NULL); 149 /* Note there is no ixa_nce when reject and blackhole routes */ 150 ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */ 151 152 #ifdef DEBUG 153 ASSERT(ixa->ixa_curthread == NULL); 154 ixa->ixa_curthread = curthread; 155 #endif 156 157 /* 158 * Even on labeled systems we can have a NULL ixa_tsl e.g., 159 * for IGMP/MLD traffic. 160 */ 161 162 ire = ixa->ixa_ire; 163 164 /* 165 * If the ULP says the (old) IRE resulted in reachability we 166 * record this before determine whether to use a new IRE. 167 * No locking for performance reasons. 168 */ 169 if (ixaflags & IXAF_REACH_CONF) 170 ire->ire_badcnt = 0; 171 172 /* 173 * Has routing changed since we cached the results of the lookup? 174 * 175 * This check captures all of: 176 * - the cached ire being deleted (by means of the special 177 * IRE_GENERATION_CONDEMNED) 178 * - A potentially better ire being added (ire_generation being 179 * increased) 180 * - A deletion of the nexthop ire that was used when we did the 181 * lookup. 182 * - An addition of a potentially better nexthop ire. 183 * The last two are handled by walking and increasing the generation 184 * number on all dependant IREs in ire_flush_cache(). 185 * 186 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE 187 * since we ensure that each time we set ixa_ire to such an IRE we 188 * make sure the ixa_ire_generation does not match (by using 189 * IRE_GENERATION_VERIFY). 190 */ 191 if (ire->ire_generation != ixa->ixa_ire_generation) { 192 error = ip_verify_ire(mp, ixa); 193 if (error != 0) { 194 ip_drop_output("ipIfStatsOutDiscards - verify ire", 195 mp, NULL); 196 goto drop; 197 } 198 ire = ixa->ixa_ire; 199 ASSERT(ire != NULL); 200 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 201 #ifdef DEBUG 202 ASSERT(ixa->ixa_curthread == curthread); 203 ixa->ixa_curthread = NULL; 204 #endif 205 ire->ire_ob_pkt_count++; 206 /* ixa_dce might be condemned; use default one */ 207 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, 208 &ipst->ips_dce_default->dce_ident)); 209 } 210 /* 211 * If the ncec changed then ip_verify_ire already set 212 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 213 * so we can recheck the interface mtu. 214 */ 215 216 /* 217 * Note that ire->ire_generation could already have changed. 218 * We catch that next time we send a packet. 219 */ 220 } 221 222 /* 223 * No need to lock access to ixa_nce since the ip_xmit_attr usage 224 * is single threaded. 225 */ 226 ASSERT(ixa->ixa_nce != NULL); 227 nce = ixa->ixa_nce; 228 if (nce->nce_is_condemned) { 229 error = ip_verify_nce(mp, ixa); 230 /* 231 * In case ZEROCOPY capability become not available, we 232 * copy the message and free the original one. We might 233 * be copying more data than needed but it doesn't hurt 234 * since such change rarely happens. 235 */ 236 switch (error) { 237 case 0: 238 break; 239 case ENOTSUP: { /* ZEROCOPY */ 240 mblk_t *nmp; 241 242 if ((nmp = copymsg(mp)) != NULL) { 243 freemsg(mp); 244 mp = nmp; 245 246 break; 247 } 248 } 249 /* FALLTHROUGH */ 250 default: 251 ip_drop_output("ipIfStatsOutDiscards - verify nce", 252 mp, NULL); 253 goto drop; 254 } 255 ire = ixa->ixa_ire; 256 ASSERT(ire != NULL); 257 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 258 #ifdef DEBUG 259 ASSERT(ixa->ixa_curthread == curthread); 260 ixa->ixa_curthread = NULL; 261 #endif 262 ire->ire_ob_pkt_count++; 263 /* ixa_dce might be condemned; use default one */ 264 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, 265 ixa, &ipst->ips_dce_default->dce_ident)); 266 } 267 ASSERT(ixa->ixa_nce != NULL); 268 nce = ixa->ixa_nce; 269 270 /* 271 * Note that some other event could already have made 272 * the new nce condemned. We catch that next time we 273 * try to send a packet. 274 */ 275 } 276 /* 277 * If there is no per-destination dce_t then we have a reference to 278 * the default dce_t (which merely contains the dce_ipid). 279 * The generation check captures both the introduction of a 280 * per-destination dce_t (e.g., due to ICMP packet too big) and 281 * any change to the per-destination dce (including it becoming 282 * condemned by use of the special DCE_GENERATION_CONDEMNED). 283 */ 284 dce = ixa->ixa_dce; 285 286 /* 287 * To avoid a periodic timer to increase the path MTU we 288 * look at dce_last_change_time each time we send a packet. 289 */ 290 if (dce->dce_flags & DCEF_PMTU) { 291 int64_t now = LBOLT_FASTPATH64; 292 293 if ((TICK_TO_SEC(now) - dce->dce_last_change_time > 294 ipst->ips_ip_pathmtu_interval)) { 295 /* 296 * Older than 20 minutes. Drop the path MTU information. 297 * Since the path MTU changes as a result of this, 298 * twiddle ixa_dce_generation to make us go through the 299 * dce verification code in conn_ip_output. 300 */ 301 mutex_enter(&dce->dce_lock); 302 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 303 dce->dce_last_change_time = TICK_TO_SEC(now); 304 mutex_exit(&dce->dce_lock); 305 dce_increment_generation(dce); 306 } 307 } 308 309 if (dce->dce_generation != ixa->ixa_dce_generation) { 310 error = ip_verify_dce(mp, ixa); 311 if (error != 0) { 312 ip_drop_output("ipIfStatsOutDiscards - verify dce", 313 mp, NULL); 314 goto drop; 315 } 316 dce = ixa->ixa_dce; 317 318 /* 319 * Note that some other event could already have made the 320 * new dce's generation number change. 321 * We catch that next time we try to send a packet. 322 */ 323 } 324 325 ill = nce->nce_ill; 326 327 /* 328 * An initial ixa_fragsize was set in ip_set_destination 329 * and we update it if any routing changes above. 330 * A change to ill_mtu with ifconfig will increase all dce_generation 331 * so that we will detect that with the generation check. Ditto for 332 * ill_mc_mtu. 333 */ 334 335 /* 336 * Caller needs to make sure IXAF_VERIFY_SRC is not set if 337 * conn_unspec_src. 338 */ 339 if ((ixaflags & IXAF_VERIFY_SOURCE) && 340 ixa->ixa_src_generation != ipst->ips_src_generation) { 341 /* Check if the IP source is still assigned to the host. */ 342 uint_t gen; 343 344 if (!ip_verify_src(mp, ixa, &gen)) { 345 /* Don't send a packet with a source that isn't ours */ 346 error = EADDRNOTAVAIL; 347 ip_drop_output("ipIfStatsOutDiscards - invalid src", 348 mp, NULL); 349 goto drop; 350 } 351 /* The source is still valid - update the generation number */ 352 ixa->ixa_src_generation = gen; 353 } 354 355 /* 356 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 357 * can only count the use prior to fragmentation. However the MIB 358 * counters on the ill will be incremented in post fragmentation. 359 */ 360 ire->ire_ob_pkt_count++; 361 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 362 363 /* 364 * Based on ire_type and ire_flags call one of: 365 * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK 366 * ire_send_multirt_v* - if RTF_MULTIRT 367 * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE 368 * ire_send_multicast_v* - for IRE_MULTICAST 369 * ire_send_broadcast_v4 - for IRE_BROADCAST 370 * ire_send_wire_v* - for the rest. 371 */ 372 #ifdef DEBUG 373 ASSERT(ixa->ixa_curthread == curthread); 374 ixa->ixa_curthread = NULL; 375 #endif 376 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident)); 377 378 drop: 379 if (ixaflags & IXAF_IS_IPV4) { 380 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 381 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 382 } else { 383 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests); 384 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 385 } 386 freemsg(mp); 387 #ifdef DEBUG 388 ASSERT(ixa->ixa_curthread == curthread); 389 ixa->ixa_curthread = NULL; 390 #endif 391 return (error); 392 } 393 394 /* 395 * Handle both IPv4 and IPv6. Sets the generation number 396 * to allow the caller to know when to call us again. 397 * Returns true if the source address in the packet is a valid source. 398 * We handle callers which try to send with a zero address (since we only 399 * get here if UNSPEC_SRC is not set). 400 */ 401 boolean_t 402 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp) 403 { 404 ip_stack_t *ipst = ixa->ixa_ipst; 405 406 /* 407 * Need to grab the generation number before we check to 408 * avoid a race with a change to the set of local addresses. 409 * No lock needed since the thread which updates the set of local 410 * addresses use ipif/ill locks and exit those (hence a store memory 411 * barrier) before doing the atomic increase of ips_src_generation. 412 */ 413 if (generationp != NULL) 414 *generationp = ipst->ips_src_generation; 415 416 if (ixa->ixa_flags & IXAF_IS_IPV4) { 417 ipha_t *ipha = (ipha_t *)mp->b_rptr; 418 419 if (ipha->ipha_src == INADDR_ANY) 420 return (B_FALSE); 421 422 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 423 ipst, B_FALSE) != IPVL_BAD); 424 } else { 425 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 426 uint_t scopeid; 427 428 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) 429 return (B_FALSE); 430 431 if (ixa->ixa_flags & IXAF_SCOPEID_SET) 432 scopeid = ixa->ixa_scopeid; 433 else 434 scopeid = 0; 435 436 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid, 437 ipst, B_FALSE, scopeid) != IPVL_BAD); 438 } 439 } 440 441 /* 442 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use. 443 */ 444 int 445 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa) 446 { 447 uint_t gen; 448 ire_t *ire; 449 nce_t *nce; 450 int error; 451 boolean_t multirt = B_FALSE; 452 453 /* 454 * Redo ip_select_route. 455 * Need to grab generation number as part of the lookup to 456 * avoid race. 457 */ 458 error = 0; 459 ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt); 460 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 461 if (error != 0) { 462 ire_refrele(ire); 463 return (error); 464 } 465 466 if (ixa->ixa_ire != NULL) 467 ire_refrele_notr(ixa->ixa_ire); 468 #ifdef DEBUG 469 ire_refhold_notr(ire); 470 ire_refrele(ire); 471 #endif 472 ixa->ixa_ire = ire; 473 ixa->ixa_ire_generation = gen; 474 if (multirt) { 475 if (ixa->ixa_flags & IXAF_IS_IPV4) 476 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 477 else 478 ixa->ixa_postfragfn = ip_postfrag_multirt_v6; 479 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 480 } else { 481 ixa->ixa_postfragfn = ire->ire_postfragfn; 482 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 483 } 484 485 /* 486 * Don't look for an nce for reject or blackhole. 487 * They have ire_generation set to IRE_GENERATION_VERIFY which 488 * makes conn_ip_output avoid references to ixa_nce. 489 */ 490 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 491 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY); 492 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 493 return (0); 494 } 495 496 /* The NCE could now be different */ 497 nce = ire_to_nce_pkt(ire, mp); 498 if (nce == NULL) { 499 /* 500 * Allocation failure. Make sure we redo ire/nce selection 501 * next time we send. 502 */ 503 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 504 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 505 return (ENOBUFS); 506 } 507 if (nce == ixa->ixa_nce) { 508 /* No change */ 509 nce_refrele(nce); 510 return (0); 511 } 512 513 /* 514 * Since the path MTU might change as a result of this 515 * route change, we twiddle ixa_dce_generation to 516 * make conn_ip_output go through the ip_verify_dce code. 517 */ 518 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 519 520 if (ixa->ixa_nce != NULL) 521 nce_refrele(ixa->ixa_nce); 522 ixa->ixa_nce = nce; 523 return (0); 524 } 525 526 /* 527 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use. 528 */ 529 static int 530 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa) 531 { 532 ire_t *ire = ixa->ixa_ire; 533 nce_t *nce; 534 int error = 0; 535 ipha_t *ipha = NULL; 536 ip6_t *ip6h = NULL; 537 538 if (ire->ire_ipversion == IPV4_VERSION) 539 ipha = (ipha_t *)mp->b_rptr; 540 else 541 ip6h = (ip6_t *)mp->b_rptr; 542 543 nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE); 544 if (nce == NULL) { 545 /* Try to find a better ire */ 546 return (ip_verify_ire(mp, ixa)); 547 } 548 549 /* 550 * The hardware offloading capabilities, for example LSO, of the 551 * interface might have changed, so do sanity verification here. 552 */ 553 if (ixa->ixa_flags & IXAF_VERIFY_LSO) { 554 if (!ip_verify_lso(nce->nce_ill, ixa)) { 555 ASSERT(ixa->ixa_notify != NULL); 556 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 557 IXAN_LSO, 0); 558 error = ENOTSUP; 559 } 560 } 561 562 /* 563 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with 564 * any ZEROCOPY changes. In case ZEROCOPY capability is not available 565 * any more, return error so that conn_ip_output() can take care of 566 * the ZEROCOPY message properly. It's safe to continue send the 567 * message when ZEROCOPY newly become available. 568 */ 569 if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) { 570 if (!ip_verify_zcopy(nce->nce_ill, ixa)) { 571 ASSERT(ixa->ixa_notify != NULL); 572 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 573 IXAN_ZCOPY, 0); 574 if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0) 575 error = ENOTSUP; 576 } 577 } 578 579 /* 580 * Since the path MTU might change as a result of this 581 * change, we twiddle ixa_dce_generation to 582 * make conn_ip_output go through the ip_verify_dce code. 583 */ 584 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 585 586 nce_refrele(ixa->ixa_nce); 587 ixa->ixa_nce = nce; 588 return (error); 589 } 590 591 /* 592 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use. 593 */ 594 static int 595 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa) 596 { 597 dce_t *dce; 598 uint_t gen; 599 uint_t pmtu; 600 601 dce = dce_lookup_pkt(mp, ixa, &gen); 602 ASSERT(dce != NULL); 603 604 dce_refrele_notr(ixa->ixa_dce); 605 #ifdef DEBUG 606 dce_refhold_notr(dce); 607 dce_refrele(dce); 608 #endif 609 ixa->ixa_dce = dce; 610 ixa->ixa_dce_generation = gen; 611 612 /* Extract the (path) mtu from the dce, ncec_ill etc */ 613 pmtu = ip_get_pmtu(ixa); 614 615 /* 616 * Tell ULP about PMTU changes - increase or decrease - by returning 617 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update 618 * both ixa_pmtu and ixa_fragsize appropriately. 619 * 620 * If ULP doesn't set that flag then we need to update ixa_fragsize 621 * since routing could have changed the ill after after ixa_fragsize 622 * was set previously in the conn_ip_output path or in 623 * ip_set_destination. 624 * 625 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu. 626 * 627 * In the case of a path MTU increase we send the packet after the 628 * notify to the ULP. 629 */ 630 if (ixa->ixa_flags & IXAF_VERIFY_PMTU) { 631 if (ixa->ixa_pmtu != pmtu) { 632 uint_t oldmtu = ixa->ixa_pmtu; 633 634 DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu, 635 uint32_t, ixa->ixa_pmtu); 636 ASSERT(ixa->ixa_notify != NULL); 637 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 638 IXAN_PMTU, pmtu); 639 if (pmtu < oldmtu) 640 return (EMSGSIZE); 641 } 642 } else { 643 ixa->ixa_fragsize = pmtu; 644 } 645 return (0); 646 } 647 648 /* 649 * Verify LSO usability. Keep the return value simple to indicate whether 650 * the LSO capability has changed. Handle both IPv4 and IPv6. 651 */ 652 static boolean_t 653 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) 654 { 655 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 656 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab; 657 658 if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 659 /* 660 * Not unsable any more. 661 */ 662 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 663 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 664 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 665 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 666 !ILL_LSO_TCP_IPV4_USABLE(ill) : 667 !ILL_LSO_TCP_IPV6_USABLE(ill))) { 668 ixa->ixa_flags &= ~IXAF_LSO_CAPAB; 669 670 return (B_FALSE); 671 } 672 673 /* 674 * Capability has changed, refresh the copy in ixa. 675 */ 676 if (lsoc->ill_lso_max_tcpv4 != new_lsoc->ill_lso_max_tcpv4 || 677 lsoc->ill_lso_max_tcpv6 != new_lsoc->ill_lso_max_tcpv6) { 678 *lsoc = *new_lsoc; 679 680 return (B_FALSE); 681 } 682 } else { /* Was not usable */ 683 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 684 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 685 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 686 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 687 ILL_LSO_TCP_IPV4_USABLE(ill) : 688 ILL_LSO_TCP_IPV6_USABLE(ill))) { 689 *lsoc = *new_lsoc; 690 ixa->ixa_flags |= IXAF_LSO_CAPAB; 691 692 return (B_FALSE); 693 } 694 } 695 696 return (B_TRUE); 697 } 698 699 /* 700 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether 701 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6. 702 */ 703 static boolean_t 704 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa) 705 { 706 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) { 707 /* 708 * Not unsable any more. 709 */ 710 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 711 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 712 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 713 !ILL_ZCOPY_USABLE(ill)) { 714 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; 715 716 return (B_FALSE); 717 } 718 } else { /* Was not usable */ 719 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 720 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 721 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 722 ILL_ZCOPY_USABLE(ill)) { 723 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; 724 725 return (B_FALSE); 726 } 727 } 728 729 return (B_TRUE); 730 } 731 732 733 /* 734 * When there is no conn_t context, this will send a packet. 735 * The caller must *not* have called conn_connect() or ip_attr_connect() 736 * before calling ip_output_simple(). 737 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH. 738 * Honors IXAF_SET_SOURCE. 739 * 740 * We acquire the ire and after calling ire_sendfn we release 741 * the hold on the ire. Ditto for the nce and dce. 742 * 743 * This assumes that the caller has set the following in ip_xmit_attr_t: 744 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. 745 * If ixa_ifindex is non-zero it means send out that ill. (If it is 746 * an upper IPMP ill we load balance across the group; if a lower we send 747 * on that lower ill without load balancing.) 748 * IXAF_IS_IPV4 must be set correctly. 749 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. 750 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. 751 * If neither of those two are set we do an IPsec policy lookup. 752 * 753 * We handle setting things like 754 * ixa_pktlen 755 * ixa_ip_hdr_length 756 * ixa->ixa_protocol 757 * 758 * The caller may set ixa_xmit_hint, which is used for ECMP selection and 759 * transmit ring selecting in GLD. 760 * 761 * The caller must do an ixa_cleanup() to release any IPsec references 762 * after we return. 763 */ 764 int 765 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) 766 { 767 ts_label_t *effective_tsl = NULL; 768 int err; 769 770 ASSERT(ixa->ixa_ipst != NULL); 771 772 if (is_system_labeled()) { 773 ip_stack_t *ipst = ixa->ixa_ipst; 774 775 if (ixa->ixa_flags & IXAF_IS_IPV4) { 776 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, 777 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 778 &effective_tsl); 779 } else { 780 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, 781 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 782 &effective_tsl); 783 } 784 if (err != 0) { 785 ip2dbg(("tsol_check: label check failed (%d)\n", err)); 786 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 787 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 788 ip_drop_output("tsol_check_label", mp, NULL); 789 freemsg(mp); 790 return (err); 791 } 792 if (effective_tsl != NULL) { 793 /* Update the label */ 794 ip_xmit_attr_replace_tsl(ixa, effective_tsl); 795 } 796 } 797 798 if (ixa->ixa_flags & IXAF_IS_IPV4) 799 return (ip_output_simple_v4(mp, ixa)); 800 else 801 return (ip_output_simple_v6(mp, ixa)); 802 } 803 804 int 805 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) 806 { 807 ipha_t *ipha; 808 ipaddr_t firsthop; /* In IP header */ 809 ipaddr_t dst; /* End of source route, or ipha_dst if none */ 810 ire_t *ire; 811 ipaddr_t setsrc; /* RTF_SETSRC */ 812 int error; 813 ill_t *ill = NULL; 814 dce_t *dce = NULL; 815 nce_t *nce; 816 iaflags_t ixaflags = ixa->ixa_flags; 817 ip_stack_t *ipst = ixa->ixa_ipst; 818 boolean_t repeat = B_FALSE; 819 boolean_t multirt = B_FALSE; 820 int64_t now; 821 822 ipha = (ipha_t *)mp->b_rptr; 823 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 824 825 /* 826 * Even on labeled systems we can have a NULL ixa_tsl e.g., 827 * for IGMP/MLD traffic. 828 */ 829 830 /* Caller already set flags */ 831 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 832 833 ASSERT(ixa->ixa_nce == NULL); 834 835 ixa->ixa_pktlen = ntohs(ipha->ipha_length); 836 ASSERT(ixa->ixa_pktlen == msgdsize(mp)); 837 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 838 ixa->ixa_protocol = ipha->ipha_protocol; 839 840 /* 841 * Assumes that source routed packets have already been massaged by 842 * the ULP (ip_massage_options) and as a result ipha_dst is the next 843 * hop in the source route. The final destination is used for IPsec 844 * policy and DCE lookup. 845 */ 846 firsthop = ipha->ipha_dst; 847 dst = ip_get_dst(ipha); 848 849 repeat_ire: 850 error = 0; 851 setsrc = INADDR_ANY; 852 ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL, 853 &setsrc, &error, &multirt); 854 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 855 if (error != 0) { 856 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 857 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 858 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); 859 freemsg(mp); 860 goto done; 861 } 862 863 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { 864 /* ire_ill might be NULL hence need to skip some code */ 865 if (ixaflags & IXAF_SET_SOURCE) 866 ipha->ipha_src = htonl(INADDR_LOOPBACK); 867 ixa->ixa_fragsize = IP_MAXPACKET; 868 ill = NULL; 869 nce = NULL; 870 ire->ire_ob_pkt_count++; 871 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 872 /* No dce yet; use default one */ 873 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, 874 &ipst->ips_dce_default->dce_ident); 875 goto done; 876 } 877 878 /* Note that ipha_dst is only used for IRE_MULTICAST */ 879 nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 880 if (nce == NULL) { 881 /* Allocation failure? */ 882 ip_drop_output("ire_to_nce", mp, ill); 883 freemsg(mp); 884 error = ENOBUFS; 885 goto done; 886 } 887 if (nce->nce_is_condemned) { 888 nce_t *nce1; 889 890 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); 891 nce_refrele(nce); 892 if (nce1 == NULL) { 893 if (!repeat) { 894 /* Try finding a better IRE */ 895 repeat = B_TRUE; 896 ire_refrele(ire); 897 goto repeat_ire; 898 } 899 /* Tried twice - drop packet */ 900 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 901 ip_drop_output("No nce", mp, ill); 902 freemsg(mp); 903 error = ENOBUFS; 904 goto done; 905 } 906 nce = nce1; 907 } 908 909 /* 910 * For multicast with multirt we have a flag passed back from 911 * ire_lookup_multi_ill_v4 since we don't have an IRE for each 912 * possible multicast address. 913 * We also need a flag for multicast since we can't check 914 * whether RTF_MULTIRT is set in ixa_ire for multicast. 915 */ 916 if (multirt) { 917 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 918 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 919 } else { 920 ixa->ixa_postfragfn = ire->ire_postfragfn; 921 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 922 } 923 ASSERT(ixa->ixa_nce == NULL); 924 ixa->ixa_nce = nce; 925 926 /* 927 * Check for a dce_t with a path mtu. 928 */ 929 dce = dce_lookup_v4(dst, ipst, NULL); 930 ASSERT(dce != NULL); 931 932 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { 933 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 934 } else if (dce->dce_flags & DCEF_PMTU) { 935 /* 936 * To avoid a periodic timer to increase the path MTU we 937 * look at dce_last_change_time each time we send a packet. 938 */ 939 now = ddi_get_lbolt64(); 940 if (TICK_TO_SEC(now) - dce->dce_last_change_time > 941 ipst->ips_ip_pathmtu_interval) { 942 /* 943 * Older than 20 minutes. Drop the path MTU information. 944 */ 945 mutex_enter(&dce->dce_lock); 946 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 947 dce->dce_last_change_time = TICK_TO_SEC(now); 948 mutex_exit(&dce->dce_lock); 949 dce_increment_generation(dce); 950 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 951 } else { 952 uint_t fragsize; 953 954 fragsize = ip_get_base_mtu(nce->nce_ill, ire); 955 if (fragsize > dce->dce_pmtu) 956 fragsize = dce->dce_pmtu; 957 ixa->ixa_fragsize = fragsize; 958 } 959 } else { 960 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 961 } 962 963 /* 964 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp 965 * interface for source address selection. 966 */ 967 ill = ire_nexthop_ill(ire); 968 969 if (ixaflags & IXAF_SET_SOURCE) { 970 ipaddr_t src; 971 972 /* 973 * We use the final destination to get 974 * correct selection for source routed packets 975 */ 976 977 /* If unreachable we have no ill but need some source */ 978 if (ill == NULL) { 979 src = htonl(INADDR_LOOPBACK); 980 error = 0; 981 } else { 982 error = ip_select_source_v4(ill, setsrc, dst, 983 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, 984 &src, NULL, NULL); 985 } 986 if (error != 0) { 987 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 988 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 989 ip_drop_output("ipIfStatsOutDiscards - no source", 990 mp, ill); 991 freemsg(mp); 992 goto done; 993 } 994 ipha->ipha_src = src; 995 } else if (ixaflags & IXAF_VERIFY_SOURCE) { 996 /* Check if the IP source is assigned to the host. */ 997 if (!ip_verify_src(mp, ixa, NULL)) { 998 /* Don't send a packet with a source that isn't ours */ 999 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1000 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 1001 ip_drop_output("ipIfStatsOutDiscards - invalid source", 1002 mp, ill); 1003 freemsg(mp); 1004 error = EADDRNOTAVAIL; 1005 goto done; 1006 } 1007 } 1008 1009 1010 /* 1011 * Check against global IPsec policy to set the AH/ESP attributes. 1012 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 1013 */ 1014 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1015 ASSERT(ixa->ixa_ipsec_policy == NULL); 1016 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); 1017 if (mp == NULL) { 1018 /* MIB and ip_drop_packet already done */ 1019 return (EHOSTUNREACH); /* IPsec policy failure */ 1020 } 1021 } 1022 1023 if (ill != NULL) { 1024 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 1025 } else { 1026 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1027 } 1028 1029 /* 1030 * We update the statistics on the most specific IRE i.e., the first 1031 * one we found. 1032 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 1033 * can only count the use prior to fragmentation. However the MIB 1034 * counters on the ill will be incremented in post fragmentation. 1035 */ 1036 ire->ire_ob_pkt_count++; 1037 1038 /* 1039 * Based on ire_type and ire_flags call one of: 1040 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK 1041 * ire_send_multirt_v4 - if RTF_MULTIRT 1042 * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 1043 * ire_send_multicast_v4 - for IRE_MULTICAST 1044 * ire_send_broadcast_v4 - for IRE_BROADCAST 1045 * ire_send_wire_v4 - for the rest. 1046 */ 1047 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); 1048 done: 1049 ire_refrele(ire); 1050 if (dce != NULL) 1051 dce_refrele(dce); 1052 if (ill != NULL) 1053 ill_refrele(ill); 1054 if (ixa->ixa_nce != NULL) 1055 nce_refrele(ixa->ixa_nce); 1056 ixa->ixa_nce = NULL; 1057 return (error); 1058 } 1059 1060 /* 1061 * ire_sendfn() functions. 1062 * These functions use the following xmit_attr: 1063 * - ixa_fragsize - read to determine whether or not to fragment 1064 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec 1065 * - ixa_ipsec_* are used inside IPsec 1066 * - IXAF_SET_SOURCE - replace IP source in broadcast case. 1067 * - IXAF_LOOPBACK_COPY - for multicast and broadcast 1068 */ 1069 1070 1071 /* 1072 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK 1073 * 1074 * The checks for restrict_interzone_loopback are done in ire_route_recursive. 1075 */ 1076 /* ARGSUSED4 */ 1077 int 1078 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1079 ip_xmit_attr_t *ixa, uint32_t *identp) 1080 { 1081 ipha_t *ipha = (ipha_t *)iph_arg; 1082 ip_stack_t *ipst = ixa->ixa_ipst; 1083 ill_t *ill = ire->ire_ill; 1084 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 1085 uint_t pktlen = ixa->ixa_pktlen; 1086 1087 /* 1088 * No fragmentation, no nce, no application of IPsec, 1089 * and no ipha_ident assignment. 1090 * 1091 * Note different order between IP provider and FW_HOOKS than in 1092 * send_wire case. 1093 */ 1094 1095 /* 1096 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1097 * send probe, but not the receive probe. 1098 */ 1099 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1100 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1101 int, 1); 1102 1103 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1104 int error = 0; 1105 1106 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1107 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1108 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1109 ipst->ips_ipv4firewall_loopback_out, 1110 NULL, ill, ipha, mp, mp, 0, ipst, error); 1111 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1112 if (mp == NULL) 1113 return (error); 1114 1115 /* 1116 * Even if the destination was changed by the filter we use the 1117 * forwarding decision that was made based on the address 1118 * in ip_output/ip_set_destination. 1119 */ 1120 /* Length could be different */ 1121 ipha = (ipha_t *)mp->b_rptr; 1122 pktlen = ntohs(ipha->ipha_length); 1123 } 1124 1125 /* 1126 * If a callback is enabled then we need to know the 1127 * source and destination zoneids for the packet. We already 1128 * have those handy. 1129 */ 1130 if (ipst->ips_ip4_observe.he_interested) { 1131 zoneid_t szone, dzone; 1132 zoneid_t stackzoneid; 1133 1134 stackzoneid = netstackid_to_zoneid( 1135 ipst->ips_netstack->netstack_stackid); 1136 1137 if (stackzoneid == GLOBAL_ZONEID) { 1138 /* Shared-IP zone */ 1139 dzone = ire->ire_zoneid; 1140 szone = ixa->ixa_zoneid; 1141 } else { 1142 szone = dzone = stackzoneid; 1143 } 1144 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1145 } 1146 1147 /* Handle lo0 stats */ 1148 ipst->ips_loopback_packets++; 1149 1150 /* Map ixa to ira including IPsec policies */ 1151 ipsec_out_to_in(ixa, ill, &iras); 1152 iras.ira_pktlen = pktlen; 1153 1154 if (!IS_SIMPLE_IPH(ipha)) { 1155 ip_output_local_options(ipha, ipst); 1156 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1157 } 1158 1159 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1160 int error = 0; 1161 1162 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1163 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1164 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1165 ipst->ips_ipv4firewall_loopback_in, 1166 ill, NULL, ipha, mp, mp, 0, ipst, error); 1167 1168 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1169 if (mp == NULL) { 1170 ira_cleanup(&iras, B_FALSE); 1171 return (error); 1172 } 1173 /* 1174 * Even if the destination was changed by the filter we use the 1175 * forwarding decision that was made based on the address 1176 * in ip_output/ip_set_destination. 1177 */ 1178 /* Length could be different */ 1179 ipha = (ipha_t *)mp->b_rptr; 1180 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1181 } 1182 1183 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1184 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1185 int, 1); 1186 1187 ire->ire_ib_pkt_count++; 1188 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1189 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1190 1191 /* Destined to ire_zoneid - use that for fanout */ 1192 iras.ira_zoneid = ire->ire_zoneid; 1193 1194 if (is_system_labeled()) { 1195 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1196 1197 /* 1198 * This updates ira_cred, ira_tsl and ira_free_flags based 1199 * on the label. We don't expect this to ever fail for 1200 * loopback packets, so we silently drop the packet should it 1201 * fail. 1202 */ 1203 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1204 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1205 ip_drop_input("tsol_get_pkt_label", mp, ill); 1206 freemsg(mp); 1207 return (0); 1208 } 1209 ASSERT(iras.ira_tsl != NULL); 1210 1211 /* tsol_get_pkt_label sometimes does pullupmsg */ 1212 ipha = (ipha_t *)mp->b_rptr; 1213 } 1214 1215 ip_fanout_v4(mp, ipha, &iras); 1216 1217 /* We moved any IPsec refs from ixa to iras */ 1218 ira_cleanup(&iras, B_FALSE); 1219 return (0); 1220 } 1221 1222 /* 1223 * ire_sendfn for IRE_BROADCAST 1224 * If the broadcast address is present on multiple ills and ixa_ifindex 1225 * isn't set, then we generate 1226 * a separate datagram (potentially with different source address) for 1227 * those ills. In any case, only one copy is looped back to ip_input_v4. 1228 */ 1229 int 1230 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1231 ip_xmit_attr_t *ixa, uint32_t *identp) 1232 { 1233 ipha_t *ipha = (ipha_t *)iph_arg; 1234 ip_stack_t *ipst = ixa->ixa_ipst; 1235 irb_t *irb = ire->ire_bucket; 1236 ire_t *ire1; 1237 mblk_t *mp1; 1238 ipha_t *ipha1; 1239 iaflags_t ixaflags = ixa->ixa_flags; 1240 nce_t *nce1, *nce_orig; 1241 1242 /* 1243 * Unless ire_send_multirt_v4 already set a ttl, force the 1244 * ttl to a smallish value. 1245 */ 1246 if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { 1247 /* 1248 * To avoid broadcast storms, we usually set the TTL to 1 for 1249 * broadcasts. This can 1250 * be overridden stack-wide through the ip_broadcast_ttl 1251 * ndd tunable, or on a per-connection basis through the 1252 * IP_BROADCAST_TTL socket option. 1253 * 1254 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 1255 * will force ttl to one after we've set this. 1256 */ 1257 if (ixaflags & IXAF_BROADCAST_TTL_SET) 1258 ipha->ipha_ttl = ixa->ixa_broadcast_ttl; 1259 else 1260 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 1261 } 1262 /* 1263 * Make sure we get a loopback copy (after IPsec and frag) 1264 * Skip hardware checksum so that loopback copy is checksumed. 1265 */ 1266 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1267 1268 /* Do we need to potentially generate multiple copies? */ 1269 if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) 1270 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1271 1272 /* 1273 * Loop over all IRE_BROADCAST in the bucket (might only be one). 1274 * Note that everything in the bucket has the same destination address. 1275 */ 1276 irb_refhold(irb); 1277 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1278 /* We do the main IRE after the end of the loop */ 1279 if (ire1 == ire) 1280 continue; 1281 1282 /* 1283 * Only IREs for the same IP address should be in the same 1284 * bucket. 1285 * But could have IRE_HOSTs in the case of CGTP. 1286 * If we find any multirt routes we bail out of the loop 1287 * and just do the single packet at the end; ip_postfrag_multirt 1288 * will duplicate the packet. 1289 */ 1290 ASSERT(ire1->ire_addr == ire->ire_addr); 1291 if (!(ire1->ire_type & IRE_BROADCAST)) 1292 continue; 1293 1294 if (IRE_IS_CONDEMNED(ire1)) 1295 continue; 1296 1297 if (ixa->ixa_zoneid != ALL_ZONES && 1298 ire->ire_zoneid != ire1->ire_zoneid) 1299 continue; 1300 1301 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); 1302 1303 if (ire1->ire_flags & RTF_MULTIRT) 1304 break; 1305 1306 /* 1307 * For IPMP we only send for the ipmp_ill. arp_nce_init() will 1308 * ensure that this goes out on the cast_ill. 1309 */ 1310 if (IS_UNDER_IPMP(ire1->ire_ill)) 1311 continue; 1312 1313 mp1 = copymsg(mp); 1314 if (mp1 == NULL) { 1315 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1316 ipIfStatsOutDiscards); 1317 ip_drop_output("ipIfStatsOutDiscards", 1318 mp, ire1->ire_ill); 1319 continue; 1320 } 1321 1322 ipha1 = (ipha_t *)mp1->b_rptr; 1323 if (ixa->ixa_flags & IXAF_SET_SOURCE) { 1324 /* 1325 * Need to pick a different source address for each 1326 * interface. If we have a global IPsec policy and 1327 * no per-socket policy then we punt to 1328 * ip_output_simple_v4 using a separate ip_xmit_attr_t. 1329 */ 1330 if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { 1331 ip_output_simple_broadcast(ixa, mp1); 1332 continue; 1333 } 1334 /* Pick a new source address for each interface */ 1335 if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, 1336 ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, 1337 &ipha1->ipha_src, NULL, NULL) != 0) { 1338 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1339 ipIfStatsOutDiscards); 1340 ip_drop_output("ipIfStatsOutDiscards - select " 1341 "broadcast source", mp1, ire1->ire_ill); 1342 freemsg(mp1); 1343 continue; 1344 } 1345 /* 1346 * Check against global IPsec policy to set the AH/ESP 1347 * attributes. IPsec will set IXAF_IPSEC_* and 1348 * ixa_ipsec_* as appropriate. 1349 */ 1350 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1351 ASSERT(ixa->ixa_ipsec_policy == NULL); 1352 mp1 = ip_output_attach_policy(mp1, ipha, NULL, 1353 NULL, ixa); 1354 if (mp1 == NULL) { 1355 /* 1356 * MIB and ip_drop_packet already 1357 * done 1358 */ 1359 continue; 1360 } 1361 } 1362 } 1363 /* Make sure we have an NCE on this ill */ 1364 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, 1365 ire1->ire_type); 1366 if (nce1 == NULL) { 1367 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1368 ipIfStatsOutDiscards); 1369 ip_drop_output("ipIfStatsOutDiscards - broadcast nce", 1370 mp1, ire1->ire_ill); 1371 freemsg(mp1); 1372 continue; 1373 } 1374 nce_orig = ixa->ixa_nce; 1375 ixa->ixa_nce = nce1; 1376 1377 ire_refhold(ire1); 1378 /* 1379 * Ignore any errors here. We just collect the errno for 1380 * the main ire below 1381 */ 1382 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); 1383 ire_refrele(ire1); 1384 1385 ixa->ixa_nce = nce_orig; 1386 nce_refrele(nce1); 1387 1388 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; 1389 } 1390 irb_refrele(irb); 1391 /* Finally, the main one */ 1392 1393 /* 1394 * For IPMP we only send broadcasts on the ipmp_ill. 1395 */ 1396 if (IS_UNDER_IPMP(ire->ire_ill)) { 1397 freemsg(mp); 1398 return (0); 1399 } 1400 1401 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1402 } 1403 1404 /* 1405 * Send a packet using a different source address and different 1406 * IPsec policy. 1407 */ 1408 static void 1409 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) 1410 { 1411 ip_xmit_attr_t ixas; 1412 1413 bzero(&ixas, sizeof (ixas)); 1414 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 1415 ixas.ixa_zoneid = ixa->ixa_zoneid; 1416 ixas.ixa_ifindex = 0; 1417 ixas.ixa_ipst = ixa->ixa_ipst; 1418 ixas.ixa_cred = ixa->ixa_cred; 1419 ixas.ixa_cpid = ixa->ixa_cpid; 1420 ixas.ixa_tsl = ixa->ixa_tsl; 1421 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1422 1423 (void) ip_output_simple(mp, &ixas); 1424 ixa_cleanup(&ixas); 1425 } 1426 1427 1428 static void 1429 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) 1430 { 1431 ip_stack_t *ipst = ixa->ixa_ipst; 1432 1433 /* Limit the TTL on multirt packets */ 1434 if (ire->ire_type & IRE_MULTICAST) { 1435 if (ipha->ipha_ttl > 1) { 1436 ip2dbg(("ire_send_multirt_v4: forcing multicast " 1437 "multirt TTL to 1 (was %d), dst 0x%08x\n", 1438 ipha->ipha_ttl, ntohl(ire->ire_addr))); 1439 ipha->ipha_ttl = 1; 1440 } 1441 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1442 } else if ((ipst->ips_ip_multirt_ttl > 0) && 1443 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 1444 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 1445 /* 1446 * Need to ensure we don't increase the ttl should we go through 1447 * ire_send_broadcast or multicast. 1448 */ 1449 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1450 } 1451 } 1452 1453 /* 1454 * ire_sendfn for IRE_MULTICAST 1455 */ 1456 int 1457 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1458 ip_xmit_attr_t *ixa, uint32_t *identp) 1459 { 1460 ipha_t *ipha = (ipha_t *)iph_arg; 1461 ip_stack_t *ipst = ixa->ixa_ipst; 1462 ill_t *ill = ire->ire_ill; 1463 iaflags_t ixaflags = ixa->ixa_flags; 1464 1465 /* 1466 * The IRE_MULTICAST is the same whether or not multirt is in use. 1467 * Hence we need special-case code. 1468 */ 1469 if (ixaflags & IXAF_MULTIRT_MULTICAST) 1470 multirt_check_v4(ire, ipha, ixa); 1471 1472 /* 1473 * Check if anything in ip_input_v4 wants a copy of the transmitted 1474 * packet (after IPsec and fragmentation) 1475 * 1476 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set 1477 * RSVP and the rsvp daemon is an example of a 1478 * protocol and user level process that 1479 * handles it's own routing. Hence, it uses the 1480 * SO_DONTROUTE option to accomplish this. 1481 * 2. If the sender has set IP_MULTICAST_LOOP, then we just 1482 * check whether there are any receivers for the group on the ill 1483 * (ignoring the zoneid). 1484 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are 1485 * any members in other shared-IP zones. 1486 * If such members exist, then we indicate that the sending zone 1487 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP 1488 * behavior. 1489 * 1490 * When we loopback we skip hardware checksum to make sure loopback 1491 * copy is checksumed. 1492 * 1493 * Note that ire_ill is the upper in the case of IPMP. 1494 */ 1495 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); 1496 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && 1497 !(ixaflags & IXAF_DONTROUTE)) { 1498 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1499 } else if (ixaflags & IXAF_MULTICAST_LOOP) { 1500 /* 1501 * If this zone or any other zone has members then loopback 1502 * a copy. 1503 */ 1504 if (ill_hasmembers_v4(ill, ipha->ipha_dst)) 1505 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1506 } else if (ipst->ips_netstack->netstack_numzones > 1) { 1507 /* 1508 * This zone should not have a copy. But there are some other 1509 * zones which might have members. 1510 */ 1511 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1512 ixa->ixa_zoneid)) { 1513 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; 1514 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; 1515 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1516 } 1517 } 1518 1519 /* 1520 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, 1521 * force the ttl to the IP_MULTICAST_TTL value 1522 */ 1523 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { 1524 ipha->ipha_ttl = ixa->ixa_multicast_ttl; 1525 } 1526 1527 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1528 } 1529 1530 /* 1531 * ire_sendfn for IREs with RTF_MULTIRT 1532 */ 1533 int 1534 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1535 ip_xmit_attr_t *ixa, uint32_t *identp) 1536 { 1537 ipha_t *ipha = (ipha_t *)iph_arg; 1538 1539 multirt_check_v4(ire, ipha, ixa); 1540 1541 if (ire->ire_type & IRE_MULTICAST) 1542 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); 1543 else if (ire->ire_type & IRE_BROADCAST) 1544 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); 1545 else 1546 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1547 } 1548 1549 /* 1550 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE 1551 */ 1552 int 1553 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1554 ip_xmit_attr_t *ixa, uint32_t *identp) 1555 { 1556 ip_stack_t *ipst = ixa->ixa_ipst; 1557 ipha_t *ipha = (ipha_t *)iph_arg; 1558 ill_t *ill; 1559 ip_recv_attr_t iras; 1560 boolean_t dummy; 1561 1562 /* We assign an IP ident for nice errors */ 1563 ipha->ipha_ident = atomic_inc_32_nv(identp); 1564 1565 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 1566 1567 if (ire->ire_type & IRE_NOROUTE) { 1568 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ 1569 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, 1570 RTA_DST, ipst); 1571 } 1572 1573 if (ire->ire_flags & RTF_BLACKHOLE) { 1574 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); 1575 freemsg(mp); 1576 /* No error even for local senders - silent blackhole */ 1577 return (0); 1578 } 1579 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); 1580 1581 /* 1582 * We need an ill_t for the ip_recv_attr_t even though this packet 1583 * was never received and icmp_unreachable doesn't currently use 1584 * ira_ill. 1585 */ 1586 ill = ill_lookup_on_name("lo0", B_FALSE, 1587 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); 1588 if (ill == NULL) { 1589 freemsg(mp); 1590 return (EHOSTUNREACH); 1591 } 1592 1593 bzero(&iras, sizeof (iras)); 1594 /* Map ixa to ira including IPsec policies */ 1595 ipsec_out_to_in(ixa, ill, &iras); 1596 1597 if (ip_source_routed(ipha, ipst)) { 1598 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); 1599 } else { 1600 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 1601 } 1602 /* We moved any IPsec refs from ixa to iras */ 1603 ira_cleanup(&iras, B_FALSE); 1604 ill_refrele(ill); 1605 return (EHOSTUNREACH); 1606 } 1607 1608 /* 1609 * Calculate a checksum ignoring any hardware capabilities 1610 * 1611 * Returns B_FALSE if the packet was too short for the checksum. Caller 1612 * should free and do stats. 1613 */ 1614 static boolean_t 1615 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa) 1616 { 1617 ip_stack_t *ipst = ixa->ixa_ipst; 1618 uint_t pktlen = ixa->ixa_pktlen; 1619 uint16_t *cksump; 1620 uint32_t cksum; 1621 uint8_t protocol = ixa->ixa_protocol; 1622 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1623 ipaddr_t dst = ipha->ipha_dst; 1624 ipaddr_t src = ipha->ipha_src; 1625 1626 /* Just in case it contained garbage */ 1627 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1628 1629 /* 1630 * Calculate ULP checksum 1631 */ 1632 if (protocol == IPPROTO_TCP) { 1633 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1634 cksum = IP_TCP_CSUM_COMP; 1635 } else if (protocol == IPPROTO_UDP) { 1636 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1637 cksum = IP_UDP_CSUM_COMP; 1638 } else if (protocol == IPPROTO_SCTP) { 1639 sctp_hdr_t *sctph; 1640 1641 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1642 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1643 /* 1644 * Zero out the checksum field to ensure proper 1645 * checksum calculation. 1646 */ 1647 sctph->sh_chksum = 0; 1648 #ifdef DEBUG 1649 if (!skip_sctp_cksum) 1650 #endif 1651 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1652 goto ip_hdr_cksum; 1653 } else { 1654 goto ip_hdr_cksum; 1655 } 1656 1657 /* ULP puts the checksum field is in the first mblk */ 1658 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1659 1660 /* 1661 * We accumulate the pseudo header checksum in cksum. 1662 * This is pretty hairy code, so watch close. One 1663 * thing to keep in mind is that UDP and TCP have 1664 * stored their respective datagram lengths in their 1665 * checksum fields. This lines things up real nice. 1666 */ 1667 cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 1668 1669 cksum = IP_CSUM(mp, ip_hdr_length, cksum); 1670 /* 1671 * For UDP/IPv4 a zero means that the packets wasn't checksummed. 1672 * Change to 0xffff 1673 */ 1674 if (protocol == IPPROTO_UDP && cksum == 0) 1675 *cksump = ~cksum; 1676 else 1677 *cksump = cksum; 1678 1679 IP_STAT(ipst, ip_out_sw_cksum); 1680 IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen); 1681 1682 ip_hdr_cksum: 1683 /* Calculate IPv4 header checksum */ 1684 ipha->ipha_hdr_checksum = 0; 1685 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1686 return (B_TRUE); 1687 } 1688 1689 /* 1690 * Calculate the ULP checksum - try to use hardware. 1691 * In the case of MULTIRT, broadcast or multicast the 1692 * IXAF_NO_HW_CKSUM is set in which case we use software. 1693 * 1694 * If the hardware supports IP header checksum offload; then clear the 1695 * contents of IP header checksum field as expected by NIC. 1696 * Do this only if we offloaded either full or partial sum. 1697 * 1698 * Returns B_FALSE if the packet was too short for the checksum. Caller 1699 * should free and do stats. 1700 */ 1701 static boolean_t 1702 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, 1703 ip_xmit_attr_t *ixa, ill_t *ill) 1704 { 1705 uint_t pktlen = ixa->ixa_pktlen; 1706 uint16_t *cksump; 1707 uint16_t hck_flags; 1708 uint32_t cksum; 1709 uint8_t protocol = ixa->ixa_protocol; 1710 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1711 1712 if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || 1713 !dohwcksum) { 1714 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1715 } 1716 1717 /* 1718 * Calculate ULP checksum. Note that we don't use cksump and cksum 1719 * if the ill has FULL support. 1720 */ 1721 if (protocol == IPPROTO_TCP) { 1722 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1723 cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ 1724 } else if (protocol == IPPROTO_UDP) { 1725 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1726 cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ 1727 } else if (protocol == IPPROTO_SCTP) { 1728 sctp_hdr_t *sctph; 1729 1730 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1731 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1732 /* 1733 * Zero out the checksum field to ensure proper 1734 * checksum calculation. 1735 */ 1736 sctph->sh_chksum = 0; 1737 #ifdef DEBUG 1738 if (!skip_sctp_cksum) 1739 #endif 1740 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1741 goto ip_hdr_cksum; 1742 } else if (protocol == IPPROTO_ICMP) { 1743 /* 1744 * Note that we always calculate a SW checksum for ICMP. In the 1745 * future, if HW support for ICMP is advertised, we can change 1746 * this. 1747 */ 1748 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1749 } else { 1750 ip_hdr_cksum: 1751 /* Calculate IPv4 header checksum */ 1752 ipha->ipha_hdr_checksum = 0; 1753 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1754 return (B_TRUE); 1755 } 1756 1757 /* ULP puts the checksum field is in the first mblk */ 1758 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1759 1760 /* 1761 * Underlying interface supports hardware checksum offload for 1762 * the payload; leave the payload checksum for the hardware to 1763 * calculate. N.B: We only need to set up checksum info on the 1764 * first mblk. 1765 */ 1766 hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; 1767 1768 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1769 if (hck_flags & HCKSUM_INET_FULL_V4) { 1770 /* 1771 * Hardware calculates pseudo-header, header and the 1772 * payload checksums, so clear the checksum field in 1773 * the protocol header. 1774 */ 1775 *cksump = 0; 1776 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; 1777 1778 ipha->ipha_hdr_checksum = 0; 1779 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1780 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1781 } else { 1782 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1783 } 1784 return (B_TRUE); 1785 } 1786 if ((hck_flags) & HCKSUM_INET_PARTIAL) { 1787 ipaddr_t dst = ipha->ipha_dst; 1788 ipaddr_t src = ipha->ipha_src; 1789 /* 1790 * Partial checksum offload has been enabled. Fill 1791 * the checksum field in the protocol header with the 1792 * pseudo-header checksum value. 1793 * 1794 * We accumulate the pseudo header checksum in cksum. 1795 * This is pretty hairy code, so watch close. One 1796 * thing to keep in mind is that UDP and TCP have 1797 * stored their respective datagram lengths in their 1798 * checksum fields. This lines things up real nice. 1799 */ 1800 cksum += (dst >> 16) + (dst & 0xFFFF) + 1801 (src >> 16) + (src & 0xFFFF); 1802 cksum += *(cksump); 1803 cksum = (cksum & 0xFFFF) + (cksum >> 16); 1804 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); 1805 1806 /* 1807 * Offsets are relative to beginning of IP header. 1808 */ 1809 DB_CKSUMSTART(mp) = ip_hdr_length; 1810 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha; 1811 DB_CKSUMEND(mp) = pktlen; 1812 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; 1813 1814 ipha->ipha_hdr_checksum = 0; 1815 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1816 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1817 } else { 1818 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1819 } 1820 return (B_TRUE); 1821 } 1822 /* Hardware capabilities include neither full nor partial IPv4 */ 1823 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1824 } 1825 1826 /* 1827 * ire_sendfn for offlink and onlink destinations. 1828 * Also called from the multicast, broadcast, multirt send functions. 1829 * 1830 * Assumes that the caller has a hold on the ire. 1831 * 1832 * This function doesn't care if the IRE just became condemned since that 1833 * can happen at any time. 1834 */ 1835 /* ARGSUSED */ 1836 int 1837 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1838 ip_xmit_attr_t *ixa, uint32_t *identp) 1839 { 1840 ip_stack_t *ipst = ixa->ixa_ipst; 1841 ipha_t *ipha = (ipha_t *)iph_arg; 1842 iaflags_t ixaflags = ixa->ixa_flags; 1843 ill_t *ill; 1844 1845 ASSERT(ixa->ixa_nce != NULL); 1846 ill = ixa->ixa_nce->nce_ill; 1847 1848 if (ixaflags & IXAF_DONTROUTE) 1849 ipha->ipha_ttl = 1; 1850 1851 /* 1852 * Assign an ident value for this packet. There could be other 1853 * threads targeting the same destination, so we have to arrange 1854 * for a atomic increment. Note that we use a 32-bit atomic add 1855 * because it has better performance than its 16-bit sibling. 1856 * 1857 * Normally ixa_extra_ident is 0, but in the case of LSO it will 1858 * be the number of TCP segments that the driver/hardware will 1859 * extraly construct. 1860 * 1861 * If running in cluster mode and if the source address 1862 * belongs to a replicated service then vector through 1863 * cl_inet_ipident vector to allocate ip identifier 1864 * NOTE: This is a contract private interface with the 1865 * clustering group. 1866 */ 1867 if (cl_inet_ipident != NULL) { 1868 ipaddr_t src = ipha->ipha_src; 1869 ipaddr_t dst = ipha->ipha_dst; 1870 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; 1871 1872 ASSERT(cl_inet_isclusterwide != NULL); 1873 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, 1874 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { 1875 /* 1876 * Note: not correct with LSO since we can't allocate 1877 * ixa_extra_ident+1 consecutive values. 1878 */ 1879 ipha->ipha_ident = (*cl_inet_ipident)(stack_id, 1880 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, 1881 (uint8_t *)(uintptr_t)dst, NULL); 1882 } else { 1883 ipha->ipha_ident = atomic_add_32_nv(identp, 1884 ixa->ixa_extra_ident + 1); 1885 } 1886 } else { 1887 ipha->ipha_ident = atomic_add_32_nv(identp, 1888 ixa->ixa_extra_ident + 1); 1889 } 1890 #ifndef _BIG_ENDIAN 1891 ipha->ipha_ident = htons(ipha->ipha_ident); 1892 #endif 1893 1894 /* 1895 * This might set b_band, thus the IPsec and fragmentation 1896 * code in IP ensures that b_band is updated in the first mblk. 1897 */ 1898 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 1899 /* ip_process translates an IS_UNDER_IPMP */ 1900 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); 1901 if (mp == NULL) { 1902 /* ip_drop_packet and MIB done */ 1903 return (0); /* Might just be delayed */ 1904 } 1905 } 1906 1907 /* 1908 * Verify any IPv4 options. 1909 * 1910 * The presense of IP options also forces the network stack to 1911 * calculate the checksum in software. This is because: 1912 * 1913 * Wrap around: certain partial-checksum NICs (eri, ce) limit 1914 * the size of "start offset" width to 6-bit. This effectively 1915 * sets the largest value of the offset to 64-bytes, starting 1916 * from the MAC header. When the cumulative MAC and IP headers 1917 * exceed such limit, the offset will wrap around. This causes 1918 * the checksum to be calculated at the wrong place. 1919 * 1920 * IPv4 source routing: none of the full-checksum capable NICs 1921 * is capable of correctly handling the IPv4 source-routing 1922 * option for purposes of calculating the pseudo-header; the 1923 * actual destination is different from the destination in the 1924 * header which is that of the next-hop. (This case may not be 1925 * true for NICs which can parse IPv6 extension headers, but 1926 * we choose to simplify the implementation by not offloading 1927 * checksum when they are present.) 1928 */ 1929 if (!IS_SIMPLE_IPH(ipha)) { 1930 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM; 1931 /* An IS_UNDER_IPMP ill is ok here */ 1932 if (ip_output_options(mp, ipha, ixa, ill)) { 1933 /* Packet has been consumed and ICMP error sent */ 1934 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1935 return (EINVAL); 1936 } 1937 } 1938 1939 /* 1940 * To handle IPsec/iptun's labeling needs we need to tag packets 1941 * while we still have ixa_tsl 1942 */ 1943 if (is_system_labeled() && ixa->ixa_tsl != NULL && 1944 (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || 1945 ill->ill_mactype == DL_IPV6)) { 1946 cred_t *newcr; 1947 1948 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, 1949 KM_NOSLEEP); 1950 if (newcr == NULL) { 1951 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1952 ip_drop_output("ipIfStatsOutDiscards - newcr", 1953 mp, ill); 1954 freemsg(mp); 1955 return (ENOBUFS); 1956 } 1957 mblk_setcred(mp, newcr, NOPID); 1958 crfree(newcr); /* mblk_setcred did its own crhold */ 1959 } 1960 1961 if (ixa->ixa_pktlen > ixa->ixa_fragsize || 1962 (ixaflags & IXAF_IPSEC_SECURE)) { 1963 uint32_t pktlen; 1964 1965 pktlen = ixa->ixa_pktlen; 1966 if (ixaflags & IXAF_IPSEC_SECURE) 1967 pktlen += ipsec_out_extra_length(ixa); 1968 1969 if (pktlen > IP_MAXPACKET) 1970 return (EMSGSIZE); 1971 1972 if (ixaflags & IXAF_SET_ULP_CKSUM) { 1973 /* 1974 * Compute ULP checksum and IP header checksum 1975 * using software 1976 */ 1977 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) { 1978 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1979 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 1980 freemsg(mp); 1981 return (EINVAL); 1982 } 1983 } else { 1984 /* Calculate IPv4 header checksum */ 1985 ipha->ipha_hdr_checksum = 0; 1986 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1987 } 1988 1989 /* 1990 * If this packet would generate a icmp_frag_needed 1991 * message, we need to handle it before we do the IPsec 1992 * processing. Otherwise, we need to strip the IPsec 1993 * headers before we send up the message to the ULPs 1994 * which becomes messy and difficult. 1995 * 1996 * We check using IXAF_DONTFRAG. The DF bit in the header 1997 * is not inspected - it will be copied to any generated 1998 * fragments. 1999 */ 2000 if ((pktlen > ixa->ixa_fragsize) && 2001 (ixaflags & IXAF_DONTFRAG)) { 2002 /* Generate ICMP and return error */ 2003 ip_recv_attr_t iras; 2004 2005 DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen, 2006 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2007 uint_t, ixa->ixa_pmtu); 2008 2009 bzero(&iras, sizeof (iras)); 2010 /* Map ixa to ira including IPsec policies */ 2011 ipsec_out_to_in(ixa, ill, &iras); 2012 2013 ip_drop_output("ICMP_FRAG_NEEDED", mp, ill); 2014 icmp_frag_needed(mp, ixa->ixa_fragsize, &iras); 2015 /* We moved any IPsec refs from ixa to iras */ 2016 ira_cleanup(&iras, B_FALSE); 2017 return (EMSGSIZE); 2018 } 2019 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen, 2020 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2021 uint_t, ixa->ixa_pmtu); 2022 2023 if (ixaflags & IXAF_IPSEC_SECURE) { 2024 /* 2025 * Pass in sufficient information so that 2026 * IPsec can determine whether to fragment, and 2027 * which function to call after fragmentation. 2028 */ 2029 return (ipsec_out_process(mp, ixa)); 2030 } 2031 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags, 2032 ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, 2033 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, 2034 ixa->ixa_postfragfn, &ixa->ixa_cookie)); 2035 } 2036 if (ixaflags & IXAF_SET_ULP_CKSUM) { 2037 /* Compute ULP checksum and IP header checksum */ 2038 /* An IS_UNDER_IPMP ill is ok here */ 2039 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) { 2040 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2041 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2042 freemsg(mp); 2043 return (EINVAL); 2044 } 2045 } else { 2046 /* Calculate IPv4 header checksum */ 2047 ipha->ipha_hdr_checksum = 0; 2048 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2049 } 2050 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, 2051 ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, 2052 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); 2053 } 2054 2055 /* 2056 * Send mp into ip_input 2057 * Common for IPv4 and IPv6 2058 */ 2059 void 2060 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2061 uint_t pkt_len, zoneid_t nolzid) 2062 { 2063 rtc_t rtc; 2064 ill_t *ill = nce->nce_ill; 2065 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 2066 ncec_t *ncec; 2067 2068 ncec = nce->nce_common; 2069 iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM | 2070 IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; 2071 if (ncec->ncec_flags & NCE_F_BCAST) 2072 iras.ira_flags |= IRAF_L2DST_BROADCAST; 2073 else if (ncec->ncec_flags & NCE_F_MCAST) 2074 iras.ira_flags |= IRAF_L2DST_MULTICAST; 2075 2076 iras.ira_free_flags = 0; 2077 iras.ira_cred = NULL; 2078 iras.ira_cpid = NOPID; 2079 iras.ira_tsl = NULL; 2080 iras.ira_zoneid = ALL_ZONES; 2081 iras.ira_pktlen = pkt_len; 2082 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen); 2083 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 2084 2085 if (ixaflags & IXAF_IS_IPV4) 2086 iras.ira_flags |= IRAF_IS_IPV4; 2087 2088 iras.ira_ill = iras.ira_rill = ill; 2089 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2090 iras.ira_rifindex = iras.ira_ruifindex; 2091 iras.ira_mhip = NULL; 2092 2093 iras.ira_flags |= ixaflags & IAF_MASK; 2094 iras.ira_no_loop_zoneid = nolzid; 2095 2096 /* Broadcast and multicast doesn't care about the squeue */ 2097 iras.ira_sqp = NULL; 2098 2099 rtc.rtc_ire = NULL; 2100 if (ixaflags & IXAF_IS_IPV4) { 2101 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2102 2103 rtc.rtc_ipaddr = INADDR_ANY; 2104 2105 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 2106 if (rtc.rtc_ire != NULL) { 2107 ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 2108 ire_refrele(rtc.rtc_ire); 2109 } 2110 } else { 2111 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2112 2113 rtc.rtc_ip6addr = ipv6_all_zeros; 2114 2115 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); 2116 if (rtc.rtc_ire != NULL) { 2117 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); 2118 ire_refrele(rtc.rtc_ire); 2119 } 2120 } 2121 /* Any references to clean up? No hold on ira */ 2122 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 2123 ira_cleanup(&iras, B_FALSE); 2124 } 2125 2126 /* 2127 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which 2128 * looks at the IXAF_LOOPBACK_COPY flag. 2129 * Common for IPv4 and IPv6. 2130 * 2131 * If the loopback copy fails (due to no memory) but we send the packet out 2132 * on the wire we return no failure. Only in the case we supress the wire 2133 * sending do we take the loopback failure into account. 2134 * 2135 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. 2136 * Those operations are performed on this packet in ip_xmit() and it would 2137 * be odd to do it twice for the same packet. 2138 */ 2139 int 2140 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2141 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2142 uintptr_t *ixacookie) 2143 { 2144 ill_t *ill = nce->nce_ill; 2145 int error = 0; 2146 2147 /* 2148 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2149 * had looped it back 2150 */ 2151 if (ixaflags & IXAF_LOOPBACK_COPY) { 2152 mblk_t *mp1; 2153 2154 mp1 = copymsg(mp); 2155 if (mp1 == NULL) { 2156 /* Failed to deliver the loopback copy. */ 2157 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2158 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2159 error = ENOBUFS; 2160 } else { 2161 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2162 nolzid); 2163 } 2164 } 2165 2166 /* 2167 * If TTL = 0 then only do the loopback to this host i.e. we are 2168 * done. We are also done if this was the 2169 * loopback interface since it is sufficient 2170 * to loopback one copy of a multicast packet. 2171 */ 2172 if (ixaflags & IXAF_IS_IPV4) { 2173 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2174 2175 if (ipha->ipha_ttl == 0) { 2176 ip_drop_output("multicast ipha_ttl not sent to wire", 2177 mp, ill); 2178 freemsg(mp); 2179 return (error); 2180 } 2181 } else { 2182 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2183 2184 if (ip6h->ip6_hops == 0) { 2185 ip_drop_output("multicast ipha_ttl not sent to wire", 2186 mp, ill); 2187 freemsg(mp); 2188 return (error); 2189 } 2190 } 2191 if (nce->nce_ill->ill_wq == NULL) { 2192 /* Loopback interface */ 2193 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2194 freemsg(mp); 2195 return (error); 2196 } 2197 2198 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2199 ixacookie)); 2200 } 2201 2202 /* 2203 * Post fragmentation function for RTF_MULTIRT routes. 2204 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2205 * checks IXAF_LOOPBACK_COPY. 2206 * 2207 * If no packet is sent due to failures then we return an errno, but if at 2208 * least one succeeded we return zero. 2209 */ 2210 int 2211 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2212 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2213 uintptr_t *ixacookie) 2214 { 2215 irb_t *irb; 2216 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2217 ire_t *ire; 2218 ire_t *ire1; 2219 mblk_t *mp1; 2220 nce_t *nce1; 2221 ill_t *ill = nce->nce_ill; 2222 ill_t *ill1; 2223 ip_stack_t *ipst = ill->ill_ipst; 2224 int error = 0; 2225 int num_sent = 0; 2226 int err; 2227 uint_t ire_type; 2228 ipaddr_t nexthop; 2229 2230 ASSERT(ixaflags & IXAF_IS_IPV4); 2231 2232 /* Check for IXAF_LOOPBACK_COPY */ 2233 if (ixaflags & IXAF_LOOPBACK_COPY) { 2234 mblk_t *mp1; 2235 2236 mp1 = copymsg(mp); 2237 if (mp1 == NULL) { 2238 /* Failed to deliver the loopback copy. */ 2239 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2240 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2241 error = ENOBUFS; 2242 } else { 2243 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2244 nolzid); 2245 } 2246 } 2247 2248 /* 2249 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send 2250 * a copy to each one. 2251 * Use the nce (nexthop) and ipha_dst to find the ire. 2252 * 2253 * MULTIRT is not designed to work with shared-IP zones thus we don't 2254 * need to pass a zoneid or a label to the IRE lookup. 2255 */ 2256 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { 2257 /* Broadcast and multicast case */ 2258 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, 2259 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 2260 } else { 2261 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); 2262 2263 /* Unicast case */ 2264 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, 2265 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); 2266 } 2267 2268 if (ire == NULL || 2269 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2270 !(ire->ire_flags & RTF_MULTIRT)) { 2271 /* Drop */ 2272 ip_drop_output("ip_postfrag_multirt didn't find route", 2273 mp, nce->nce_ill); 2274 if (ire != NULL) 2275 ire_refrele(ire); 2276 return (ENETUNREACH); 2277 } 2278 2279 irb = ire->ire_bucket; 2280 irb_refhold(irb); 2281 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 2282 /* 2283 * For broadcast we can have a mixture of IRE_BROADCAST and 2284 * IRE_HOST due to the manually added IRE_HOSTs that are used 2285 * to trigger the creation of the special CGTP broadcast routes. 2286 * Thus we have to skip if ire_type doesn't match the original. 2287 */ 2288 if (IRE_IS_CONDEMNED(ire1) || 2289 !(ire1->ire_flags & RTF_MULTIRT) || 2290 ire1->ire_type != ire->ire_type) 2291 continue; 2292 2293 /* Do the ire argument one after the loop */ 2294 if (ire1 == ire) 2295 continue; 2296 2297 ill1 = ire_nexthop_ill(ire1); 2298 if (ill1 == NULL) { 2299 /* 2300 * This ire might not have been picked by 2301 * ire_route_recursive, in which case ire_dep might 2302 * not have been setup yet. 2303 * We kick ire_route_recursive to try to resolve 2304 * starting at ire1. 2305 */ 2306 ire_t *ire2; 2307 uint_t match_flags = MATCH_IRE_DSTONLY; 2308 2309 if (ire1->ire_ill != NULL) 2310 match_flags |= MATCH_IRE_ILL; 2311 ire2 = ire_route_recursive_impl_v4(ire1, 2312 ire1->ire_addr, ire1->ire_type, ire1->ire_ill, 2313 ire1->ire_zoneid, NULL, match_flags, 2314 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2315 if (ire2 != NULL) 2316 ire_refrele(ire2); 2317 ill1 = ire_nexthop_ill(ire1); 2318 } 2319 2320 if (ill1 == NULL) { 2321 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2322 ip_drop_output("ipIfStatsOutDiscards - no ill", 2323 mp, ill); 2324 error = ENETUNREACH; 2325 continue; 2326 } 2327 2328 /* Pick the addr and type to use for arp_nce_init */ 2329 if (nce->nce_common->ncec_flags & NCE_F_BCAST) { 2330 ire_type = IRE_BROADCAST; 2331 nexthop = ire1->ire_gateway_addr; 2332 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { 2333 ire_type = IRE_MULTICAST; 2334 nexthop = ipha->ipha_dst; 2335 } else { 2336 ire_type = ire1->ire_type; /* Doesn't matter */ 2337 nexthop = ire1->ire_gateway_addr; 2338 } 2339 2340 /* If IPMP meta or under, then we just drop */ 2341 if (ill1->ill_grp != NULL) { 2342 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2343 ip_drop_output("ipIfStatsOutDiscards - IPMP", 2344 mp, ill1); 2345 ill_refrele(ill1); 2346 error = ENETUNREACH; 2347 continue; 2348 } 2349 2350 nce1 = arp_nce_init(ill1, nexthop, ire_type); 2351 if (nce1 == NULL) { 2352 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2353 ip_drop_output("ipIfStatsOutDiscards - no nce", 2354 mp, ill1); 2355 ill_refrele(ill1); 2356 error = ENETUNREACH; 2357 continue; 2358 } 2359 mp1 = copymsg(mp); 2360 if (mp1 == NULL) { 2361 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2362 ip_drop_output("ipIfStatsOutDiscards", mp, ill1); 2363 nce_refrele(nce1); 2364 ill_refrele(ill1); 2365 error = ENOBUFS; 2366 continue; 2367 } 2368 /* Preserve HW checksum for this copy */ 2369 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 2370 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 2371 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 2372 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 2373 DB_LSOMSS(mp1) = DB_LSOMSS(mp); 2374 2375 ire1->ire_ob_pkt_count++; 2376 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, 2377 0, ixacookie); 2378 if (err == 0) 2379 num_sent++; 2380 else 2381 error = err; 2382 nce_refrele(nce1); 2383 ill_refrele(ill1); 2384 } 2385 irb_refrele(irb); 2386 ire_refrele(ire); 2387 /* Finally, the main one */ 2388 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2389 ixacookie); 2390 if (err == 0) 2391 num_sent++; 2392 else 2393 error = err; 2394 if (num_sent > 0) 2395 return (0); 2396 else 2397 return (error); 2398 } 2399 2400 /* 2401 * Verify local connectivity. This check is called by ULP fusion code. 2402 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if 2403 * the interface is brought down and back up. So we simply fail the local 2404 * process. The caller, TCP Fusion, should unfuse the connection. 2405 */ 2406 boolean_t 2407 ip_output_verify_local(ip_xmit_attr_t *ixa) 2408 { 2409 ire_t *ire = ixa->ixa_ire; 2410 2411 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) 2412 return (B_FALSE); 2413 2414 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); 2415 } 2416 2417 /* 2418 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. 2419 * 2420 * The caller must call ip_output_verify_local() first. This function handles 2421 * IPobs, FW_HOOKS, and/or IPsec cases sequentially. 2422 */ 2423 mblk_t * 2424 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out, 2425 boolean_t hooks_in, conn_t *peer_connp) 2426 { 2427 ill_t *ill = ixa->ixa_ire->ire_ill; 2428 ipha_t *ipha = NULL; 2429 ip6_t *ip6h = NULL; 2430 ip_stack_t *ipst = ixa->ixa_ipst; 2431 iaflags_t ixaflags = ixa->ixa_flags; 2432 ip_recv_attr_t iras; 2433 int error; 2434 2435 ASSERT(mp != NULL); 2436 2437 if (ixaflags & IXAF_IS_IPV4) { 2438 ipha = (ipha_t *)mp->b_rptr; 2439 2440 /* 2441 * If a callback is enabled then we need to know the 2442 * source and destination zoneids for the packet. We already 2443 * have those handy. 2444 */ 2445 if (ipst->ips_ip4_observe.he_interested) { 2446 zoneid_t szone, dzone; 2447 zoneid_t stackzoneid; 2448 2449 stackzoneid = netstackid_to_zoneid( 2450 ipst->ips_netstack->netstack_stackid); 2451 2452 if (stackzoneid == GLOBAL_ZONEID) { 2453 /* Shared-IP zone */ 2454 dzone = ixa->ixa_ire->ire_zoneid; 2455 szone = ixa->ixa_zoneid; 2456 } else { 2457 szone = dzone = stackzoneid; 2458 } 2459 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2460 ipst); 2461 } 2462 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2463 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2464 NULL, int, 1); 2465 2466 /* FW_HOOKS: LOOPBACK_OUT */ 2467 if (hooks_out) { 2468 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 2469 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 2470 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 2471 ipst->ips_ipv4firewall_loopback_out, 2472 NULL, ill, ipha, mp, mp, 0, ipst, error); 2473 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 2474 } 2475 if (mp == NULL) 2476 return (NULL); 2477 2478 /* FW_HOOKS: LOOPBACK_IN */ 2479 if (hooks_in) { 2480 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 2481 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 2482 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 2483 ipst->ips_ipv4firewall_loopback_in, 2484 ill, NULL, ipha, mp, mp, 0, ipst, error); 2485 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 2486 } 2487 if (mp == NULL) 2488 return (NULL); 2489 2490 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2491 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2492 NULL, int, 1); 2493 2494 /* Inbound IPsec polocies */ 2495 if (peer_connp != NULL) { 2496 /* Map ixa to ira including IPsec policies. */ 2497 ipsec_out_to_in(ixa, ill, &iras); 2498 mp = ipsec_check_inbound_policy(mp, peer_connp, ipha, 2499 NULL, &iras); 2500 } 2501 } else { 2502 ip6h = (ip6_t *)mp->b_rptr; 2503 2504 /* 2505 * If a callback is enabled then we need to know the 2506 * source and destination zoneids for the packet. We already 2507 * have those handy. 2508 */ 2509 if (ipst->ips_ip6_observe.he_interested) { 2510 zoneid_t szone, dzone; 2511 zoneid_t stackzoneid; 2512 2513 stackzoneid = netstackid_to_zoneid( 2514 ipst->ips_netstack->netstack_stackid); 2515 2516 if (stackzoneid == GLOBAL_ZONEID) { 2517 /* Shared-IP zone */ 2518 dzone = ixa->ixa_ire->ire_zoneid; 2519 szone = ixa->ixa_zoneid; 2520 } else { 2521 szone = dzone = stackzoneid; 2522 } 2523 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2524 ipst); 2525 } 2526 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2527 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2528 ip6h, int, 1); 2529 2530 /* FW_HOOKS: LOOPBACK_OUT */ 2531 if (hooks_out) { 2532 DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, 2533 ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); 2534 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 2535 ipst->ips_ipv6firewall_loopback_out, 2536 NULL, ill, ip6h, mp, mp, 0, ipst, error); 2537 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); 2538 } 2539 if (mp == NULL) 2540 return (NULL); 2541 2542 /* FW_HOOKS: LOOPBACK_IN */ 2543 if (hooks_in) { 2544 DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, 2545 ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); 2546 FW_HOOKS6(ipst->ips_ip6_loopback_in_event, 2547 ipst->ips_ipv6firewall_loopback_in, 2548 ill, NULL, ip6h, mp, mp, 0, ipst, error); 2549 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); 2550 } 2551 if (mp == NULL) 2552 return (NULL); 2553 2554 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2555 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2556 ip6h, int, 1); 2557 2558 /* Inbound IPsec polocies */ 2559 if (peer_connp != NULL) { 2560 /* Map ixa to ira including IPsec policies. */ 2561 ipsec_out_to_in(ixa, ill, &iras); 2562 mp = ipsec_check_inbound_policy(mp, peer_connp, NULL, 2563 ip6h, &iras); 2564 } 2565 } 2566 2567 if (mp == NULL) { 2568 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2569 ip_drop_input("ipIfStatsInDiscards", NULL, ill); 2570 } 2571 2572 return (mp); 2573 } 2574