1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2018 Joyent, Inc. 25 * Copyright 2024 Oxide Computer Company 26 */ 27 /* Copyright (c) 1990 Mentat Inc. 
*/ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/strsubr.h> 32 #include <sys/dlpi.h> 33 #include <sys/strsun.h> 34 #include <sys/zone.h> 35 #include <sys/ddi.h> 36 #include <sys/sunddi.h> 37 #include <sys/cmn_err.h> 38 #include <sys/debug.h> 39 #include <sys/atomic.h> 40 41 #include <sys/systm.h> 42 #include <sys/param.h> 43 #include <sys/kmem.h> 44 #include <sys/sdt.h> 45 #include <sys/socket.h> 46 #include <sys/mac.h> 47 #include <net/if.h> 48 #include <net/if_arp.h> 49 #include <net/route.h> 50 #include <sys/sockio.h> 51 #include <netinet/in.h> 52 #include <net/if_dl.h> 53 54 #include <inet/common.h> 55 #include <inet/mi.h> 56 #include <inet/mib2.h> 57 #include <inet/nd.h> 58 #include <inet/arp.h> 59 #include <inet/snmpcom.h> 60 #include <inet/kstatcom.h> 61 62 #include <netinet/igmp_var.h> 63 #include <netinet/ip6.h> 64 #include <netinet/icmp6.h> 65 #include <netinet/sctp.h> 66 67 #include <inet/ip.h> 68 #include <inet/ip_impl.h> 69 #include <inet/ip6.h> 70 #include <inet/ip6_asp.h> 71 #include <inet/tcp.h> 72 #include <inet/ip_multi.h> 73 #include <inet/ip_if.h> 74 #include <inet/ip_ire.h> 75 #include <inet/ip_ftable.h> 76 #include <inet/ip_rts.h> 77 #include <inet/optcom.h> 78 #include <inet/ip_ndp.h> 79 #include <inet/ip_listutils.h> 80 #include <netinet/igmp.h> 81 #include <netinet/ip_mroute.h> 82 #include <inet/ipp_common.h> 83 84 #include <net/pfkeyv2.h> 85 #include <inet/sadb.h> 86 #include <inet/ipsec_impl.h> 87 #include <inet/ipdrop.h> 88 #include <inet/ip_netinfo.h> 89 90 #include <sys/pattr.h> 91 #include <inet/ipclassifier.h> 92 #include <inet/sctp_ip.h> 93 #include <inet/sctp/sctp_impl.h> 94 #include <inet/udp_impl.h> 95 #include <sys/sunddi.h> 96 97 #include <sys/tsol/label.h> 98 #include <sys/tsol/tnet.h> 99 100 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 101 102 #ifdef DEBUG 103 extern boolean_t skip_sctp_cksum; 104 #endif 105 106 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 107 static int 
ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 108 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); 109 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 110 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 111 112 /* 113 * There are two types of output functions for IP used for different 114 * purposes: 115 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 116 * is no context in the form of a conn_t. However, there is a 117 * ip_xmit_attr_t that the callers use to influence interface selection 118 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 119 * 120 * - conn_ip_output() is used when sending packets with a conn_t and 121 * ip_set_destination has been called to cache information. In that case 122 * various socket options are recorded in the ip_xmit_attr_t and should 123 * be taken into account. 124 */ 125 126 /* 127 * The caller *must* have called conn_connect() or ip_attr_connect() 128 * before calling conn_ip_output(). The caller needs to redo that each time 129 * the destination IP address or port changes, as well as each time there is 130 * a change to any socket option that would modify how packets are routed out 131 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). 132 * 133 * The ULP caller has to serialize the use of a single ip_xmit_attr_t. 134 * We assert for that here. 
*/
/*
 * Revalidates the cached ire, nce, dce and (optionally) source address in
 * the ip_xmit_attr_t and then hands the packet to the ire's ire_sendfn.
 *
 * Returns the value returned by ire_sendfn, or, when one of the
 * verification steps fails, the errno from that step after the message has
 * been freed and the stack-wide HCOutRequests/OutDiscards counters bumped.
 */
int
conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	iaflags_t	ixaflags = ixa->ixa_flags;
	ire_t		*ire;
	nce_t		*nce;
	dce_t		*dce;
	ill_t		*ill;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	int		error;

	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */

	ASSERT(ixa->ixa_ire != NULL);
	/* Note there is no ixa_nce when reject and blackhole routes */
	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */

#ifdef DEBUG
	/* Record the owning thread so concurrent use of the ixa trips an ASSERT */
	ASSERT(ixa->ixa_curthread == NULL);
	ixa->ixa_curthread = curthread;
#endif

	/*
	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
	 * for IGMP/MLD traffic.
	 */

	ire = ixa->ixa_ire;

	/*
	 * If the ULP says the (old) IRE resulted in reachability we
	 * record this before determining whether to use a new IRE.
	 * No locking for performance reasons.
	 */
	if (ixaflags & IXAF_REACH_CONF)
		ire->ire_badcnt = 0;

	/*
	 * Has routing changed since we cached the results of the lookup?
	 *
	 * This check captures all of:
	 *  - the cached ire being deleted (by means of the special
	 *    IRE_GENERATION_CONDEMNED)
	 *  - A potentially better ire being added (ire_generation being
	 *    increased)
	 *  - A deletion of the nexthop ire that was used when we did the
	 *    lookup.
	 *  - An addition of a potentially better nexthop ire.
	 * The last two are handled by walking and increasing the generation
	 * number on all dependent IREs in ire_flush_cache().
	 *
	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
	 * since we ensure that each time we set ixa_ire to such an IRE we
	 * make sure the ixa_ire_generation does not match (by using
	 * IRE_GENERATION_VERIFY).
	 */
	if (ire->ire_generation != ixa->ixa_ire_generation) {
		error = ip_verify_ire(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify ire",
			    mp, NULL);
			goto drop;
		}
		/* ip_verify_ire may have replaced ixa_ire; reload it */
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
			    &ipst->ips_dce_default->dce_ident));
		}
		/*
		 * If the ncec changed then ip_verify_ire already set
		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		 * so we can recheck the interface mtu.
		 */

		/*
		 * Note that ire->ire_generation could already have changed.
		 * We catch that next time we send a packet.
		 */
	}

	/*
	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
	 * is single threaded.
	 */
	ASSERT(ixa->ixa_nce != NULL);
	nce = ixa->ixa_nce;
	if (nce->nce_is_condemned) {
		error = ip_verify_nce(mp, ixa);
		/*
		 * In case ZEROCOPY capability become not available, we
		 * copy the message and free the original one. We might
		 * be copying more data than needed but it doesn't hurt
		 * since such change rarely happens.
		 */
		switch (error) {
		case 0:
			break;
		case ENOTSUP: { /* ZEROCOPY */
			mblk_t *nmp;

			/* On copy failure we fall through and drop the packet */
			if ((nmp = copymsg(mp)) != NULL) {
				freemsg(mp);
				mp = nmp;

				break;
			}
		}
		/* FALLTHROUGH */
		default:
			ip_drop_output("ipIfStatsOutDiscards - verify nce",
			    mp, NULL);
			goto drop;
		}
		/* ip_verify_nce may have gone through ip_verify_ire; reload */
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
			    ixa, &ipst->ips_dce_default->dce_ident));
		}
		ASSERT(ixa->ixa_nce != NULL);
		nce = ixa->ixa_nce;

		/*
		 * Note that some other event could already have made
		 * the new nce condemned. We catch that next time we
		 * try to send a packet.
		 */
	}
	/*
	 * If there is no per-destination dce_t then we have a reference to
	 * the default dce_t (which merely contains the dce_ipid).
	 * The generation check captures both the introduction of a
	 * per-destination dce_t (e.g., due to ICMP packet too big) and
	 * any change to the per-destination dce (including it becoming
	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
	 */
	dce = ixa->ixa_dce;

	/*
	 * To avoid a periodic timer to increase the path MTU we
	 * look at dce_last_change_time each time we send a packet.
	 */
	if (dce->dce_flags & DCEF_PMTU) {
		int64_t		now = LBOLT_FASTPATH64;

		if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval)) {
			/*
			 * Older than 20 minutes. Drop the path MTU information.
			 * Since the path MTU changes as a result of this,
			 * twiddle ixa_dce_generation to make us go through the
			 * dce verification code in conn_ip_output.
			 */
			mutex_enter(&dce->dce_lock);
			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
			dce->dce_last_change_time = TICK_TO_SEC(now);
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		}
	}

	if (dce->dce_generation != ixa->ixa_dce_generation) {
		error = ip_verify_dce(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify dce",
			    mp, NULL);
			goto drop;
		}
		/* ip_verify_dce replaced ixa_dce; reload it */
		dce = ixa->ixa_dce;

		/*
		 * Note that some other event could already have made the
		 * new dce's generation number change.
		 * We catch that next time we try to send a packet.
		 */
	}

	ill = nce->nce_ill;

	/*
	 * An initial ixa_fragsize was set in ip_set_destination
	 * and we update it if any routing changes above.
	 * A change to ill_mtu with ifconfig will increase all dce_generation
	 * so that we will detect that with the generation check. Ditto for
	 * ill_mc_mtu.
	 */

	/*
	 * Caller needs to make sure IXAF_VERIFY_SOURCE is not set if
	 * conn_unspec_src.
	 */
	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
	    ixa->ixa_src_generation != ipst->ips_src_generation) {
		/* Check if the IP source is still assigned to the host. */
		uint_t gen;

		if (!ip_verify_src(mp, ixa, &gen)) {
			/* Don't send a packet with a source that isn't ours */
			error = EADDRNOTAVAIL;
			ip_drop_output("ipIfStatsOutDiscards - invalid src",
			    mp, NULL);
			goto drop;
		}
		/* The source is still valid - update the generation number */
		ixa->ixa_src_generation = gen;
	}

	/*
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_multirt_v* - if RTF_MULTIRT
	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v* - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v* - for the rest.
	 */
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));

drop:
	/*
	 * No ill is known on this path, so the request and the discard are
	 * both counted against the stack-wide MIB of the packet's family.
	 */
	if (ixaflags & IXAF_IS_IPV4) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
	} else {
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
	}
	freemsg(mp);
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return (error);
}

/*
 * Handle both IPv4 and IPv6. Sets the generation number
 * to allow the caller to know when to call us again.
 * Returns true if the source address in the packet is a valid source.
 * We handle callers which try to send with a zero address (since we only
 * get here if UNSPEC_SRC is not set).
 *
 * generationp may be NULL when the caller does not cache the generation.
 */
boolean_t
ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;

	/*
	 * Need to grab the generation number before we check to
	 * avoid a race with a change to the set of local addresses.
	 * No lock needed since the thread which updates the set of local
	 * addresses use ipif/ill locks and exit those (hence a store memory
	 * barrier) before doing the atomic increase of ips_src_generation.
	 */
	if (generationp != NULL)
		*generationp = ipst->ips_src_generation;

	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)mp->b_rptr;

		/* An unspecified source is never acceptable here */
		if (ipha->ipha_src == INADDR_ANY)
			return (B_FALSE);

		return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
		    ipst, B_FALSE) != IPVL_BAD);
	} else {
		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
		uint_t	scopeid;

		/* An unspecified source is never acceptable here */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
			return (B_FALSE);

		if (ixa->ixa_flags & IXAF_SCOPEID_SET)
			scopeid = ixa->ixa_scopeid;
		else
			scopeid = 0;

		return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
		    ipst, B_FALSE, scopeid) != IPVL_BAD);
	}
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
 *
 * On success ixa_ire, ixa_ire_generation, ixa_postfragfn and (unless the
 * new ire is a reject/blackhole route) ixa_nce are brought up to date.
 * Returns 0, the error reported by ip_select_route_pkt, or ENOBUFS when
 * no nce could be obtained for the new ire.
 */
int
ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	uint_t		gen;
	ire_t		*ire;
	nce_t		*nce;
	int		error;
	boolean_t	multirt = B_FALSE;

	/*
	 * Redo ip_select_route.
	 * Need to grab generation number as part of the lookup to
	 * avoid race.
	 */
	error = 0;
	ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
	ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
	if (error != 0) {
		ire_refrele(ire);
		return (error);
	}

	if (ixa->ixa_ire != NULL)
		ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
	/*
	 * NOTE(review): presumably this swaps the hold returned by the lookup
	 * for a "_notr" hold so that it pairs with the ire_refrele_notr()
	 * used on ixa_ire above - confirm against the ire refcount macros.
	 */
	ire_refhold_notr(ire);
	ire_refrele(ire);
#endif
	ixa->ixa_ire = ire;
	ixa->ixa_ire_generation = gen;
	if (multirt) {
		if (ixa->ixa_flags & IXAF_IS_IPV4)
			ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
		else
			ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}

	/*
	 * Don't look for an nce for reject or blackhole.
	 * They have ire_generation set to IRE_GENERATION_VERIFY which
	 * makes conn_ip_output avoid references to ixa_nce.
	 */
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (0);
	}

	/* The NCE could now be different */
	nce = ire_to_nce_pkt(ire, mp);
	if (nce == NULL) {
		/*
		 * Allocation failure. Make sure we redo ire/nce selection
		 * next time we send.
		 */
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (ENOBUFS);
	}
	if (nce == ixa->ixa_nce) {
		/* No change; drop the extra hold we just obtained */
		nce_refrele(nce);
		return (0);
	}

	/*
	 * Since the path MTU might change as a result of this
	 * route change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	if (ixa->ixa_nce != NULL)
		nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (0);
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
 *
 * Returns 0 on success, ENOTSUP when ip_verify_lso reports an LSO change or
 * when ZEROCOPY is no longer usable (the nce is still replaced in that
 * case), or the result of ip_verify_ire when no replacement nce was found.
 */
static int
ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ire_t		*ire = ixa->ixa_ire;
	nce_t		*nce;
	int		error = 0;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;

	/* Exactly one of ipha/ip6h is set, depending on the ire's version */
	if (ire->ire_ipversion == IPV4_VERSION)
		ipha = (ipha_t *)mp->b_rptr;
	else
		ip6h = (ip6_t *)mp->b_rptr;

	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
	if (nce == NULL) {
		/* Try to find a better ire */
		return (ip_verify_ire(mp, ixa));
	}

	/*
	 * The hardware offloading capabilities, for example LSO, of the
	 * interface might have changed, so do sanity verification here.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
		if (!ip_verify_lso(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_LSO, 0);
			error = ENOTSUP;
		}
	}

	/*
	 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
	 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
	 * any more, return error so that conn_ip_output() can take care of
	 * the ZEROCOPY message properly. It's safe to continue send the
	 * message when ZEROCOPY newly become available.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_ZCOPY, 0);
			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
				error = ENOTSUP;
		}
	}

	/*
	 * Since the path MTU might change as a result of this
	 * change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	/* Swap in the new nce even when reporting ENOTSUP */
	nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (error);
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
 *
 * Returns 0, or EMSGSIZE when IXAF_VERIFY_PMTU is set and the path MTU
 * has decreased (after the ULP has been notified).
 */
static int
ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	dce_t		*dce;
	uint_t		gen;
	uint_t		pmtu;

	dce = dce_lookup_pkt(mp, ixa, &gen);
	ASSERT(dce != NULL);

	dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
	/*
	 * NOTE(review): presumably swaps the lookup's hold for a "_notr"
	 * hold to pair with dce_refrele_notr() - confirm.
	 */
	dce_refhold_notr(dce);
	dce_refrele(dce);
#endif
	ixa->ixa_dce = dce;
	ixa->ixa_dce_generation = gen;

	/* Extract the (path) mtu from the dce, ncec_ill etc */
	pmtu = ip_get_pmtu(ixa);

	/*
	 * Tell ULP about PMTU changes - increase or decrease - by returning
	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
	 * both ixa_pmtu and ixa_fragsize appropriately.
	 *
	 * If ULP doesn't set that flag then we need to update ixa_fragsize
	 * since routing could have changed the ill after ixa_fragsize
	 * was set previously in the conn_ip_output path or in
	 * ip_set_destination.
	 *
	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
	 *
	 * In the case of a path MTU increase we send the packet after the
	 * notify to the ULP.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
		if (ixa->ixa_pmtu != pmtu) {
			/* Saved before the notify call and compared after it */
			uint_t oldmtu = ixa->ixa_pmtu;

			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
			    uint32_t, ixa->ixa_pmtu);
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_PMTU, pmtu);
			if (pmtu < oldmtu)
				return (EMSGSIZE);
		}
	} else {
		ixa->ixa_fragsize = pmtu;
	}
	return (0);
}

/*
 * Verify LSO usability. Keep the return value simple to indicate whether
 * the LSO capability has changed. Handle both IPv4 and IPv6.
 *
 * Returns B_TRUE when nothing changed and B_FALSE when IXAF_LSO_CAPAB was
 * toggled or the cached copy of the ill's LSO limits was refreshed.
 */
static boolean_t
ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
{
	ill_lso_capab_t	*lsoc = &ixa->ixa_lso_capab;
	ill_lso_capab_t	*new_lsoc = ill->ill_lso_capab;

	if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
		/*
		 * Not usable any more.
		 */
		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
		    !ILL_LSO_TCP_IPV4_USABLE(ill) :
		    !ILL_LSO_TCP_IPV6_USABLE(ill))) {
			ixa->ixa_flags &= ~IXAF_LSO_CAPAB;

			return (B_FALSE);
		}

		/*
		 * Capability has changed, refresh the copy in ixa.
		 */
		if (lsoc->ill_lso_max_tcpv4 != new_lsoc->ill_lso_max_tcpv4 ||
		    lsoc->ill_lso_max_tcpv6 != new_lsoc->ill_lso_max_tcpv6) {
			*lsoc = *new_lsoc;

			return (B_FALSE);
		}
	} else { /* Was not usable */
		/* Became usable: cache the ill's limits and set the flag */
		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
		    ILL_LSO_TCP_IPV4_USABLE(ill) :
		    ILL_LSO_TCP_IPV6_USABLE(ill))) {
			*lsoc = *new_lsoc;
			ixa->ixa_flags |= IXAF_LSO_CAPAB;

			return (B_FALSE);
		}
	}

	return (B_TRUE);
}

/*
 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
 *
 * Returns B_TRUE when nothing changed and B_FALSE when IXAF_ZCOPY_CAPAB
 * was toggled; the caller inspects the flag to learn the new state.
 */
static boolean_t
ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
{
	if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
		/*
		 * Not usable any more.
		 */
		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
		    !ILL_ZCOPY_USABLE(ill)) {
			ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;

			return (B_FALSE);
		}
	} else { /* Was not usable */
		/* Became usable: set the flag */
		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
		    ILL_ZCOPY_USABLE(ill)) {
			ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;

			return (B_FALSE);
		}
	}

	return (B_TRUE);
}


/*
 * When there is no conn_t context, this will send a packet.
 * The caller must *not* have called conn_connect() or ip_attr_connect()
 * before calling ip_output_simple().
 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
 * Honors IXAF_SET_SOURCE.
 *
 * We acquire the ire and after calling ire_sendfn we release
 * the hold on the ire. Ditto for the nce and dce.
743 * 744 * This assumes that the caller has set the following in ip_xmit_attr_t: 745 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. 746 * If ixa_ifindex is non-zero it means send out that ill. (If it is 747 * an upper IPMP ill we load balance across the group; if a lower we send 748 * on that lower ill without load balancing.) 749 * IXAF_IS_IPV4 must be set correctly. 750 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. 751 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. 752 * If neither of those two are set we do an IPsec policy lookup. 753 * 754 * We handle setting things like 755 * ixa_pktlen 756 * ixa_ip_hdr_length 757 * ixa->ixa_protocol 758 * 759 * The caller may set ixa_xmit_hint, which is used for ECMP selection and 760 * transmit ring selecting in GLD. 761 * 762 * The caller must do an ixa_cleanup() to release any IPsec references 763 * after we return. 764 */ 765 int 766 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) 767 { 768 ts_label_t *effective_tsl = NULL; 769 int err; 770 771 ASSERT(ixa->ixa_ipst != NULL); 772 773 if (is_system_labeled()) { 774 ip_stack_t *ipst = ixa->ixa_ipst; 775 776 if (ixa->ixa_flags & IXAF_IS_IPV4) { 777 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, 778 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 779 &effective_tsl); 780 } else { 781 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, 782 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 783 &effective_tsl); 784 } 785 if (err != 0) { 786 ip2dbg(("tsol_check: label check failed (%d)\n", err)); 787 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 788 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 789 ip_drop_output("tsol_check_label", mp, NULL); 790 freemsg(mp); 791 return (err); 792 } 793 if (effective_tsl != NULL) { 794 /* Update the label */ 795 ip_xmit_attr_replace_tsl(ixa, effective_tsl); 796 } 797 } 798 799 if (ixa->ixa_flags & IXAF_IS_IPV4) 800 return (ip_output_simple_v4(mp, ixa)); 801 else 802 return 
(ip_output_simple_v6(mp, ixa)); 803 } 804 805 int 806 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) 807 { 808 ipha_t *ipha; 809 ipaddr_t firsthop; /* In IP header */ 810 ipaddr_t dst; /* End of source route, or ipha_dst if none */ 811 ire_t *ire; 812 ipaddr_t setsrc; /* RTF_SETSRC */ 813 int error; 814 ill_t *ill = NULL; 815 dce_t *dce = NULL; 816 nce_t *nce; 817 iaflags_t ixaflags = ixa->ixa_flags; 818 ip_stack_t *ipst = ixa->ixa_ipst; 819 boolean_t repeat = B_FALSE; 820 boolean_t multirt = B_FALSE; 821 int64_t now; 822 823 ipha = (ipha_t *)mp->b_rptr; 824 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 825 826 /* 827 * Even on labeled systems we can have a NULL ixa_tsl e.g., 828 * for IGMP/MLD traffic. 829 */ 830 831 /* Caller already set flags */ 832 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 833 834 ASSERT(ixa->ixa_nce == NULL); 835 836 ixa->ixa_pktlen = ntohs(ipha->ipha_length); 837 ASSERT(ixa->ixa_pktlen == msgdsize(mp)); 838 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 839 ixa->ixa_protocol = ipha->ipha_protocol; 840 841 /* 842 * Assumes that source routed packets have already been massaged by 843 * the ULP (ip_massage_options) and as a result ipha_dst is the next 844 * hop in the source route. The final destination is used for IPsec 845 * policy and DCE lookup. 
846 */ 847 firsthop = ipha->ipha_dst; 848 dst = ip_get_dst(ipha); 849 850 repeat_ire: 851 error = 0; 852 setsrc = INADDR_ANY; 853 ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL, 854 &setsrc, &error, &multirt); 855 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 856 if (error != 0) { 857 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 858 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 859 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); 860 freemsg(mp); 861 goto done; 862 } 863 864 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { 865 /* ire_ill might be NULL hence need to skip some code */ 866 if (ixaflags & IXAF_SET_SOURCE) 867 ipha->ipha_src = htonl(INADDR_LOOPBACK); 868 ixa->ixa_fragsize = IP_MAXPACKET; 869 ill = NULL; 870 nce = NULL; 871 ire->ire_ob_pkt_count++; 872 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 873 /* No dce yet; use default one */ 874 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, 875 &ipst->ips_dce_default->dce_ident); 876 goto done; 877 } 878 879 /* Note that ipha_dst is only used for IRE_MULTICAST */ 880 nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 881 if (nce == NULL) { 882 /* Allocation failure? */ 883 ip_drop_output("ire_to_nce", mp, ill); 884 freemsg(mp); 885 error = ENOBUFS; 886 goto done; 887 } 888 if (nce->nce_is_condemned) { 889 nce_t *nce1; 890 891 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); 892 nce_refrele(nce); 893 if (nce1 == NULL) { 894 if (!repeat) { 895 /* Try finding a better IRE */ 896 repeat = B_TRUE; 897 ire_refrele(ire); 898 goto repeat_ire; 899 } 900 /* Tried twice - drop packet */ 901 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 902 ip_drop_output("No nce", mp, ill); 903 freemsg(mp); 904 error = ENOBUFS; 905 goto done; 906 } 907 nce = nce1; 908 } 909 910 /* 911 * For multicast with multirt we have a flag passed back from 912 * ire_lookup_multi_ill_v4 since we don't have an IRE for each 913 * possible multicast address. 
914 * We also need a flag for multicast since we can't check 915 * whether RTF_MULTIRT is set in ixa_ire for multicast. 916 */ 917 if (multirt) { 918 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 919 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 920 } else { 921 ixa->ixa_postfragfn = ire->ire_postfragfn; 922 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 923 } 924 ASSERT(ixa->ixa_nce == NULL); 925 ixa->ixa_nce = nce; 926 927 /* 928 * Check for a dce_t with a path mtu. 929 */ 930 dce = dce_lookup_v4(dst, ipst, NULL); 931 ASSERT(dce != NULL); 932 933 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { 934 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 935 } else if (dce->dce_flags & DCEF_PMTU) { 936 /* 937 * To avoid a periodic timer to increase the path MTU we 938 * look at dce_last_change_time each time we send a packet. 939 */ 940 now = ddi_get_lbolt64(); 941 if (TICK_TO_SEC(now) - dce->dce_last_change_time > 942 ipst->ips_ip_pathmtu_interval) { 943 /* 944 * Older than 20 minutes. Drop the path MTU information. 945 */ 946 mutex_enter(&dce->dce_lock); 947 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 948 dce->dce_last_change_time = TICK_TO_SEC(now); 949 mutex_exit(&dce->dce_lock); 950 dce_increment_generation(dce); 951 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 952 } else { 953 uint_t fragsize; 954 955 fragsize = ip_get_base_mtu(nce->nce_ill, ire); 956 if (fragsize > dce->dce_pmtu) 957 fragsize = dce->dce_pmtu; 958 ixa->ixa_fragsize = fragsize; 959 } 960 } else { 961 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 962 } 963 964 /* 965 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp 966 * interface for source address selection. 
967 */ 968 ill = ire_nexthop_ill(ire); 969 970 if (ixaflags & IXAF_SET_SOURCE) { 971 ipaddr_t src; 972 973 /* 974 * We use the final destination to get 975 * correct selection for source routed packets 976 */ 977 978 /* If unreachable we have no ill but need some source */ 979 if (ill == NULL) { 980 src = htonl(INADDR_LOOPBACK); 981 error = 0; 982 } else { 983 error = ip_select_source_v4(ill, setsrc, dst, 984 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, 985 &src, NULL, NULL); 986 } 987 if (error != 0) { 988 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 989 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 990 ip_drop_output("ipIfStatsOutDiscards - no source", 991 mp, ill); 992 freemsg(mp); 993 goto done; 994 } 995 ipha->ipha_src = src; 996 } else if (ixaflags & IXAF_VERIFY_SOURCE) { 997 /* Check if the IP source is assigned to the host. */ 998 if (!ip_verify_src(mp, ixa, NULL)) { 999 /* Don't send a packet with a source that isn't ours */ 1000 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1001 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 1002 ip_drop_output("ipIfStatsOutDiscards - invalid source", 1003 mp, ill); 1004 freemsg(mp); 1005 error = EADDRNOTAVAIL; 1006 goto done; 1007 } 1008 } 1009 1010 1011 /* 1012 * Check against global IPsec policy to set the AH/ESP attributes. 1013 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 1014 */ 1015 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1016 ASSERT(ixa->ixa_ipsec_policy == NULL); 1017 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); 1018 if (mp == NULL) { 1019 /* MIB and ip_drop_packet already done */ 1020 return (EHOSTUNREACH); /* IPsec policy failure */ 1021 } 1022 } 1023 1024 if (ill != NULL) { 1025 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 1026 } else { 1027 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1028 } 1029 1030 /* 1031 * We update the statistics on the most specific IRE i.e., the first 1032 * one we found. 
1033 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 1034 * can only count the use prior to fragmentation. However the MIB 1035 * counters on the ill will be incremented in post fragmentation. 1036 */ 1037 ire->ire_ob_pkt_count++; 1038 1039 /* 1040 * Based on ire_type and ire_flags call one of: 1041 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK 1042 * ire_send_multirt_v4 - if RTF_MULTIRT 1043 * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 1044 * ire_send_multicast_v4 - for IRE_MULTICAST 1045 * ire_send_broadcast_v4 - for IRE_BROADCAST 1046 * ire_send_wire_v4 - for the rest. 1047 */ 1048 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); 1049 done: 1050 ire_refrele(ire); 1051 if (dce != NULL) 1052 dce_refrele(dce); 1053 if (ill != NULL) 1054 ill_refrele(ill); 1055 if (ixa->ixa_nce != NULL) 1056 nce_refrele(ixa->ixa_nce); 1057 ixa->ixa_nce = NULL; 1058 return (error); 1059 } 1060 1061 /* 1062 * ire_sendfn() functions. 1063 * These functions use the following xmit_attr: 1064 * - ixa_fragsize - read to determine whether or not to fragment 1065 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec 1066 * - ixa_ipsec_* are used inside IPsec 1067 * - IXAF_SET_SOURCE - replace IP source in broadcast case. 1068 * - IXAF_LOOPBACK_COPY - for multicast and broadcast 1069 */ 1070 1071 1072 /* 1073 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK 1074 * 1075 * The checks for restrict_interzone_loopback are done in ire_route_recursive. 1076 */ 1077 /* ARGSUSED4 */ 1078 int 1079 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1080 ip_xmit_attr_t *ixa, uint32_t *identp) 1081 { 1082 ipha_t *ipha = (ipha_t *)iph_arg; 1083 ip_stack_t *ipst = ixa->ixa_ipst; 1084 ill_t *ill = ire->ire_ill; 1085 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 1086 uint_t pktlen = ixa->ixa_pktlen; 1087 1088 /* 1089 * No fragmentation, no nce, no application of IPsec, 1090 * and no ipha_ident assignment. 
1091 * 1092 * Note different order between IP provider and FW_HOOKS than in 1093 * send_wire case. 1094 */ 1095 1096 /* 1097 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1098 * send probe, but not the receive probe. 1099 */ 1100 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1101 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1102 int, 1); 1103 1104 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1105 int error = 0; 1106 1107 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1108 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1109 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1110 ipst->ips_ipv4firewall_loopback_out, 1111 NULL, ill, ipha, mp, mp, 0, ipst, error); 1112 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1113 if (mp == NULL) 1114 return (error); 1115 1116 /* 1117 * Even if the destination was changed by the filter we use the 1118 * forwarding decision that was made based on the address 1119 * in ip_output/ip_set_destination. 1120 */ 1121 /* Length could be different */ 1122 ipha = (ipha_t *)mp->b_rptr; 1123 pktlen = ntohs(ipha->ipha_length); 1124 } 1125 1126 /* 1127 * If a callback is enabled then we need to know the 1128 * source and destination zoneids for the packet. We already 1129 * have those handy. 
1130 */ 1131 if (ipst->ips_ip4_observe.he_interested) { 1132 zoneid_t szone, dzone; 1133 zoneid_t stackzoneid; 1134 1135 stackzoneid = netstackid_to_zoneid( 1136 ipst->ips_netstack->netstack_stackid); 1137 1138 if (stackzoneid == GLOBAL_ZONEID) { 1139 /* Shared-IP zone */ 1140 dzone = ire->ire_zoneid; 1141 szone = ixa->ixa_zoneid; 1142 } else { 1143 szone = dzone = stackzoneid; 1144 } 1145 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1146 } 1147 1148 /* Handle lo0 stats */ 1149 ipst->ips_loopback_packets++; 1150 1151 /* Map ixa to ira including IPsec policies */ 1152 ipsec_out_to_in(ixa, ill, &iras); 1153 iras.ira_pktlen = pktlen; 1154 iras.ira_ttl = ipha->ipha_ttl; 1155 1156 if (!IS_SIMPLE_IPH(ipha)) { 1157 ip_output_local_options(ipha, ipst); 1158 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1159 } 1160 1161 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1162 int error = 0; 1163 1164 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1165 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1166 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1167 ipst->ips_ipv4firewall_loopback_in, 1168 ill, NULL, ipha, mp, mp, 0, ipst, error); 1169 1170 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1171 if (mp == NULL) { 1172 ira_cleanup(&iras, B_FALSE); 1173 return (error); 1174 } 1175 /* 1176 * Even if the destination was changed by the filter we use the 1177 * forwarding decision that was made based on the address 1178 * in ip_output/ip_set_destination. 
1179 */ 1180 /* Length could be different */ 1181 ipha = (ipha_t *)mp->b_rptr; 1182 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1183 } 1184 1185 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1186 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1187 int, 1); 1188 1189 ire->ire_ib_pkt_count++; 1190 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1191 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1192 1193 /* Destined to ire_zoneid - use that for fanout */ 1194 iras.ira_zoneid = ire->ire_zoneid; 1195 1196 if (is_system_labeled()) { 1197 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1198 1199 /* 1200 * This updates ira_cred, ira_tsl and ira_free_flags based 1201 * on the label. We don't expect this to ever fail for 1202 * loopback packets, so we silently drop the packet should it 1203 * fail. 1204 */ 1205 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1206 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1207 ip_drop_input("tsol_get_pkt_label", mp, ill); 1208 freemsg(mp); 1209 return (0); 1210 } 1211 ASSERT(iras.ira_tsl != NULL); 1212 1213 /* tsol_get_pkt_label sometimes does pullupmsg */ 1214 ipha = (ipha_t *)mp->b_rptr; 1215 } 1216 1217 ip_fanout_v4(mp, ipha, &iras); 1218 1219 /* We moved any IPsec refs from ixa to iras */ 1220 ira_cleanup(&iras, B_FALSE); 1221 return (0); 1222 } 1223 1224 /* 1225 * ire_sendfn for IRE_BROADCAST 1226 * If the broadcast address is present on multiple ills and ixa_ifindex 1227 * isn't set, then we generate 1228 * a separate datagram (potentially with different source address) for 1229 * those ills. In any case, only one copy is looped back to ip_input_v4. 
1230 */ 1231 int 1232 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1233 ip_xmit_attr_t *ixa, uint32_t *identp) 1234 { 1235 ipha_t *ipha = (ipha_t *)iph_arg; 1236 ip_stack_t *ipst = ixa->ixa_ipst; 1237 irb_t *irb = ire->ire_bucket; 1238 ire_t *ire1; 1239 mblk_t *mp1; 1240 ipha_t *ipha1; 1241 iaflags_t ixaflags = ixa->ixa_flags; 1242 nce_t *nce1, *nce_orig; 1243 1244 /* 1245 * Unless ire_send_multirt_v4 already set a ttl, force the 1246 * ttl to a smallish value. 1247 */ 1248 if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { 1249 /* 1250 * To avoid broadcast storms, we usually set the TTL to 1 for 1251 * broadcasts. This can 1252 * be overridden stack-wide through the ip_broadcast_ttl 1253 * ndd tunable, or on a per-connection basis through the 1254 * IP_BROADCAST_TTL socket option. 1255 * 1256 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 1257 * will force ttl to one after we've set this. 1258 */ 1259 if (ixaflags & IXAF_BROADCAST_TTL_SET) 1260 ipha->ipha_ttl = ixa->ixa_broadcast_ttl; 1261 else 1262 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 1263 } 1264 /* 1265 * Make sure we get a loopback copy (after IPsec and frag) 1266 * Skip hardware checksum so that loopback copy is checksumed. 1267 */ 1268 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1269 1270 /* Do we need to potentially generate multiple copies? */ 1271 if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) 1272 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1273 1274 /* 1275 * Loop over all IRE_BROADCAST in the bucket (might only be one). 1276 * Note that everything in the bucket has the same destination address. 1277 */ 1278 irb_refhold(irb); 1279 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1280 /* We do the main IRE after the end of the loop */ 1281 if (ire1 == ire) 1282 continue; 1283 1284 /* 1285 * Only IREs for the same IP address should be in the same 1286 * bucket. 1287 * But could have IRE_HOSTs in the case of CGTP. 
1288 * If we find any multirt routes we bail out of the loop 1289 * and just do the single packet at the end; ip_postfrag_multirt 1290 * will duplicate the packet. 1291 */ 1292 ASSERT(ire1->ire_addr == ire->ire_addr); 1293 if (!(ire1->ire_type & IRE_BROADCAST)) 1294 continue; 1295 1296 if (IRE_IS_CONDEMNED(ire1)) 1297 continue; 1298 1299 if (ixa->ixa_zoneid != ALL_ZONES && 1300 ire->ire_zoneid != ire1->ire_zoneid) 1301 continue; 1302 1303 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); 1304 1305 if (ire1->ire_flags & RTF_MULTIRT) 1306 break; 1307 1308 /* 1309 * For IPMP we only send for the ipmp_ill. arp_nce_init() will 1310 * ensure that this goes out on the cast_ill. 1311 */ 1312 if (IS_UNDER_IPMP(ire1->ire_ill)) 1313 continue; 1314 1315 mp1 = copymsg(mp); 1316 if (mp1 == NULL) { 1317 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1318 ipIfStatsOutDiscards); 1319 ip_drop_output("ipIfStatsOutDiscards", 1320 mp, ire1->ire_ill); 1321 continue; 1322 } 1323 1324 ipha1 = (ipha_t *)mp1->b_rptr; 1325 if (ixa->ixa_flags & IXAF_SET_SOURCE) { 1326 /* 1327 * Need to pick a different source address for each 1328 * interface. If we have a global IPsec policy and 1329 * no per-socket policy then we punt to 1330 * ip_output_simple_v4 using a separate ip_xmit_attr_t. 1331 */ 1332 if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { 1333 ip_output_simple_broadcast(ixa, mp1); 1334 continue; 1335 } 1336 /* Pick a new source address for each interface */ 1337 if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, 1338 ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, 1339 &ipha1->ipha_src, NULL, NULL) != 0) { 1340 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1341 ipIfStatsOutDiscards); 1342 ip_drop_output("ipIfStatsOutDiscards - select " 1343 "broadcast source", mp1, ire1->ire_ill); 1344 freemsg(mp1); 1345 continue; 1346 } 1347 /* 1348 * Check against global IPsec policy to set the AH/ESP 1349 * attributes. IPsec will set IXAF_IPSEC_* and 1350 * ixa_ipsec_* as appropriate. 
1351 */ 1352 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1353 ASSERT(ixa->ixa_ipsec_policy == NULL); 1354 mp1 = ip_output_attach_policy(mp1, ipha, NULL, 1355 NULL, ixa); 1356 if (mp1 == NULL) { 1357 /* 1358 * MIB and ip_drop_packet already 1359 * done 1360 */ 1361 continue; 1362 } 1363 } 1364 } 1365 /* Make sure we have an NCE on this ill */ 1366 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, 1367 ire1->ire_type); 1368 if (nce1 == NULL) { 1369 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1370 ipIfStatsOutDiscards); 1371 ip_drop_output("ipIfStatsOutDiscards - broadcast nce", 1372 mp1, ire1->ire_ill); 1373 freemsg(mp1); 1374 continue; 1375 } 1376 nce_orig = ixa->ixa_nce; 1377 ixa->ixa_nce = nce1; 1378 1379 ire_refhold(ire1); 1380 /* 1381 * Ignore any errors here. We just collect the errno for 1382 * the main ire below 1383 */ 1384 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); 1385 ire_refrele(ire1); 1386 1387 ixa->ixa_nce = nce_orig; 1388 nce_refrele(nce1); 1389 1390 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; 1391 } 1392 irb_refrele(irb); 1393 /* Finally, the main one */ 1394 1395 /* 1396 * For IPMP we only send broadcasts on the ipmp_ill. 1397 */ 1398 if (IS_UNDER_IPMP(ire->ire_ill)) { 1399 freemsg(mp); 1400 return (0); 1401 } 1402 1403 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1404 } 1405 1406 /* 1407 * Send a packet using a different source address and different 1408 * IPsec policy. 
1409 */ 1410 static void 1411 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) 1412 { 1413 ip_xmit_attr_t ixas; 1414 1415 bzero(&ixas, sizeof (ixas)); 1416 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 1417 ixas.ixa_zoneid = ixa->ixa_zoneid; 1418 ixas.ixa_ifindex = 0; 1419 ixas.ixa_ipst = ixa->ixa_ipst; 1420 ixas.ixa_cred = ixa->ixa_cred; 1421 ixas.ixa_cpid = ixa->ixa_cpid; 1422 ixas.ixa_tsl = ixa->ixa_tsl; 1423 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1424 1425 (void) ip_output_simple(mp, &ixas); 1426 ixa_cleanup(&ixas); 1427 } 1428 1429 1430 static void 1431 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) 1432 { 1433 ip_stack_t *ipst = ixa->ixa_ipst; 1434 1435 /* Limit the TTL on multirt packets */ 1436 if (ire->ire_type & IRE_MULTICAST) { 1437 if (ipha->ipha_ttl > 1) { 1438 ip2dbg(("ire_send_multirt_v4: forcing multicast " 1439 "multirt TTL to 1 (was %d), dst 0x%08x\n", 1440 ipha->ipha_ttl, ntohl(ire->ire_addr))); 1441 ipha->ipha_ttl = 1; 1442 } 1443 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1444 } else if ((ipst->ips_ip_multirt_ttl > 0) && 1445 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 1446 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 1447 /* 1448 * Need to ensure we don't increase the ttl should we go through 1449 * ire_send_broadcast or multicast. 1450 */ 1451 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1452 } 1453 } 1454 1455 /* 1456 * ire_sendfn for IRE_MULTICAST 1457 */ 1458 int 1459 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1460 ip_xmit_attr_t *ixa, uint32_t *identp) 1461 { 1462 ipha_t *ipha = (ipha_t *)iph_arg; 1463 ip_stack_t *ipst = ixa->ixa_ipst; 1464 ill_t *ill = ire->ire_ill; 1465 iaflags_t ixaflags = ixa->ixa_flags; 1466 1467 /* 1468 * The IRE_MULTICAST is the same whether or not multirt is in use. 1469 * Hence we need special-case code. 
1470 */ 1471 if (ixaflags & IXAF_MULTIRT_MULTICAST) 1472 multirt_check_v4(ire, ipha, ixa); 1473 1474 /* 1475 * Check if anything in ip_input_v4 wants a copy of the transmitted 1476 * packet (after IPsec and fragmentation) 1477 * 1478 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set 1479 * RSVP and the rsvp daemon is an example of a 1480 * protocol and user level process that 1481 * handles it's own routing. Hence, it uses the 1482 * SO_DONTROUTE option to accomplish this. 1483 * 2. If the sender has set IP_MULTICAST_LOOP, then we just 1484 * check whether there are any receivers for the group on the ill 1485 * (ignoring the zoneid). 1486 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are 1487 * any members in other shared-IP zones. 1488 * If such members exist, then we indicate that the sending zone 1489 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP 1490 * behavior. 1491 * 1492 * When we loopback we skip hardware checksum to make sure loopback 1493 * copy is checksumed. 1494 * 1495 * Note that ire_ill is the upper in the case of IPMP. 1496 */ 1497 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); 1498 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && 1499 !(ixaflags & IXAF_DONTROUTE)) { 1500 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1501 } else if (ixaflags & IXAF_MULTICAST_LOOP) { 1502 /* 1503 * If this zone or any other zone has members then loopback 1504 * a copy. 1505 */ 1506 if (ill_hasmembers_v4(ill, ipha->ipha_dst)) 1507 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1508 } else if (ipst->ips_netstack->netstack_numzones > 1) { 1509 /* 1510 * This zone should not have a copy. But there are some other 1511 * zones which might have members. 
1512 */ 1513 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1514 ixa->ixa_zoneid)) { 1515 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; 1516 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; 1517 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1518 } 1519 } 1520 1521 /* 1522 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, 1523 * force the ttl to the IP_MULTICAST_TTL value 1524 */ 1525 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { 1526 ipha->ipha_ttl = ixa->ixa_multicast_ttl; 1527 } 1528 1529 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1530 } 1531 1532 /* 1533 * ire_sendfn for IREs with RTF_MULTIRT 1534 */ 1535 int 1536 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1537 ip_xmit_attr_t *ixa, uint32_t *identp) 1538 { 1539 ipha_t *ipha = (ipha_t *)iph_arg; 1540 1541 multirt_check_v4(ire, ipha, ixa); 1542 1543 if (ire->ire_type & IRE_MULTICAST) 1544 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); 1545 else if (ire->ire_type & IRE_BROADCAST) 1546 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); 1547 else 1548 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1549 } 1550 1551 /* 1552 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE 1553 */ 1554 int 1555 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1556 ip_xmit_attr_t *ixa, uint32_t *identp) 1557 { 1558 ip_stack_t *ipst = ixa->ixa_ipst; 1559 ipha_t *ipha = (ipha_t *)iph_arg; 1560 ill_t *ill; 1561 ip_recv_attr_t iras; 1562 boolean_t dummy; 1563 1564 /* We assign an IP ident for nice errors */ 1565 ipha->ipha_ident = atomic_inc_32_nv(identp); 1566 1567 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 1568 1569 if (ire->ire_type & IRE_NOROUTE) { 1570 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ 1571 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, 1572 RTA_DST, ipst); 1573 } 1574 1575 if (ire->ire_flags & RTF_BLACKHOLE) { 1576 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, 
NULL); 1577 freemsg(mp); 1578 /* No error even for local senders - silent blackhole */ 1579 return (0); 1580 } 1581 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); 1582 1583 /* 1584 * We need an ill_t for the ip_recv_attr_t even though this packet 1585 * was never received and icmp_unreachable doesn't currently use 1586 * ira_ill. 1587 */ 1588 ill = ill_lookup_on_name("lo0", B_FALSE, 1589 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); 1590 if (ill == NULL) { 1591 freemsg(mp); 1592 return (EHOSTUNREACH); 1593 } 1594 1595 bzero(&iras, sizeof (iras)); 1596 /* Map ixa to ira including IPsec policies */ 1597 ipsec_out_to_in(ixa, ill, &iras); 1598 1599 if (ip_source_routed(ipha, ipst)) { 1600 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); 1601 } else { 1602 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 1603 } 1604 /* We moved any IPsec refs from ixa to iras */ 1605 ira_cleanup(&iras, B_FALSE); 1606 ill_refrele(ill); 1607 return (EHOSTUNREACH); 1608 } 1609 1610 /* 1611 * Calculate a checksum ignoring any hardware capabilities 1612 * 1613 * Returns B_FALSE if the packet was too short for the checksum. Caller 1614 * should free and do stats. 
1615 */ 1616 static boolean_t 1617 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa) 1618 { 1619 ip_stack_t *ipst = ixa->ixa_ipst; 1620 uint_t pktlen = ixa->ixa_pktlen; 1621 uint16_t *cksump; 1622 uint32_t cksum; 1623 uint8_t protocol = ixa->ixa_protocol; 1624 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1625 ipaddr_t dst = ipha->ipha_dst; 1626 ipaddr_t src = ipha->ipha_src; 1627 1628 /* Just in case it contained garbage */ 1629 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1630 1631 /* 1632 * Calculate ULP checksum 1633 */ 1634 if (protocol == IPPROTO_TCP) { 1635 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1636 cksum = IP_TCP_CSUM_COMP; 1637 } else if (protocol == IPPROTO_UDP) { 1638 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1639 cksum = IP_UDP_CSUM_COMP; 1640 } else if (protocol == IPPROTO_SCTP) { 1641 sctp_hdr_t *sctph; 1642 1643 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1644 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1645 /* 1646 * Zero out the checksum field to ensure proper 1647 * checksum calculation. 1648 */ 1649 sctph->sh_chksum = 0; 1650 #ifdef DEBUG 1651 if (!skip_sctp_cksum) 1652 #endif 1653 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1654 goto ip_hdr_cksum; 1655 } else { 1656 goto ip_hdr_cksum; 1657 } 1658 1659 /* ULP puts the checksum field is in the first mblk */ 1660 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1661 1662 /* 1663 * We accumulate the pseudo header checksum in cksum. 1664 * This is pretty hairy code, so watch close. One 1665 * thing to keep in mind is that UDP and TCP have 1666 * stored their respective datagram lengths in their 1667 * checksum fields. This lines things up real nice. 1668 */ 1669 cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 1670 1671 cksum = IP_CSUM(mp, ip_hdr_length, cksum); 1672 /* 1673 * For UDP/IPv4 a zero means that the packets wasn't checksummed. 
1674 * Change to 0xffff 1675 */ 1676 if (protocol == IPPROTO_UDP && cksum == 0) 1677 *cksump = ~cksum; 1678 else 1679 *cksump = cksum; 1680 1681 IP_STAT(ipst, ip_out_sw_cksum); 1682 IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen); 1683 1684 ip_hdr_cksum: 1685 /* Calculate IPv4 header checksum */ 1686 ipha->ipha_hdr_checksum = 0; 1687 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1688 return (B_TRUE); 1689 } 1690 1691 /* 1692 * Calculate the ULP checksum - try to use hardware. 1693 * In the case of MULTIRT, broadcast or multicast the 1694 * IXAF_NO_HW_CKSUM is set in which case we use software. 1695 * 1696 * If the hardware supports IP header checksum offload; then clear the 1697 * contents of IP header checksum field as expected by NIC. 1698 * Do this only if we offloaded either full or partial sum. 1699 * 1700 * Returns B_FALSE if the packet was too short for the checksum. Caller 1701 * should free and do stats. 1702 */ 1703 static boolean_t 1704 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, 1705 ip_xmit_attr_t *ixa, ill_t *ill) 1706 { 1707 uint_t pktlen = ixa->ixa_pktlen; 1708 uint16_t *cksump; 1709 uint16_t hck_flags; 1710 uint32_t cksum; 1711 uint8_t protocol = ixa->ixa_protocol; 1712 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1713 1714 if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || 1715 !dohwcksum) { 1716 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1717 } 1718 1719 /* 1720 * Calculate ULP checksum. Note that we don't use cksump and cksum 1721 * if the ill has FULL support. 
1722 */ 1723 if (protocol == IPPROTO_TCP) { 1724 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1725 cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ 1726 } else if (protocol == IPPROTO_UDP) { 1727 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1728 cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ 1729 } else if (protocol == IPPROTO_SCTP) { 1730 sctp_hdr_t *sctph; 1731 1732 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1733 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1734 /* 1735 * Zero out the checksum field to ensure proper 1736 * checksum calculation. 1737 */ 1738 sctph->sh_chksum = 0; 1739 #ifdef DEBUG 1740 if (!skip_sctp_cksum) 1741 #endif 1742 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1743 goto ip_hdr_cksum; 1744 } else if (protocol == IPPROTO_ICMP) { 1745 /* 1746 * Note that we always calculate a SW checksum for ICMP. In the 1747 * future, if HW support for ICMP is advertised, we can change 1748 * this. 1749 */ 1750 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1751 } else { 1752 ip_hdr_cksum: 1753 /* Calculate IPv4 header checksum */ 1754 ipha->ipha_hdr_checksum = 0; 1755 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1756 return (B_TRUE); 1757 } 1758 1759 /* ULP puts the checksum field is in the first mblk */ 1760 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1761 1762 /* 1763 * Underlying interface supports hardware checksum offload for 1764 * the payload; leave the payload checksum for the hardware to 1765 * calculate. N.B: We only need to set up checksum info on the 1766 * first mblk. 1767 */ 1768 hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; 1769 1770 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1771 if (hck_flags & HCKSUM_INET_FULL_V4) { 1772 /* 1773 * Hardware calculates pseudo-header, header and the 1774 * payload checksums, so clear the checksum field in 1775 * the protocol header. 
1776 */ 1777 *cksump = 0; 1778 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; 1779 1780 ipha->ipha_hdr_checksum = 0; 1781 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1782 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1783 } else { 1784 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1785 } 1786 return (B_TRUE); 1787 } 1788 if ((hck_flags) & HCKSUM_INET_PARTIAL) { 1789 ipaddr_t dst = ipha->ipha_dst; 1790 ipaddr_t src = ipha->ipha_src; 1791 /* 1792 * Partial checksum offload has been enabled. Fill 1793 * the checksum field in the protocol header with the 1794 * pseudo-header checksum value. 1795 * 1796 * We accumulate the pseudo header checksum in cksum. 1797 * This is pretty hairy code, so watch close. One 1798 * thing to keep in mind is that UDP and TCP have 1799 * stored their respective datagram lengths in their 1800 * checksum fields. This lines things up real nice. 1801 */ 1802 cksum += (dst >> 16) + (dst & 0xFFFF) + 1803 (src >> 16) + (src & 0xFFFF); 1804 cksum += *(cksump); 1805 cksum = (cksum & 0xFFFF) + (cksum >> 16); 1806 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); 1807 1808 /* 1809 * Offsets are relative to beginning of IP header. 1810 */ 1811 DB_CKSUMSTART(mp) = ip_hdr_length; 1812 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha; 1813 DB_CKSUMEND(mp) = pktlen; 1814 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; 1815 1816 ipha->ipha_hdr_checksum = 0; 1817 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1818 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1819 } else { 1820 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1821 } 1822 return (B_TRUE); 1823 } 1824 /* Hardware capabilities include neither full nor partial IPv4 */ 1825 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1826 } 1827 1828 /* 1829 * ire_sendfn for offlink and onlink destinations. 1830 * Also called from the multicast, broadcast, multirt send functions. 1831 * 1832 * Assumes that the caller has a hold on the ire. 1833 * 1834 * This function doesn't care if the IRE just became condemned since that 1835 * can happen at any time. 
1836 */ 1837 /* ARGSUSED */ 1838 int 1839 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1840 ip_xmit_attr_t *ixa, uint32_t *identp) 1841 { 1842 ip_stack_t *ipst = ixa->ixa_ipst; 1843 ipha_t *ipha = (ipha_t *)iph_arg; 1844 iaflags_t ixaflags = ixa->ixa_flags; 1845 ill_t *ill; 1846 1847 ASSERT(ixa->ixa_nce != NULL); 1848 ill = ixa->ixa_nce->nce_ill; 1849 1850 if (ixaflags & IXAF_DONTROUTE) 1851 ipha->ipha_ttl = 1; 1852 1853 /* 1854 * Assign an ident value for this packet. There could be other 1855 * threads targeting the same destination, so we have to arrange 1856 * for a atomic increment. Note that we use a 32-bit atomic add 1857 * because it has better performance than its 16-bit sibling. 1858 * 1859 * Normally ixa_extra_ident is 0, but in the case of LSO it will 1860 * be the number of TCP segments that the driver/hardware will 1861 * extraly construct. 1862 * 1863 * If running in cluster mode and if the source address 1864 * belongs to a replicated service then vector through 1865 * cl_inet_ipident vector to allocate ip identifier 1866 * NOTE: This is a contract private interface with the 1867 * clustering group. 1868 */ 1869 if (cl_inet_ipident != NULL) { 1870 ipaddr_t src = ipha->ipha_src; 1871 ipaddr_t dst = ipha->ipha_dst; 1872 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; 1873 1874 ASSERT(cl_inet_isclusterwide != NULL); 1875 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, 1876 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { 1877 /* 1878 * Note: not correct with LSO since we can't allocate 1879 * ixa_extra_ident+1 consecutive values. 
1880 */ 1881 ipha->ipha_ident = (*cl_inet_ipident)(stack_id, 1882 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, 1883 (uint8_t *)(uintptr_t)dst, NULL); 1884 } else { 1885 ipha->ipha_ident = atomic_add_32_nv(identp, 1886 ixa->ixa_extra_ident + 1); 1887 } 1888 } else { 1889 ipha->ipha_ident = atomic_add_32_nv(identp, 1890 ixa->ixa_extra_ident + 1); 1891 } 1892 #ifndef _BIG_ENDIAN 1893 ipha->ipha_ident = htons(ipha->ipha_ident); 1894 #endif 1895 1896 /* 1897 * This might set b_band, thus the IPsec and fragmentation 1898 * code in IP ensures that b_band is updated in the first mblk. 1899 */ 1900 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 1901 /* ip_process translates an IS_UNDER_IPMP */ 1902 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); 1903 if (mp == NULL) { 1904 /* ip_drop_packet and MIB done */ 1905 return (0); /* Might just be delayed */ 1906 } 1907 } 1908 1909 /* 1910 * Verify any IPv4 options. 1911 * 1912 * The presence of IP options also forces the network stack to 1913 * calculate the checksum in software. This is because: 1914 * 1915 * Wrap around: certain partial-checksum NICs (eri, ce) limit 1916 * the size of "start offset" width to 6-bit. This effectively 1917 * sets the largest value of the offset to 64-bytes, starting 1918 * from the MAC header. When the cumulative MAC and IP headers 1919 * exceed such limit, the offset will wrap around. This causes 1920 * the checksum to be calculated at the wrong place. 1921 * 1922 * IPv4 source routing: none of the full-checksum capable NICs 1923 * is capable of correctly handling the IPv4 source-routing 1924 * option for purposes of calculating the pseudo-header; the 1925 * actual destination is different from the destination in the 1926 * header which is that of the next-hop. (This case may not be 1927 * true for NICs which can parse IPv6 extension headers, but 1928 * we choose to simplify the implementation by not offloading 1929 * checksum when they are present.) 
1930 */ 1931 if (!IS_SIMPLE_IPH(ipha)) { 1932 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM; 1933 /* An IS_UNDER_IPMP ill is ok here */ 1934 if (ip_output_options(mp, ipha, ixa, ill)) { 1935 /* Packet has been consumed and ICMP error sent */ 1936 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1937 return (EINVAL); 1938 } 1939 } 1940 1941 /* 1942 * To handle IPsec/iptun's labeling needs we need to tag packets 1943 * while we still have ixa_tsl 1944 */ 1945 if (is_system_labeled() && ixa->ixa_tsl != NULL && 1946 (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || 1947 ill->ill_mactype == DL_IPV6)) { 1948 cred_t *newcr; 1949 1950 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, 1951 KM_NOSLEEP); 1952 if (newcr == NULL) { 1953 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1954 ip_drop_output("ipIfStatsOutDiscards - newcr", 1955 mp, ill); 1956 freemsg(mp); 1957 return (ENOBUFS); 1958 } 1959 mblk_setcred(mp, newcr, NOPID); 1960 crfree(newcr); /* mblk_setcred did its own crhold */ 1961 } 1962 1963 if (ixa->ixa_pktlen > ixa->ixa_fragsize || 1964 (ixaflags & IXAF_IPSEC_SECURE)) { 1965 uint32_t pktlen; 1966 1967 pktlen = ixa->ixa_pktlen; 1968 if (ixaflags & IXAF_IPSEC_SECURE) 1969 pktlen += ipsec_out_extra_length(ixa); 1970 1971 if (pktlen > IP_MAXPACKET) 1972 return (EMSGSIZE); 1973 1974 if (ixaflags & IXAF_SET_ULP_CKSUM) { 1975 /* 1976 * Compute ULP checksum and IP header checksum 1977 * using software 1978 */ 1979 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) { 1980 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1981 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 1982 freemsg(mp); 1983 return (EINVAL); 1984 } 1985 } else { 1986 /* Calculate IPv4 header checksum */ 1987 ipha->ipha_hdr_checksum = 0; 1988 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1989 } 1990 1991 /* 1992 * If this packet would generate a icmp_frag_needed 1993 * message, we need to handle it before we do the IPsec 1994 * processing. 
Otherwise, we need to strip the IPsec 1995 * headers before we send up the message to the ULPs 1996 * which becomes messy and difficult. 1997 * 1998 * We check using IXAF_DONTFRAG. The DF bit in the header 1999 * is not inspected - it will be copied to any generated 2000 * fragments. 2001 */ 2002 if ((pktlen > ixa->ixa_fragsize) && 2003 (ixaflags & IXAF_DONTFRAG)) { 2004 /* Generate ICMP and return error */ 2005 ip_recv_attr_t iras; 2006 2007 DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen, 2008 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2009 uint_t, ixa->ixa_pmtu); 2010 2011 bzero(&iras, sizeof (iras)); 2012 /* Map ixa to ira including IPsec policies */ 2013 ipsec_out_to_in(ixa, ill, &iras); 2014 2015 ip_drop_output("ICMP_FRAG_NEEDED", mp, ill); 2016 icmp_frag_needed(mp, ixa->ixa_fragsize, &iras); 2017 /* We moved any IPsec refs from ixa to iras */ 2018 ira_cleanup(&iras, B_FALSE); 2019 return (EMSGSIZE); 2020 } 2021 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen, 2022 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2023 uint_t, ixa->ixa_pmtu); 2024 2025 if (ixaflags & IXAF_IPSEC_SECURE) { 2026 /* 2027 * Pass in sufficient information so that 2028 * IPsec can determine whether to fragment, and 2029 * which function to call after fragmentation. 
2030 */ 2031 return (ipsec_out_process(mp, ixa)); 2032 } 2033 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags, 2034 ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, 2035 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, 2036 ixa->ixa_postfragfn, &ixa->ixa_cookie)); 2037 } 2038 if (ixaflags & IXAF_SET_ULP_CKSUM) { 2039 /* Compute ULP checksum and IP header checksum */ 2040 /* An IS_UNDER_IPMP ill is ok here */ 2041 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) { 2042 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2043 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2044 freemsg(mp); 2045 return (EINVAL); 2046 } 2047 } else { 2048 /* Calculate IPv4 header checksum */ 2049 ipha->ipha_hdr_checksum = 0; 2050 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2051 } 2052 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, 2053 ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, 2054 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); 2055 } 2056 2057 /* 2058 * Send mp into ip_input 2059 * Common for IPv4 and IPv6 2060 */ 2061 void 2062 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2063 uint_t pkt_len, zoneid_t nolzid) 2064 { 2065 rtc_t rtc; 2066 ill_t *ill = nce->nce_ill; 2067 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 2068 ncec_t *ncec; 2069 2070 ncec = nce->nce_common; 2071 iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM | 2072 IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; 2073 if (ncec->ncec_flags & NCE_F_BCAST) 2074 iras.ira_flags |= IRAF_L2DST_BROADCAST; 2075 else if (ncec->ncec_flags & NCE_F_MCAST) 2076 iras.ira_flags |= IRAF_L2DST_MULTICAST; 2077 2078 iras.ira_free_flags = 0; 2079 iras.ira_cred = NULL; 2080 iras.ira_cpid = NOPID; 2081 iras.ira_tsl = NULL; 2082 iras.ira_zoneid = ALL_ZONES; 2083 iras.ira_pktlen = pkt_len; 2084 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen); 2085 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 2086 2087 if (ixaflags & IXAF_IS_IPV4) 2088 iras.ira_flags |= 
IRAF_IS_IPV4; 2089 2090 iras.ira_ill = iras.ira_rill = ill; 2091 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2092 iras.ira_rifindex = iras.ira_ruifindex; 2093 iras.ira_mhip = NULL; 2094 2095 iras.ira_flags |= ixaflags & IAF_MASK; 2096 iras.ira_no_loop_zoneid = nolzid; 2097 2098 /* Broadcast and multicast doesn't care about the squeue */ 2099 iras.ira_sqp = NULL; 2100 2101 rtc.rtc_ire = NULL; 2102 if (ixaflags & IXAF_IS_IPV4) { 2103 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2104 2105 rtc.rtc_ipaddr = INADDR_ANY; 2106 2107 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 2108 if (rtc.rtc_ire != NULL) { 2109 ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 2110 ire_refrele(rtc.rtc_ire); 2111 } 2112 } else { 2113 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2114 2115 rtc.rtc_ip6addr = ipv6_all_zeros; 2116 2117 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); 2118 if (rtc.rtc_ire != NULL) { 2119 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); 2120 ire_refrele(rtc.rtc_ire); 2121 } 2122 } 2123 /* Any references to clean up? No hold on ira */ 2124 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 2125 ira_cleanup(&iras, B_FALSE); 2126 } 2127 2128 /* 2129 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which 2130 * looks at the IXAF_LOOPBACK_COPY flag. 2131 * Common for IPv4 and IPv6. 2132 * 2133 * If the loopback copy fails (due to no memory) but we send the packet out 2134 * on the wire we return no failure. Only in the case we supress the wire 2135 * sending do we take the loopback failure into account. 2136 * 2137 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. 2138 * Those operations are performed on this packet in ip_xmit() and it would 2139 * be odd to do it twice for the same packet. 
2140 */ 2141 int 2142 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2143 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2144 uintptr_t *ixacookie) 2145 { 2146 ill_t *ill = nce->nce_ill; 2147 int error = 0; 2148 2149 /* 2150 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2151 * had looped it back 2152 */ 2153 if (ixaflags & IXAF_LOOPBACK_COPY) { 2154 mblk_t *mp1; 2155 2156 mp1 = copymsg(mp); 2157 if (mp1 == NULL) { 2158 /* Failed to deliver the loopback copy. */ 2159 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2160 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2161 error = ENOBUFS; 2162 } else { 2163 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2164 nolzid); 2165 } 2166 } 2167 2168 /* 2169 * If TTL = 0 then only do the loopback to this host i.e. we are 2170 * done. We are also done if this was the 2171 * loopback interface since it is sufficient 2172 * to loopback one copy of a multicast packet. 2173 */ 2174 if (ixaflags & IXAF_IS_IPV4) { 2175 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2176 2177 if (ipha->ipha_ttl == 0) { 2178 ip_drop_output("multicast ipha_ttl not sent to wire", 2179 mp, ill); 2180 freemsg(mp); 2181 return (error); 2182 } 2183 } else { 2184 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2185 2186 if (ip6h->ip6_hops == 0) { 2187 ip_drop_output("multicast ipha_ttl not sent to wire", 2188 mp, ill); 2189 freemsg(mp); 2190 return (error); 2191 } 2192 } 2193 if (nce->nce_ill->ill_wq == NULL) { 2194 /* Loopback interface */ 2195 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2196 freemsg(mp); 2197 return (error); 2198 } 2199 2200 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2201 ixacookie)); 2202 } 2203 2204 /* 2205 * Post fragmentation function for RTF_MULTIRT routes. 2206 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2207 * checks IXAF_LOOPBACK_COPY. 
2208 * 2209 * If no packet is sent due to failures then we return an errno, but if at 2210 * least one succeeded we return zero. 2211 */ 2212 int 2213 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2214 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2215 uintptr_t *ixacookie) 2216 { 2217 irb_t *irb; 2218 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2219 ire_t *ire; 2220 ire_t *ire1; 2221 mblk_t *mp1; 2222 nce_t *nce1; 2223 ill_t *ill = nce->nce_ill; 2224 ill_t *ill1; 2225 ip_stack_t *ipst = ill->ill_ipst; 2226 int error = 0; 2227 int num_sent = 0; 2228 int err; 2229 uint_t ire_type; 2230 ipaddr_t nexthop; 2231 2232 ASSERT(ixaflags & IXAF_IS_IPV4); 2233 2234 /* Check for IXAF_LOOPBACK_COPY */ 2235 if (ixaflags & IXAF_LOOPBACK_COPY) { 2236 mblk_t *mp1; 2237 2238 mp1 = copymsg(mp); 2239 if (mp1 == NULL) { 2240 /* Failed to deliver the loopback copy. */ 2241 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2242 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2243 error = ENOBUFS; 2244 } else { 2245 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2246 nolzid); 2247 } 2248 } 2249 2250 /* 2251 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send 2252 * a copy to each one. 2253 * Use the nce (nexthop) and ipha_dst to find the ire. 2254 * 2255 * MULTIRT is not designed to work with shared-IP zones thus we don't 2256 * need to pass a zoneid or a label to the IRE lookup. 
2257 */ 2258 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { 2259 /* Broadcast and multicast case */ 2260 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, 2261 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 2262 } else { 2263 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); 2264 2265 /* Unicast case */ 2266 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, 2267 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); 2268 } 2269 2270 if (ire == NULL || 2271 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2272 !(ire->ire_flags & RTF_MULTIRT)) { 2273 /* Drop */ 2274 ip_drop_output("ip_postfrag_multirt didn't find route", 2275 mp, nce->nce_ill); 2276 if (ire != NULL) 2277 ire_refrele(ire); 2278 return (ENETUNREACH); 2279 } 2280 2281 irb = ire->ire_bucket; 2282 irb_refhold(irb); 2283 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 2284 /* 2285 * For broadcast we can have a mixture of IRE_BROADCAST and 2286 * IRE_HOST due to the manually added IRE_HOSTs that are used 2287 * to trigger the creation of the special CGTP broadcast routes. 2288 * Thus we have to skip if ire_type doesn't match the original. 2289 */ 2290 if (IRE_IS_CONDEMNED(ire1) || 2291 !(ire1->ire_flags & RTF_MULTIRT) || 2292 ire1->ire_type != ire->ire_type) 2293 continue; 2294 2295 /* Do the ire argument one after the loop */ 2296 if (ire1 == ire) 2297 continue; 2298 2299 ill1 = ire_nexthop_ill(ire1); 2300 if (ill1 == NULL) { 2301 /* 2302 * This ire might not have been picked by 2303 * ire_route_recursive, in which case ire_dep might 2304 * not have been setup yet. 2305 * We kick ire_route_recursive to try to resolve 2306 * starting at ire1. 
2307 */ 2308 ire_t *ire2; 2309 uint_t match_flags = MATCH_IRE_DSTONLY; 2310 2311 if (ire1->ire_ill != NULL) 2312 match_flags |= MATCH_IRE_ILL; 2313 ire2 = ire_route_recursive_impl_v4(ire1, 2314 ire1->ire_addr, ire1->ire_type, ire1->ire_ill, 2315 ire1->ire_zoneid, NULL, match_flags, 2316 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2317 if (ire2 != NULL) 2318 ire_refrele(ire2); 2319 ill1 = ire_nexthop_ill(ire1); 2320 } 2321 2322 if (ill1 == NULL) { 2323 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2324 ip_drop_output("ipIfStatsOutDiscards - no ill", 2325 mp, ill); 2326 error = ENETUNREACH; 2327 continue; 2328 } 2329 2330 /* Pick the addr and type to use for arp_nce_init */ 2331 if (nce->nce_common->ncec_flags & NCE_F_BCAST) { 2332 ire_type = IRE_BROADCAST; 2333 nexthop = ire1->ire_gateway_addr; 2334 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { 2335 ire_type = IRE_MULTICAST; 2336 nexthop = ipha->ipha_dst; 2337 } else { 2338 ire_type = ire1->ire_type; /* Doesn't matter */ 2339 nexthop = ire1->ire_gateway_addr; 2340 } 2341 2342 /* If IPMP meta or under, then we just drop */ 2343 if (ill1->ill_grp != NULL) { 2344 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2345 ip_drop_output("ipIfStatsOutDiscards - IPMP", 2346 mp, ill1); 2347 ill_refrele(ill1); 2348 error = ENETUNREACH; 2349 continue; 2350 } 2351 2352 nce1 = arp_nce_init(ill1, nexthop, ire_type); 2353 if (nce1 == NULL) { 2354 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2355 ip_drop_output("ipIfStatsOutDiscards - no nce", 2356 mp, ill1); 2357 ill_refrele(ill1); 2358 error = ENETUNREACH; 2359 continue; 2360 } 2361 mp1 = copymsg(mp); 2362 if (mp1 == NULL) { 2363 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2364 ip_drop_output("ipIfStatsOutDiscards", mp, ill1); 2365 nce_refrele(nce1); 2366 ill_refrele(ill1); 2367 error = ENOBUFS; 2368 continue; 2369 } 2370 /* Preserve HW checksum for this copy */ 2371 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 2372 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 2373 
DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 2374 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 2375 DB_LSOMSS(mp1) = DB_LSOMSS(mp); 2376 2377 ire1->ire_ob_pkt_count++; 2378 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, 2379 0, ixacookie); 2380 if (err == 0) 2381 num_sent++; 2382 else 2383 error = err; 2384 nce_refrele(nce1); 2385 ill_refrele(ill1); 2386 } 2387 irb_refrele(irb); 2388 ire_refrele(ire); 2389 /* Finally, the main one */ 2390 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2391 ixacookie); 2392 if (err == 0) 2393 num_sent++; 2394 else 2395 error = err; 2396 if (num_sent > 0) 2397 return (0); 2398 else 2399 return (error); 2400 } 2401 2402 /* 2403 * Verify local connectivity. This check is called by ULP fusion code. 2404 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if 2405 * the interface is brought down and back up. So we simply fail the local 2406 * process. The caller, TCP Fusion, should unfuse the connection. 2407 */ 2408 boolean_t 2409 ip_output_verify_local(ip_xmit_attr_t *ixa) 2410 { 2411 ire_t *ire = ixa->ixa_ire; 2412 2413 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) 2414 return (B_FALSE); 2415 2416 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); 2417 } 2418 2419 /* 2420 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. 2421 * 2422 * The caller must call ip_output_verify_local() first. This function handles 2423 * IPobs, FW_HOOKS, and/or IPsec cases sequentially. 
2424 */ 2425 mblk_t * 2426 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out, 2427 boolean_t hooks_in, conn_t *peer_connp) 2428 { 2429 ill_t *ill = ixa->ixa_ire->ire_ill; 2430 ipha_t *ipha = NULL; 2431 ip6_t *ip6h = NULL; 2432 ip_stack_t *ipst = ixa->ixa_ipst; 2433 iaflags_t ixaflags = ixa->ixa_flags; 2434 ip_recv_attr_t iras; 2435 int error; 2436 2437 ASSERT(mp != NULL); 2438 2439 if (ixaflags & IXAF_IS_IPV4) { 2440 ipha = (ipha_t *)mp->b_rptr; 2441 2442 /* 2443 * If a callback is enabled then we need to know the 2444 * source and destination zoneids for the packet. We already 2445 * have those handy. 2446 */ 2447 if (ipst->ips_ip4_observe.he_interested) { 2448 zoneid_t szone, dzone; 2449 zoneid_t stackzoneid; 2450 2451 stackzoneid = netstackid_to_zoneid( 2452 ipst->ips_netstack->netstack_stackid); 2453 2454 if (stackzoneid == GLOBAL_ZONEID) { 2455 /* Shared-IP zone */ 2456 dzone = ixa->ixa_ire->ire_zoneid; 2457 szone = ixa->ixa_zoneid; 2458 } else { 2459 szone = dzone = stackzoneid; 2460 } 2461 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2462 ipst); 2463 } 2464 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2465 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2466 NULL, int, 1); 2467 2468 /* FW_HOOKS: LOOPBACK_OUT */ 2469 if (hooks_out) { 2470 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 2471 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 2472 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 2473 ipst->ips_ipv4firewall_loopback_out, 2474 NULL, ill, ipha, mp, mp, 0, ipst, error); 2475 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 2476 } 2477 if (mp == NULL) 2478 return (NULL); 2479 2480 /* FW_HOOKS: LOOPBACK_IN */ 2481 if (hooks_in) { 2482 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 2483 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 2484 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 2485 ipst->ips_ipv4firewall_loopback_in, 2486 ill, NULL, ipha, mp, mp, 0, ipst, error); 2487 
DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 2488 } 2489 if (mp == NULL) 2490 return (NULL); 2491 2492 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2493 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2494 NULL, int, 1); 2495 2496 /* Inbound IPsec polocies */ 2497 if (peer_connp != NULL) { 2498 /* Map ixa to ira including IPsec policies. */ 2499 ipsec_out_to_in(ixa, ill, &iras); 2500 mp = ipsec_check_inbound_policy(mp, peer_connp, ipha, 2501 NULL, &iras); 2502 } 2503 } else { 2504 ip6h = (ip6_t *)mp->b_rptr; 2505 2506 /* 2507 * If a callback is enabled then we need to know the 2508 * source and destination zoneids for the packet. We already 2509 * have those handy. 2510 */ 2511 if (ipst->ips_ip6_observe.he_interested) { 2512 zoneid_t szone, dzone; 2513 zoneid_t stackzoneid; 2514 2515 stackzoneid = netstackid_to_zoneid( 2516 ipst->ips_netstack->netstack_stackid); 2517 2518 if (stackzoneid == GLOBAL_ZONEID) { 2519 /* Shared-IP zone */ 2520 dzone = ixa->ixa_ire->ire_zoneid; 2521 szone = ixa->ixa_zoneid; 2522 } else { 2523 szone = dzone = stackzoneid; 2524 } 2525 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2526 ipst); 2527 } 2528 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2529 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2530 ip6h, int, 1); 2531 2532 /* FW_HOOKS: LOOPBACK_OUT */ 2533 if (hooks_out) { 2534 DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, 2535 ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); 2536 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 2537 ipst->ips_ipv6firewall_loopback_out, 2538 NULL, ill, ip6h, mp, mp, 0, ipst, error); 2539 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); 2540 } 2541 if (mp == NULL) 2542 return (NULL); 2543 2544 /* FW_HOOKS: LOOPBACK_IN */ 2545 if (hooks_in) { 2546 DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, 2547 ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); 2548 FW_HOOKS6(ipst->ips_ip6_loopback_in_event, 2549 
ipst->ips_ipv6firewall_loopback_in, 2550 ill, NULL, ip6h, mp, mp, 0, ipst, error); 2551 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); 2552 } 2553 if (mp == NULL) 2554 return (NULL); 2555 2556 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2557 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2558 ip6h, int, 1); 2559 2560 /* Inbound IPsec polocies */ 2561 if (peer_connp != NULL) { 2562 /* Map ixa to ira including IPsec policies. */ 2563 ipsec_out_to_in(ixa, ill, &iras); 2564 mp = ipsec_check_inbound_policy(mp, peer_connp, NULL, 2565 ip6h, &iras); 2566 } 2567 } 2568 2569 if (mp == NULL) { 2570 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2571 ip_drop_input("ipIfStatsInDiscards", NULL, ill); 2572 } 2573 2574 return (mp); 2575 } 2576