1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2018 Joyent, Inc. 25 * Copyright 2024 Oxide Computer Company 26 * Copyright 2026 Bill Sommerfeld <sommerfeld@hamachi.org> 27 */ 28 /* Copyright (c) 1990 Mentat Inc. */ 29 30 #include <sys/types.h> 31 #include <sys/stream.h> 32 #include <sys/strsubr.h> 33 #include <sys/dlpi.h> 34 #include <sys/strsun.h> 35 #include <sys/zone.h> 36 #include <sys/ddi.h> 37 #include <sys/sunddi.h> 38 #include <sys/cmn_err.h> 39 #include <sys/debug.h> 40 #include <sys/atomic.h> 41 42 #include <sys/systm.h> 43 #include <sys/param.h> 44 #include <sys/kmem.h> 45 #include <sys/sdt.h> 46 #include <sys/socket.h> 47 #include <sys/mac.h> 48 #include <net/if.h> 49 #include <net/if_arp.h> 50 #include <net/route.h> 51 #include <sys/sockio.h> 52 #include <netinet/in.h> 53 #include <net/if_dl.h> 54 55 #include <inet/common.h> 56 #include <inet/mi.h> 57 #include <inet/mib2.h> 58 #include <inet/nd.h> 59 #include <inet/arp.h> 60 #include <inet/snmpcom.h> 61 #include <inet/kstatcom.h> 62 63 #include <netinet/igmp_var.h> 64 #include <netinet/ip6.h> 65 #include <netinet/icmp6.h> 66 #include <netinet/sctp.h> 67 68 #include <inet/ip.h> 69 #include <inet/ip_impl.h> 70 #include <inet/ip6.h> 71 #include <inet/ip6_asp.h> 72 #include <inet/tcp.h> 73 #include <inet/ip_multi.h> 74 #include <inet/ip_if.h> 75 #include <inet/ip_ire.h> 76 #include <inet/ip_ftable.h> 77 #include <inet/ip_rts.h> 78 #include <inet/optcom.h> 79 #include <inet/ip_ndp.h> 80 #include <inet/ip_listutils.h> 81 #include <netinet/igmp.h> 82 #include <netinet/ip_mroute.h> 83 #include <inet/ipp_common.h> 84 85 #include <net/pfkeyv2.h> 86 #include <inet/sadb.h> 87 #include <inet/ipsec_impl.h> 88 #include <inet/ipdrop.h> 89 #include <inet/ip_netinfo.h> 90 91 #include <sys/pattr.h> 92 #include <inet/ipclassifier.h> 93 #include <inet/sctp_ip.h> 94 #include <inet/sctp/sctp_impl.h> 95 #include <inet/udp_impl.h> 96 #include <sys/sunddi.h> 97 98 #include <sys/tsol/label.h> 99 #include <sys/tsol/tnet.h> 100 101 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 102 103 #ifdef DEBUG 104 extern boolean_t skip_sctp_cksum; 105 #endif 106 107 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 108 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 109 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); 110 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 111 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 112 113 /* 114 * There are two types of output functions for IP used for different 115 * purposes: 116 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 117 * is no context in the form of a conn_t. However, there is a 118 * ip_xmit_attr_t that the callers use to influence interface selection 119 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 120 * 121 * - conn_ip_output() is used when sending packets with a conn_t and 122 * ip_set_destination has been called to cache information. In that case 123 * various socket options are recorded in the ip_xmit_attr_t and should 124 * be taken into account. 125 */ 126 127 /* 128 * The caller *must* have called conn_connect() or ip_attr_connect() 129 * before calling conn_ip_output(). The caller needs to redo that each time 130 * the destination IP address or port changes, as well as each time there is 131 * a change to any socket option that would modify how packets are routed out 132 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). 133 * 134 * The ULP caller has to serialize the use of a single ip_xmit_attr_t. 135 * We assert for that here. 136 */ 137 int 138 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa) 139 { 140 iaflags_t ixaflags = ixa->ixa_flags; 141 ire_t *ire; 142 nce_t *nce; 143 dce_t *dce; 144 ill_t *ill; 145 ip_stack_t *ipst = ixa->ixa_ipst; 146 int error; 147 148 /* We defer ipIfStatsHCOutRequests until an error or we have an ill */ 149 150 ASSERT(ixa->ixa_ire != NULL); 151 /* Note there is no ixa_nce when reject and blackhole routes */ 152 ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */ 153 154 #ifdef DEBUG 155 ASSERT(ixa->ixa_curthread == NULL); 156 ixa->ixa_curthread = curthread; 157 #endif 158 159 /* 160 * Even on labeled systems we can have a NULL ixa_tsl e.g., 161 * for IGMP/MLD traffic. 162 */ 163 164 ire = ixa->ixa_ire; 165 166 /* 167 * If the ULP says the (old) IRE resulted in reachability we 168 * record this before determine whether to use a new IRE. 169 * No locking for performance reasons. 170 */ 171 if (ixaflags & IXAF_REACH_CONF) 172 ire->ire_badcnt = 0; 173 174 /* 175 * Has routing changed since we cached the results of the lookup? 176 * 177 * This check captures all of: 178 * - the cached ire being deleted (by means of the special 179 * IRE_GENERATION_CONDEMNED) 180 * - A potentially better ire being added (ire_generation being 181 * increased) 182 * - A deletion of the nexthop ire that was used when we did the 183 * lookup. 184 * - An addition of a potentially better nexthop ire. 185 * The last two are handled by walking and increasing the generation 186 * number on all dependant IREs in ire_flush_cache(). 187 * 188 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE 189 * since we ensure that each time we set ixa_ire to such an IRE we 190 * make sure the ixa_ire_generation does not match (by using 191 * IRE_GENERATION_VERIFY). 192 */ 193 if (ire->ire_generation != ixa->ixa_ire_generation) { 194 error = ip_verify_ire(mp, ixa); 195 if (error != 0) { 196 ip_drop_output("ipIfStatsOutDiscards - verify ire", 197 mp, NULL); 198 goto drop; 199 } 200 ire = ixa->ixa_ire; 201 ASSERT(ire != NULL); 202 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 203 #ifdef DEBUG 204 ASSERT(ixa->ixa_curthread == curthread); 205 ixa->ixa_curthread = NULL; 206 #endif 207 ire->ire_ob_pkt_count++; 208 /* ixa_dce might be condemned; use default one */ 209 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, 210 &ipst->ips_dce_default->dce_ident)); 211 } 212 /* 213 * If the ncec changed then ip_verify_ire already set 214 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 215 * so we can recheck the interface mtu. 216 */ 217 218 /* 219 * Note that ire->ire_generation could already have changed. 220 * We catch that next time we send a packet. 221 */ 222 } 223 224 /* 225 * No need to lock access to ixa_nce since the ip_xmit_attr usage 226 * is single threaded. 227 */ 228 ASSERT(ixa->ixa_nce != NULL); 229 nce = ixa->ixa_nce; 230 if (nce->nce_is_condemned) { 231 error = ip_verify_nce(mp, ixa); 232 /* 233 * In case ZEROCOPY capability become not available, we 234 * copy the message and free the original one. We might 235 * be copying more data than needed but it doesn't hurt 236 * since such change rarely happens. 237 */ 238 switch (error) { 239 case 0: 240 break; 241 case ENOTSUP: { /* ZEROCOPY */ 242 mblk_t *nmp; 243 244 if ((nmp = copymsg(mp)) != NULL) { 245 freemsg(mp); 246 mp = nmp; 247 248 break; 249 } 250 } 251 /* FALLTHROUGH */ 252 default: 253 ip_drop_output("ipIfStatsOutDiscards - verify nce", 254 mp, NULL); 255 goto drop; 256 } 257 ire = ixa->ixa_ire; 258 ASSERT(ire != NULL); 259 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 260 #ifdef DEBUG 261 ASSERT(ixa->ixa_curthread == curthread); 262 ixa->ixa_curthread = NULL; 263 #endif 264 ire->ire_ob_pkt_count++; 265 /* ixa_dce might be condemned; use default one */ 266 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, 267 ixa, &ipst->ips_dce_default->dce_ident)); 268 } 269 ASSERT(ixa->ixa_nce != NULL); 270 nce = ixa->ixa_nce; 271 272 /* 273 * Note that some other event could already have made 274 * the new nce condemned. We catch that next time we 275 * try to send a packet. 276 */ 277 } 278 /* 279 * If there is no per-destination dce_t then we have a reference to 280 * the default dce_t (which merely contains the dce_ipid). 281 * The generation check captures both the introduction of a 282 * per-destination dce_t (e.g., due to ICMP packet too big) and 283 * any change to the per-destination dce (including it becoming 284 * condemned by use of the special DCE_GENERATION_CONDEMNED). 285 */ 286 dce = ixa->ixa_dce; 287 288 /* 289 * To avoid a periodic timer to increase the path MTU we 290 * look at dce_last_change_time each time we send a packet. 291 */ 292 if (dce->dce_flags & DCEF_PMTU) { 293 int64_t now = LBOLT_FASTPATH64; 294 295 if ((TICK_TO_SEC(now) - dce->dce_last_change_time > 296 ipst->ips_ip_pathmtu_interval)) { 297 /* 298 * Older than 20 minutes. Drop the path MTU information. 299 * Since the path MTU changes as a result of this, 300 * twiddle ixa_dce_generation to make us go through the 301 * dce verification code in conn_ip_output. 302 */ 303 mutex_enter(&dce->dce_lock); 304 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 305 dce->dce_last_change_time = TICK_TO_SEC(now); 306 mutex_exit(&dce->dce_lock); 307 dce_increment_generation(dce); 308 } 309 } 310 311 if (dce->dce_generation != ixa->ixa_dce_generation) { 312 error = ip_verify_dce(mp, ixa); 313 if (error != 0) { 314 ip_drop_output("ipIfStatsOutDiscards - verify dce", 315 mp, NULL); 316 goto drop; 317 } 318 dce = ixa->ixa_dce; 319 320 /* 321 * Note that some other event could already have made the 322 * new dce's generation number change. 323 * We catch that next time we try to send a packet. 324 */ 325 } 326 327 ill = nce->nce_ill; 328 329 /* 330 * An initial ixa_fragsize was set in ip_set_destination 331 * and we update it if any routing changes above. 332 * A change to ill_mtu with ifconfig will increase all dce_generation 333 * so that we will detect that with the generation check. Ditto for 334 * ill_mc_mtu. 335 */ 336 337 /* 338 * Caller needs to make sure IXAF_VERIFY_SRC is not set if 339 * conn_unspec_src. 340 */ 341 if ((ixaflags & IXAF_VERIFY_SOURCE) && 342 ixa->ixa_src_generation != ipst->ips_src_generation) { 343 /* Check if the IP source is still assigned to the host. */ 344 uint_t gen; 345 346 if (!ip_verify_src(mp, ixa, &gen)) { 347 /* Don't send a packet with a source that isn't ours */ 348 error = EADDRNOTAVAIL; 349 ip_drop_output("ipIfStatsOutDiscards - invalid src", 350 mp, NULL); 351 goto drop; 352 } 353 /* The source is still valid - update the generation number */ 354 ixa->ixa_src_generation = gen; 355 } 356 357 /* 358 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 359 * can only count the use prior to fragmentation. However the MIB 360 * counters on the ill will be incremented in post fragmentation. 361 */ 362 ire->ire_ob_pkt_count++; 363 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 364 365 /* 366 * Based on ire_type and ire_flags call one of: 367 * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK 368 * ire_send_multirt_v* - if RTF_MULTIRT 369 * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE 370 * ire_send_multicast_v* - for IRE_MULTICAST 371 * ire_send_broadcast_v4 - for IRE_BROADCAST 372 * ire_send_wire_v* - for the rest. 373 */ 374 #ifdef DEBUG 375 ASSERT(ixa->ixa_curthread == curthread); 376 ixa->ixa_curthread = NULL; 377 #endif 378 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident)); 379 380 drop: 381 if (ixaflags & IXAF_IS_IPV4) { 382 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 383 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 384 } else { 385 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests); 386 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 387 } 388 freemsg(mp); 389 #ifdef DEBUG 390 ASSERT(ixa->ixa_curthread == curthread); 391 ixa->ixa_curthread = NULL; 392 #endif 393 return (error); 394 } 395 396 /* 397 * Handle both IPv4 and IPv6. Sets the generation number 398 * to allow the caller to know when to call us again. 399 * Returns true if the source address in the packet is a valid source. 400 * We handle callers which try to send with a zero address (since we only 401 * get here if UNSPEC_SRC is not set). 402 */ 403 boolean_t 404 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp) 405 { 406 ip_stack_t *ipst = ixa->ixa_ipst; 407 408 /* 409 * Need to grab the generation number before we check to 410 * avoid a race with a change to the set of local addresses. 411 * No lock needed since the thread which updates the set of local 412 * addresses use ipif/ill locks and exit those (hence a store memory 413 * barrier) before doing the atomic increase of ips_src_generation. 414 */ 415 if (generationp != NULL) 416 *generationp = ipst->ips_src_generation; 417 418 if (ixa->ixa_flags & IXAF_IS_IPV4) { 419 ipha_t *ipha = (ipha_t *)mp->b_rptr; 420 421 if (ipha->ipha_src == INADDR_ANY) 422 return (B_FALSE); 423 424 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 425 ipst, B_FALSE) != IPVL_BAD); 426 } else { 427 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 428 uint_t scopeid; 429 430 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) 431 return (B_FALSE); 432 433 if (ixa->ixa_flags & IXAF_SCOPEID_SET) 434 scopeid = ixa->ixa_scopeid; 435 else 436 scopeid = 0; 437 438 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid, 439 ipst, B_FALSE, scopeid) != IPVL_BAD); 440 } 441 } 442 443 /* 444 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use. 445 */ 446 int 447 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa) 448 { 449 uint_t gen; 450 ire_t *ire; 451 nce_t *nce; 452 int error; 453 boolean_t multirt = B_FALSE; 454 455 /* 456 * Redo ip_select_route. 457 * Need to grab generation number as part of the lookup to 458 * avoid race. 459 */ 460 error = 0; 461 ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt); 462 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 463 if (error != 0) { 464 ire_refrele(ire); 465 return (error); 466 } 467 468 if (ixa->ixa_ire != NULL) 469 ire_refrele_notr(ixa->ixa_ire); 470 #ifdef DEBUG 471 ire_refhold_notr(ire); 472 ire_refrele(ire); 473 #endif 474 ixa->ixa_ire = ire; 475 ixa->ixa_ire_generation = gen; 476 if (multirt) { 477 if (ixa->ixa_flags & IXAF_IS_IPV4) 478 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 479 else 480 ixa->ixa_postfragfn = ip_postfrag_multirt_v6; 481 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 482 } else { 483 ixa->ixa_postfragfn = ire->ire_postfragfn; 484 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 485 } 486 487 /* 488 * Don't look for an nce for reject or blackhole. 489 * They have ire_generation set to IRE_GENERATION_VERIFY which 490 * makes conn_ip_output avoid references to ixa_nce. 491 */ 492 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 493 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY); 494 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 495 return (0); 496 } 497 498 /* The NCE could now be different */ 499 nce = ire_to_nce_pkt(ire, mp); 500 if (nce == NULL) { 501 /* 502 * Allocation failure. Make sure we redo ire/nce selection 503 * next time we send. 504 */ 505 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 506 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 507 return (ENOBUFS); 508 } 509 if (nce == ixa->ixa_nce) { 510 /* No change */ 511 nce_refrele(nce); 512 return (0); 513 } 514 515 /* 516 * Since the path MTU might change as a result of this 517 * route change, we twiddle ixa_dce_generation to 518 * make conn_ip_output go through the ip_verify_dce code. 519 */ 520 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 521 522 if (ixa->ixa_nce != NULL) 523 nce_refrele(ixa->ixa_nce); 524 ixa->ixa_nce = nce; 525 return (0); 526 } 527 528 /* 529 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use. 530 */ 531 static int 532 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa) 533 { 534 ire_t *ire = ixa->ixa_ire; 535 nce_t *nce; 536 int error = 0; 537 ipha_t *ipha = NULL; 538 ip6_t *ip6h = NULL; 539 540 if (ire->ire_ipversion == IPV4_VERSION) 541 ipha = (ipha_t *)mp->b_rptr; 542 else 543 ip6h = (ip6_t *)mp->b_rptr; 544 545 nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE); 546 if (nce == NULL) { 547 /* Try to find a better ire */ 548 return (ip_verify_ire(mp, ixa)); 549 } 550 551 /* 552 * The hardware offloading capabilities, for example LSO, of the 553 * interface might have changed, so do sanity verification here. 554 */ 555 if (ixa->ixa_flags & IXAF_VERIFY_LSO) { 556 if (!ip_verify_lso(nce->nce_ill, ixa)) { 557 ASSERT(ixa->ixa_notify != NULL); 558 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 559 IXAN_LSO, 0); 560 error = ENOTSUP; 561 } 562 } 563 564 /* 565 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with 566 * any ZEROCOPY changes. In case ZEROCOPY capability is not available 567 * any more, return error so that conn_ip_output() can take care of 568 * the ZEROCOPY message properly. It's safe to continue send the 569 * message when ZEROCOPY newly become available. 570 */ 571 if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) { 572 if (!ip_verify_zcopy(nce->nce_ill, ixa)) { 573 ASSERT(ixa->ixa_notify != NULL); 574 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 575 IXAN_ZCOPY, 0); 576 if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0) 577 error = ENOTSUP; 578 } 579 } 580 581 /* 582 * Since the path MTU might change as a result of this 583 * change, we twiddle ixa_dce_generation to 584 * make conn_ip_output go through the ip_verify_dce code. 585 */ 586 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 587 588 nce_refrele(ixa->ixa_nce); 589 ixa->ixa_nce = nce; 590 return (error); 591 } 592 593 /* 594 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use. 595 */ 596 static int 597 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa) 598 { 599 dce_t *dce; 600 uint_t gen; 601 uint_t pmtu; 602 603 dce = dce_lookup_pkt(mp, ixa, &gen); 604 ASSERT(dce != NULL); 605 606 dce_refrele_notr(ixa->ixa_dce); 607 #ifdef DEBUG 608 dce_refhold_notr(dce); 609 dce_refrele(dce); 610 #endif 611 ixa->ixa_dce = dce; 612 ixa->ixa_dce_generation = gen; 613 614 /* Extract the (path) mtu from the dce, ncec_ill etc */ 615 pmtu = ip_get_pmtu(ixa); 616 617 /* 618 * Tell ULP about PMTU changes - increase or decrease - by returning 619 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update 620 * both ixa_pmtu and ixa_fragsize appropriately. 621 * 622 * If ULP doesn't set that flag then we need to update ixa_fragsize 623 * since routing could have changed the ill after after ixa_fragsize 624 * was set previously in the conn_ip_output path or in 625 * ip_set_destination. 626 * 627 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu. 628 * 629 * In the case of a path MTU increase we send the packet after the 630 * notify to the ULP. 631 */ 632 if (ixa->ixa_flags & IXAF_VERIFY_PMTU) { 633 if (ixa->ixa_pmtu != pmtu) { 634 uint_t oldmtu = ixa->ixa_pmtu; 635 636 DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu, 637 uint32_t, ixa->ixa_pmtu); 638 ASSERT(ixa->ixa_notify != NULL); 639 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 640 IXAN_PMTU, pmtu); 641 if (pmtu < oldmtu) 642 return (EMSGSIZE); 643 } 644 } else { 645 ixa->ixa_fragsize = pmtu; 646 } 647 return (0); 648 } 649 650 /* 651 * Verify LSO usability. Keep the return value simple to indicate whether 652 * the LSO capability has changed. Handle both IPv4 and IPv6. 653 */ 654 static boolean_t 655 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) 656 { 657 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 658 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab; 659 660 if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 661 /* 662 * Not usable any more? 663 */ 664 if (!dohwcksum || 665 (ixa->ixa_flags & IXAF_IPSEC_SECURE) || 666 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 667 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 668 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 669 !ILL_LSO_TCP_IPV4_USABLE(ill) : 670 !ILL_LSO_TCP_IPV6_USABLE(ill))) { 671 ixa->ixa_flags &= ~IXAF_LSO_CAPAB; 672 673 return (B_FALSE); 674 } 675 676 /* 677 * Capability has changed, refresh the copy in ixa. 678 */ 679 if (lsoc->ill_lso_max_tcpv4 != new_lsoc->ill_lso_max_tcpv4 || 680 lsoc->ill_lso_max_tcpv6 != new_lsoc->ill_lso_max_tcpv6) { 681 *lsoc = *new_lsoc; 682 683 return (B_FALSE); 684 } 685 } else { /* Was not usable */ 686 if (dohwcksum && 687 !(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 688 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 689 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 690 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 691 ILL_LSO_TCP_IPV4_USABLE(ill) : 692 ILL_LSO_TCP_IPV6_USABLE(ill))) { 693 *lsoc = *new_lsoc; 694 ixa->ixa_flags |= IXAF_LSO_CAPAB; 695 696 return (B_FALSE); 697 } 698 } 699 700 return (B_TRUE); 701 } 702 703 /* 704 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether 705 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6. 706 */ 707 static boolean_t 708 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa) 709 { 710 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) { 711 /* 712 * Not unsable any more. 713 */ 714 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 715 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 716 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 717 !ILL_ZCOPY_USABLE(ill)) { 718 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; 719 720 return (B_FALSE); 721 } 722 } else { /* Was not usable */ 723 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 724 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 725 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 726 ILL_ZCOPY_USABLE(ill)) { 727 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; 728 729 return (B_FALSE); 730 } 731 } 732 733 return (B_TRUE); 734 } 735 736 737 /* 738 * When there is no conn_t context, this will send a packet. 739 * The caller must *not* have called conn_connect() or ip_attr_connect() 740 * before calling ip_output_simple(). 741 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH. 742 * Honors IXAF_SET_SOURCE. 743 * 744 * We acquire the ire and after calling ire_sendfn we release 745 * the hold on the ire. Ditto for the nce and dce. 746 * 747 * This assumes that the caller has set the following in ip_xmit_attr_t: 748 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. 749 * If ixa_ifindex is non-zero it means send out that ill. (If it is 750 * an upper IPMP ill we load balance across the group; if a lower we send 751 * on that lower ill without load balancing.) 752 * IXAF_IS_IPV4 must be set correctly. 753 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. 754 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. 755 * If neither of those two are set we do an IPsec policy lookup. 756 * 757 * We handle setting things like 758 * ixa_pktlen 759 * ixa_ip_hdr_length 760 * ixa->ixa_protocol 761 * 762 * The caller may set ixa_xmit_hint, which is used for ECMP selection and 763 * transmit ring selecting in GLD. 764 * 765 * The caller must do an ixa_cleanup() to release any IPsec references 766 * after we return. 767 */ 768 int 769 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) 770 { 771 ts_label_t *effective_tsl = NULL; 772 int err; 773 774 ASSERT(ixa->ixa_ipst != NULL); 775 776 if (is_system_labeled()) { 777 ip_stack_t *ipst = ixa->ixa_ipst; 778 779 if (ixa->ixa_flags & IXAF_IS_IPV4) { 780 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, 781 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 782 &effective_tsl); 783 } else { 784 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, 785 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 786 &effective_tsl); 787 } 788 if (err != 0) { 789 ip2dbg(("tsol_check: label check failed (%d)\n", err)); 790 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 791 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 792 ip_drop_output("tsol_check_label", mp, NULL); 793 freemsg(mp); 794 return (err); 795 } 796 if (effective_tsl != NULL) { 797 /* Update the label */ 798 ip_xmit_attr_replace_tsl(ixa, effective_tsl); 799 } 800 } 801 802 if (ixa->ixa_flags & IXAF_IS_IPV4) 803 return (ip_output_simple_v4(mp, ixa)); 804 else 805 return (ip_output_simple_v6(mp, ixa)); 806 } 807 808 int 809 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) 810 { 811 ipha_t *ipha; 812 ipaddr_t firsthop; /* In IP header */ 813 ipaddr_t dst; /* End of source route, or ipha_dst if none */ 814 ire_t *ire; 815 ipaddr_t setsrc; /* RTF_SETSRC */ 816 int error; 817 ill_t *ill = NULL; 818 dce_t *dce = NULL; 819 nce_t *nce; 820 iaflags_t ixaflags = ixa->ixa_flags; 821 ip_stack_t *ipst = ixa->ixa_ipst; 822 boolean_t repeat = B_FALSE; 823 boolean_t multirt = B_FALSE; 824 int64_t now; 825 826 ipha = (ipha_t *)mp->b_rptr; 827 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 828 829 /* 830 * Even on labeled systems we can have a NULL ixa_tsl e.g., 831 * for IGMP/MLD traffic. 832 */ 833 834 /* Caller already set flags */ 835 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 836 837 ASSERT(ixa->ixa_nce == NULL); 838 839 ixa->ixa_pktlen = ntohs(ipha->ipha_length); 840 ASSERT(ixa->ixa_pktlen == msgdsize(mp)); 841 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 842 ixa->ixa_protocol = ipha->ipha_protocol; 843 844 /* 845 * Assumes that source routed packets have already been massaged by 846 * the ULP (ip_massage_options) and as a result ipha_dst is the next 847 * hop in the source route. The final destination is used for IPsec 848 * policy and DCE lookup. 849 */ 850 firsthop = ipha->ipha_dst; 851 dst = ip_get_dst(ipha); 852 853 repeat_ire: 854 error = 0; 855 setsrc = INADDR_ANY; 856 ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL, 857 &setsrc, &error, &multirt); 858 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 859 if (error != 0) { 860 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 861 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 862 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); 863 freemsg(mp); 864 goto done; 865 } 866 867 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { 868 /* ire_ill might be NULL hence need to skip some code */ 869 if (ixaflags & IXAF_SET_SOURCE) 870 ipha->ipha_src = htonl(INADDR_LOOPBACK); 871 ixa->ixa_fragsize = IP_MAXPACKET; 872 ill = NULL; 873 nce = NULL; 874 ire->ire_ob_pkt_count++; 875 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 876 /* No dce yet; use default one */ 877 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, 878 &ipst->ips_dce_default->dce_ident); 879 goto done; 880 } 881 882 /* Note that ipha_dst is only used for IRE_MULTICAST */ 883 nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 884 if (nce == NULL) { 885 /* Allocation failure? */ 886 ip_drop_output("ire_to_nce", mp, ill); 887 freemsg(mp); 888 error = ENOBUFS; 889 goto done; 890 } 891 if (nce->nce_is_condemned) { 892 nce_t *nce1; 893 894 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); 895 nce_refrele(nce); 896 if (nce1 == NULL) { 897 if (!repeat) { 898 /* Try finding a better IRE */ 899 repeat = B_TRUE; 900 ire_refrele(ire); 901 goto repeat_ire; 902 } 903 /* Tried twice - drop packet */ 904 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 905 ip_drop_output("No nce", mp, ill); 906 freemsg(mp); 907 error = ENOBUFS; 908 goto done; 909 } 910 nce = nce1; 911 } 912 913 /* 914 * For multicast with multirt we have a flag passed back from 915 * ire_lookup_multi_ill_v4 since we don't have an IRE for each 916 * possible multicast address. 917 * We also need a flag for multicast since we can't check 918 * whether RTF_MULTIRT is set in ixa_ire for multicast. 919 */ 920 if (multirt) { 921 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 922 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 923 } else { 924 ixa->ixa_postfragfn = ire->ire_postfragfn; 925 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 926 } 927 ASSERT(ixa->ixa_nce == NULL); 928 ixa->ixa_nce = nce; 929 930 /* 931 * Check for a dce_t with a path mtu. 932 */ 933 dce = dce_lookup_v4(dst, ipst, NULL); 934 ASSERT(dce != NULL); 935 936 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { 937 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 938 } else if (dce->dce_flags & DCEF_PMTU) { 939 /* 940 * To avoid a periodic timer to increase the path MTU we 941 * look at dce_last_change_time each time we send a packet. 942 */ 943 now = ddi_get_lbolt64(); 944 if (TICK_TO_SEC(now) - dce->dce_last_change_time > 945 ipst->ips_ip_pathmtu_interval) { 946 /* 947 * Older than 20 minutes. Drop the path MTU information. 948 */ 949 mutex_enter(&dce->dce_lock); 950 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 951 dce->dce_last_change_time = TICK_TO_SEC(now); 952 mutex_exit(&dce->dce_lock); 953 dce_increment_generation(dce); 954 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 955 } else { 956 uint_t fragsize; 957 958 fragsize = ip_get_base_mtu(nce->nce_ill, ire); 959 if (fragsize > dce->dce_pmtu) 960 fragsize = dce->dce_pmtu; 961 ixa->ixa_fragsize = fragsize; 962 } 963 } else { 964 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 965 } 966 967 /* 968 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp 969 * interface for source address selection. 970 */ 971 ill = ire_nexthop_ill(ire); 972 973 if (ixaflags & IXAF_SET_SOURCE) { 974 ipaddr_t src; 975 976 /* 977 * We use the final destination to get 978 * correct selection for source routed packets 979 */ 980 981 /* If unreachable we have no ill but need some source */ 982 if (ill == NULL) { 983 src = htonl(INADDR_LOOPBACK); 984 error = 0; 985 } else { 986 error = ip_select_source_v4(ill, setsrc, dst, 987 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, 988 &src, NULL, NULL); 989 } 990 if (error != 0) { 991 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 992 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 993 ip_drop_output("ipIfStatsOutDiscards - no source", 994 mp, ill); 995 freemsg(mp); 996 goto done; 997 } 998 ipha->ipha_src = src; 999 } else if (ixaflags & IXAF_VERIFY_SOURCE) { 1000 /* Check if the IP source is assigned to the host. */ 1001 if (!ip_verify_src(mp, ixa, NULL)) { 1002 /* Don't send a packet with a source that isn't ours */ 1003 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1004 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 1005 ip_drop_output("ipIfStatsOutDiscards - invalid source", 1006 mp, ill); 1007 freemsg(mp); 1008 error = EADDRNOTAVAIL; 1009 goto done; 1010 } 1011 } 1012 1013 1014 /* 1015 * Check against global IPsec policy to set the AH/ESP attributes. 1016 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 1017 */ 1018 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1019 ASSERT(ixa->ixa_ipsec_policy == NULL); 1020 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); 1021 if (mp == NULL) { 1022 /* MIB and ip_drop_packet already done */ 1023 return (EHOSTUNREACH); /* IPsec policy failure */ 1024 } 1025 } 1026 1027 if (ill != NULL) { 1028 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 1029 } else { 1030 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1031 } 1032 1033 /* 1034 * We update the statistics on the most specific IRE i.e., the first 1035 * one we found. 1036 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 1037 * can only count the use prior to fragmentation. However the MIB 1038 * counters on the ill will be incremented in post fragmentation. 1039 */ 1040 ire->ire_ob_pkt_count++; 1041 1042 /* 1043 * Based on ire_type and ire_flags call one of: 1044 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK 1045 * ire_send_multirt_v4 - if RTF_MULTIRT 1046 * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 1047 * ire_send_multicast_v4 - for IRE_MULTICAST 1048 * ire_send_broadcast_v4 - for IRE_BROADCAST 1049 * ire_send_wire_v4 - for the rest. 1050 */ 1051 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); 1052 done: 1053 ire_refrele(ire); 1054 if (dce != NULL) 1055 dce_refrele(dce); 1056 if (ill != NULL) 1057 ill_refrele(ill); 1058 if (ixa->ixa_nce != NULL) 1059 nce_refrele(ixa->ixa_nce); 1060 ixa->ixa_nce = NULL; 1061 return (error); 1062 } 1063 1064 /* 1065 * ire_sendfn() functions. 1066 * These functions use the following xmit_attr: 1067 * - ixa_fragsize - read to determine whether or not to fragment 1068 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec 1069 * - ixa_ipsec_* are used inside IPsec 1070 * - IXAF_SET_SOURCE - replace IP source in broadcast case. 1071 * - IXAF_LOOPBACK_COPY - for multicast and broadcast 1072 */ 1073 1074 1075 /* 1076 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK 1077 * 1078 * The checks for restrict_interzone_loopback are done in ire_route_recursive. 1079 */ 1080 /* ARGSUSED4 */ 1081 int 1082 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1083 ip_xmit_attr_t *ixa, uint32_t *identp) 1084 { 1085 ipha_t *ipha = (ipha_t *)iph_arg; 1086 ip_stack_t *ipst = ixa->ixa_ipst; 1087 ill_t *ill = ire->ire_ill; 1088 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 1089 uint_t pktlen = ixa->ixa_pktlen; 1090 1091 /* 1092 * No fragmentation, no nce, no application of IPsec, 1093 * and no ipha_ident assignment. 1094 * 1095 * Note different order between IP provider and FW_HOOKS than in 1096 * send_wire case. 1097 */ 1098 1099 /* 1100 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1101 * send probe, but not the receive probe. 1102 */ 1103 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1104 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1105 int, 1); 1106 1107 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1108 int error = 0; 1109 1110 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1111 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1112 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1113 ipst->ips_ipv4firewall_loopback_out, 1114 NULL, ill, ipha, mp, mp, 0, ipst, error); 1115 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1116 if (mp == NULL) 1117 return (error); 1118 1119 /* 1120 * Even if the destination was changed by the filter we use the 1121 * forwarding decision that was made based on the address 1122 * in ip_output/ip_set_destination. 1123 */ 1124 /* Length could be different */ 1125 ipha = (ipha_t *)mp->b_rptr; 1126 pktlen = ntohs(ipha->ipha_length); 1127 } 1128 1129 /* 1130 * If a callback is enabled then we need to know the 1131 * source and destination zoneids for the packet. We already 1132 * have those handy. 1133 */ 1134 if (ipst->ips_ip4_observe.he_interested) { 1135 zoneid_t szone, dzone; 1136 zoneid_t stackzoneid; 1137 1138 stackzoneid = netstackid_to_zoneid( 1139 ipst->ips_netstack->netstack_stackid); 1140 1141 if (stackzoneid == GLOBAL_ZONEID) { 1142 /* Shared-IP zone */ 1143 dzone = ire->ire_zoneid; 1144 szone = ixa->ixa_zoneid; 1145 } else { 1146 szone = dzone = stackzoneid; 1147 } 1148 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1149 } 1150 1151 /* Handle lo0 stats */ 1152 ipst->ips_loopback_packets++; 1153 1154 /* Map ixa to ira including IPsec policies */ 1155 ipsec_out_to_in(ixa, ill, &iras); 1156 iras.ira_pktlen = pktlen; 1157 iras.ira_ttl = ipha->ipha_ttl; 1158 1159 if (!IS_SIMPLE_IPH(ipha)) { 1160 ip_output_local_options(ipha, ipst); 1161 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1162 } 1163 1164 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1165 int error = 0; 1166 1167 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1168 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1169 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1170 ipst->ips_ipv4firewall_loopback_in, 1171 ill, NULL, ipha, mp, mp, 0, ipst, error); 1172 1173 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1174 if (mp == NULL) { 1175 ira_cleanup(&iras, B_FALSE); 1176 return (error); 1177 } 1178 /* 1179 * Even if the destination was changed by the filter we use the 1180 * forwarding decision that was made based on the address 1181 * in ip_output/ip_set_destination. 1182 */ 1183 /* Length could be different */ 1184 ipha = (ipha_t *)mp->b_rptr; 1185 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1186 } 1187 1188 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1189 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1190 int, 1); 1191 1192 ire->ire_ib_pkt_count++; 1193 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1194 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1195 1196 /* Destined to ire_zoneid - use that for fanout */ 1197 iras.ira_zoneid = ire->ire_zoneid; 1198 1199 if (is_system_labeled()) { 1200 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1201 1202 /* 1203 * This updates ira_cred, ira_tsl and ira_free_flags based 1204 * on the label. We don't expect this to ever fail for 1205 * loopback packets, so we silently drop the packet should it 1206 * fail. 1207 */ 1208 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1209 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1210 ip_drop_input("tsol_get_pkt_label", mp, ill); 1211 freemsg(mp); 1212 return (0); 1213 } 1214 ASSERT(iras.ira_tsl != NULL); 1215 1216 /* tsol_get_pkt_label sometimes does pullupmsg */ 1217 ipha = (ipha_t *)mp->b_rptr; 1218 } 1219 1220 ip_fanout_v4(mp, ipha, &iras); 1221 1222 /* We moved any IPsec refs from ixa to iras */ 1223 ira_cleanup(&iras, B_FALSE); 1224 return (0); 1225 } 1226 1227 /* 1228 * ire_sendfn for IRE_BROADCAST 1229 * If the broadcast address is present on multiple ills and ixa_ifindex 1230 * isn't set, then we generate 1231 * a separate datagram (potentially with different source address) for 1232 * those ills. In any case, only one copy is looped back to ip_input_v4. 1233 */ 1234 int 1235 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1236 ip_xmit_attr_t *ixa, uint32_t *identp) 1237 { 1238 ipha_t *ipha = (ipha_t *)iph_arg; 1239 ip_stack_t *ipst = ixa->ixa_ipst; 1240 irb_t *irb = ire->ire_bucket; 1241 ire_t *ire1; 1242 mblk_t *mp1; 1243 ipha_t *ipha1; 1244 iaflags_t ixaflags = ixa->ixa_flags; 1245 nce_t *nce1, *nce_orig; 1246 1247 /* 1248 * Unless ire_send_multirt_v4 already set a ttl, force the 1249 * ttl to a smallish value. 1250 */ 1251 if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { 1252 /* 1253 * To avoid broadcast storms, we usually set the TTL to 1 for 1254 * broadcasts. This can 1255 * be overridden stack-wide through the ip_broadcast_ttl 1256 * ndd tunable, or on a per-connection basis through the 1257 * IP_BROADCAST_TTL socket option. 1258 * 1259 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 1260 * will force ttl to one after we've set this. 1261 */ 1262 if (ixaflags & IXAF_BROADCAST_TTL_SET) 1263 ipha->ipha_ttl = ixa->ixa_broadcast_ttl; 1264 else 1265 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 1266 } 1267 /* 1268 * Make sure we get a loopback copy (after IPsec and frag) 1269 * Skip hardware checksum so that loopback copy is checksumed. 1270 */ 1271 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1272 1273 /* Do we need to potentially generate multiple copies? */ 1274 if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) 1275 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1276 1277 /* 1278 * Loop over all IRE_BROADCAST in the bucket (might only be one). 1279 * Note that everything in the bucket has the same destination address. 1280 */ 1281 irb_refhold(irb); 1282 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1283 /* We do the main IRE after the end of the loop */ 1284 if (ire1 == ire) 1285 continue; 1286 1287 /* 1288 * Only IREs for the same IP address should be in the same 1289 * bucket. 1290 * But could have IRE_HOSTs in the case of CGTP. 1291 * If we find any multirt routes we bail out of the loop 1292 * and just do the single packet at the end; ip_postfrag_multirt 1293 * will duplicate the packet. 1294 */ 1295 ASSERT(ire1->ire_addr == ire->ire_addr); 1296 if (!(ire1->ire_type & IRE_BROADCAST)) 1297 continue; 1298 1299 if (IRE_IS_CONDEMNED(ire1)) 1300 continue; 1301 1302 if (ixa->ixa_zoneid != ALL_ZONES && 1303 ire->ire_zoneid != ire1->ire_zoneid) 1304 continue; 1305 1306 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); 1307 1308 if (ire1->ire_flags & RTF_MULTIRT) 1309 break; 1310 1311 /* 1312 * For IPMP we only send for the ipmp_ill. arp_nce_init() will 1313 * ensure that this goes out on the cast_ill. 1314 */ 1315 if (IS_UNDER_IPMP(ire1->ire_ill)) 1316 continue; 1317 1318 mp1 = copymsg(mp); 1319 if (mp1 == NULL) { 1320 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1321 ipIfStatsOutDiscards); 1322 ip_drop_output("ipIfStatsOutDiscards", 1323 mp, ire1->ire_ill); 1324 continue; 1325 } 1326 1327 ipha1 = (ipha_t *)mp1->b_rptr; 1328 if (ixa->ixa_flags & IXAF_SET_SOURCE) { 1329 /* 1330 * Need to pick a different source address for each 1331 * interface. If we have a global IPsec policy and 1332 * no per-socket policy then we punt to 1333 * ip_output_simple_v4 using a separate ip_xmit_attr_t. 1334 */ 1335 if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { 1336 ip_output_simple_broadcast(ixa, mp1); 1337 continue; 1338 } 1339 /* Pick a new source address for each interface */ 1340 if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, 1341 ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, 1342 &ipha1->ipha_src, NULL, NULL) != 0) { 1343 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1344 ipIfStatsOutDiscards); 1345 ip_drop_output("ipIfStatsOutDiscards - select " 1346 "broadcast source", mp1, ire1->ire_ill); 1347 freemsg(mp1); 1348 continue; 1349 } 1350 /* 1351 * Check against global IPsec policy to set the AH/ESP 1352 * attributes. IPsec will set IXAF_IPSEC_* and 1353 * ixa_ipsec_* as appropriate. 1354 */ 1355 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1356 ASSERT(ixa->ixa_ipsec_policy == NULL); 1357 mp1 = ip_output_attach_policy(mp1, ipha, NULL, 1358 NULL, ixa); 1359 if (mp1 == NULL) { 1360 /* 1361 * MIB and ip_drop_packet already 1362 * done 1363 */ 1364 continue; 1365 } 1366 } 1367 } 1368 /* Make sure we have an NCE on this ill */ 1369 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, 1370 ire1->ire_type); 1371 if (nce1 == NULL) { 1372 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1373 ipIfStatsOutDiscards); 1374 ip_drop_output("ipIfStatsOutDiscards - broadcast nce", 1375 mp1, ire1->ire_ill); 1376 freemsg(mp1); 1377 continue; 1378 } 1379 nce_orig = ixa->ixa_nce; 1380 ixa->ixa_nce = nce1; 1381 1382 ire_refhold(ire1); 1383 /* 1384 * Ignore any errors here. We just collect the errno for 1385 * the main ire below 1386 */ 1387 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); 1388 ire_refrele(ire1); 1389 1390 ixa->ixa_nce = nce_orig; 1391 nce_refrele(nce1); 1392 1393 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; 1394 } 1395 irb_refrele(irb); 1396 /* Finally, the main one */ 1397 1398 /* 1399 * For IPMP we only send broadcasts on the ipmp_ill. 1400 */ 1401 if (IS_UNDER_IPMP(ire->ire_ill)) { 1402 freemsg(mp); 1403 return (0); 1404 } 1405 1406 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1407 } 1408 1409 /* 1410 * Send a packet using a different source address and different 1411 * IPsec policy. 1412 */ 1413 static void 1414 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) 1415 { 1416 ip_xmit_attr_t ixas; 1417 1418 bzero(&ixas, sizeof (ixas)); 1419 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 1420 ixas.ixa_zoneid = ixa->ixa_zoneid; 1421 ixas.ixa_ifindex = 0; 1422 ixas.ixa_ipst = ixa->ixa_ipst; 1423 ixas.ixa_cred = ixa->ixa_cred; 1424 ixas.ixa_cpid = ixa->ixa_cpid; 1425 ixas.ixa_tsl = ixa->ixa_tsl; 1426 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1427 1428 (void) ip_output_simple(mp, &ixas); 1429 ixa_cleanup(&ixas); 1430 } 1431 1432 1433 static void 1434 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) 1435 { 1436 ip_stack_t *ipst = ixa->ixa_ipst; 1437 1438 /* Limit the TTL on multirt packets */ 1439 if (ire->ire_type & IRE_MULTICAST) { 1440 if (ipha->ipha_ttl > 1) { 1441 ip2dbg(("ire_send_multirt_v4: forcing multicast " 1442 "multirt TTL to 1 (was %d), dst 0x%08x\n", 1443 ipha->ipha_ttl, ntohl(ire->ire_addr))); 1444 ipha->ipha_ttl = 1; 1445 } 1446 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1447 } else if ((ipst->ips_ip_multirt_ttl > 0) && 1448 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 1449 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 1450 /* 1451 * Need to ensure we don't increase the ttl should we go through 1452 * ire_send_broadcast or multicast. 1453 */ 1454 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1455 } 1456 } 1457 1458 /* 1459 * ire_sendfn for IRE_MULTICAST 1460 */ 1461 int 1462 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1463 ip_xmit_attr_t *ixa, uint32_t *identp) 1464 { 1465 ipha_t *ipha = (ipha_t *)iph_arg; 1466 ip_stack_t *ipst = ixa->ixa_ipst; 1467 ill_t *ill = ire->ire_ill; 1468 iaflags_t ixaflags = ixa->ixa_flags; 1469 1470 /* 1471 * The IRE_MULTICAST is the same whether or not multirt is in use. 1472 * Hence we need special-case code. 1473 */ 1474 if (ixaflags & IXAF_MULTIRT_MULTICAST) 1475 multirt_check_v4(ire, ipha, ixa); 1476 1477 /* 1478 * Check if anything in ip_input_v4 wants a copy of the transmitted 1479 * packet (after IPsec and fragmentation) 1480 * 1481 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set 1482 * RSVP and the rsvp daemon is an example of a 1483 * protocol and user level process that 1484 * handles it's own routing. Hence, it uses the 1485 * SO_DONTROUTE option to accomplish this. 1486 * 2. If the sender has set IP_MULTICAST_LOOP, then we just 1487 * check whether there are any receivers for the group on the ill 1488 * (ignoring the zoneid). 1489 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are 1490 * any members in other shared-IP zones. 1491 * If such members exist, then we indicate that the sending zone 1492 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP 1493 * behavior. 1494 * 1495 * When we loopback we skip hardware checksum to make sure loopback 1496 * copy is checksumed. 1497 * 1498 * Note that ire_ill is the upper in the case of IPMP. 1499 */ 1500 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); 1501 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && 1502 !(ixaflags & IXAF_DONTROUTE)) { 1503 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1504 } else if (ixaflags & IXAF_MULTICAST_LOOP) { 1505 /* 1506 * If this zone or any other zone has members then loopback 1507 * a copy. 1508 */ 1509 if (ill_hasmembers_v4(ill, ipha->ipha_dst)) 1510 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1511 } else if (ipst->ips_netstack->netstack_numzones > 1) { 1512 /* 1513 * This zone should not have a copy. But there are some other 1514 * zones which might have members. 1515 */ 1516 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1517 ixa->ixa_zoneid)) { 1518 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; 1519 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; 1520 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1521 } 1522 } 1523 1524 /* 1525 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, 1526 * force the ttl to the IP_MULTICAST_TTL value 1527 */ 1528 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { 1529 ipha->ipha_ttl = ixa->ixa_multicast_ttl; 1530 } 1531 1532 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1533 } 1534 1535 /* 1536 * ire_sendfn for IREs with RTF_MULTIRT 1537 */ 1538 int 1539 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1540 ip_xmit_attr_t *ixa, uint32_t *identp) 1541 { 1542 ipha_t *ipha = (ipha_t *)iph_arg; 1543 1544 multirt_check_v4(ire, ipha, ixa); 1545 1546 if (ire->ire_type & IRE_MULTICAST) 1547 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); 1548 else if (ire->ire_type & IRE_BROADCAST) 1549 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); 1550 else 1551 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1552 } 1553 1554 /* 1555 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE 1556 */ 1557 int 1558 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1559 ip_xmit_attr_t *ixa, uint32_t *identp) 1560 { 1561 ip_stack_t *ipst = ixa->ixa_ipst; 1562 ipha_t *ipha = (ipha_t *)iph_arg; 1563 ill_t *ill; 1564 ip_recv_attr_t iras; 1565 boolean_t dummy; 1566 1567 /* We assign an IP ident for nice errors */ 1568 ipha->ipha_ident = atomic_inc_32_nv(identp); 1569 1570 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 1571 1572 if (ire->ire_type & IRE_NOROUTE) { 1573 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ 1574 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, 1575 RTA_DST, ipst); 1576 } 1577 1578 if (ire->ire_flags & RTF_BLACKHOLE) { 1579 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); 1580 freemsg(mp); 1581 /* No error even for local senders - silent blackhole */ 1582 return (0); 1583 } 1584 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); 1585 1586 /* 1587 * We need an ill_t for the ip_recv_attr_t even though this packet 1588 * was never received and icmp_unreachable doesn't currently use 1589 * ira_ill. 1590 */ 1591 ill = ill_lookup_on_name("lo0", B_FALSE, 1592 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); 1593 if (ill == NULL) { 1594 freemsg(mp); 1595 return (EHOSTUNREACH); 1596 } 1597 1598 bzero(&iras, sizeof (iras)); 1599 /* Map ixa to ira including IPsec policies */ 1600 ipsec_out_to_in(ixa, ill, &iras); 1601 1602 if (ip_source_routed(ipha, ipst)) { 1603 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); 1604 } else { 1605 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 1606 } 1607 /* We moved any IPsec refs from ixa to iras */ 1608 ira_cleanup(&iras, B_FALSE); 1609 ill_refrele(ill); 1610 return (EHOSTUNREACH); 1611 } 1612 1613 /* 1614 * Calculate a checksum ignoring any hardware capabilities 1615 * 1616 * Returns B_FALSE if the packet was too short for the checksum. Caller 1617 * should free and do stats. 1618 */ 1619 static boolean_t 1620 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa) 1621 { 1622 ip_stack_t *ipst = ixa->ixa_ipst; 1623 uint_t pktlen = ixa->ixa_pktlen; 1624 uint16_t *cksump; 1625 uint32_t cksum; 1626 uint8_t protocol = ixa->ixa_protocol; 1627 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1628 ipaddr_t dst = ipha->ipha_dst; 1629 ipaddr_t src = ipha->ipha_src; 1630 1631 /* Just in case it contained garbage */ 1632 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1633 1634 /* 1635 * Calculate ULP checksum 1636 */ 1637 if (protocol == IPPROTO_TCP) { 1638 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1639 cksum = IP_TCP_CSUM_COMP; 1640 } else if (protocol == IPPROTO_UDP) { 1641 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1642 cksum = IP_UDP_CSUM_COMP; 1643 } else if (protocol == IPPROTO_SCTP) { 1644 sctp_hdr_t *sctph; 1645 1646 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1647 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1648 /* 1649 * Zero out the checksum field to ensure proper 1650 * checksum calculation. 1651 */ 1652 sctph->sh_chksum = 0; 1653 #ifdef DEBUG 1654 if (!skip_sctp_cksum) 1655 #endif 1656 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1657 goto ip_hdr_cksum; 1658 } else { 1659 goto ip_hdr_cksum; 1660 } 1661 1662 /* ULP puts the checksum field is in the first mblk */ 1663 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1664 1665 /* 1666 * We accumulate the pseudo header checksum in cksum. 1667 * This is pretty hairy code, so watch close. One 1668 * thing to keep in mind is that UDP and TCP have 1669 * stored their respective datagram lengths in their 1670 * checksum fields. This lines things up real nice. 1671 */ 1672 cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 1673 1674 cksum = IP_CSUM(mp, ip_hdr_length, cksum); 1675 /* 1676 * For UDP/IPv4 a zero means that the packets wasn't checksummed. 1677 * Change to 0xffff 1678 */ 1679 if (protocol == IPPROTO_UDP && cksum == 0) 1680 *cksump = ~cksum; 1681 else 1682 *cksump = cksum; 1683 1684 IP_STAT(ipst, ip_out_sw_cksum); 1685 IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen); 1686 1687 ip_hdr_cksum: 1688 /* Calculate IPv4 header checksum */ 1689 ipha->ipha_hdr_checksum = 0; 1690 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1691 return (B_TRUE); 1692 } 1693 1694 /* 1695 * Calculate the ULP checksum - try to use hardware. 1696 * In the case of MULTIRT, broadcast or multicast the 1697 * IXAF_NO_HW_CKSUM is set in which case we use software. 1698 * 1699 * If the hardware supports IP header checksum offload; then clear the 1700 * contents of IP header checksum field as expected by NIC. 1701 * Do this only if we offloaded either full or partial sum. 1702 * 1703 * Returns B_FALSE if the packet was too short for the checksum. Caller 1704 * should free and do stats. 1705 */ 1706 static boolean_t 1707 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, 1708 ip_xmit_attr_t *ixa, ill_t *ill) 1709 { 1710 uint_t pktlen = ixa->ixa_pktlen; 1711 uint16_t *cksump; 1712 uint16_t hck_flags; 1713 uint32_t cksum; 1714 uint8_t protocol = ixa->ixa_protocol; 1715 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1716 1717 if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || 1718 !dohwcksum) { 1719 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1720 } 1721 1722 /* 1723 * Calculate ULP checksum. Note that we don't use cksump and cksum 1724 * if the ill has FULL support. 1725 */ 1726 if (protocol == IPPROTO_TCP) { 1727 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1728 cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ 1729 } else if (protocol == IPPROTO_UDP) { 1730 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1731 cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ 1732 } else if (protocol == IPPROTO_SCTP) { 1733 sctp_hdr_t *sctph; 1734 1735 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1736 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1737 /* 1738 * Zero out the checksum field to ensure proper 1739 * checksum calculation. 1740 */ 1741 sctph->sh_chksum = 0; 1742 #ifdef DEBUG 1743 if (!skip_sctp_cksum) 1744 #endif 1745 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1746 goto ip_hdr_cksum; 1747 } else if (protocol == IPPROTO_ICMP) { 1748 /* 1749 * Note that we always calculate a SW checksum for ICMP. In the 1750 * future, if HW support for ICMP is advertised, we can change 1751 * this. 1752 */ 1753 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1754 } else { 1755 ip_hdr_cksum: 1756 /* Calculate IPv4 header checksum */ 1757 ipha->ipha_hdr_checksum = 0; 1758 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1759 return (B_TRUE); 1760 } 1761 1762 /* ULP puts the checksum field is in the first mblk */ 1763 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1764 1765 /* 1766 * Underlying interface supports hardware checksum offload for 1767 * the payload; leave the payload checksum for the hardware to 1768 * calculate. N.B: We only need to set up checksum info on the 1769 * first mblk. 1770 */ 1771 hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; 1772 1773 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1774 if (hck_flags & HCKSUM_INET_FULL_V4) { 1775 /* 1776 * Hardware calculates pseudo-header, header and the 1777 * payload checksums, so clear the checksum field in 1778 * the protocol header. 1779 */ 1780 *cksump = 0; 1781 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; 1782 1783 ipha->ipha_hdr_checksum = 0; 1784 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1785 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1786 } else { 1787 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1788 } 1789 return (B_TRUE); 1790 } 1791 if ((hck_flags) & HCKSUM_INET_PARTIAL) { 1792 ipaddr_t dst = ipha->ipha_dst; 1793 ipaddr_t src = ipha->ipha_src; 1794 /* 1795 * Partial checksum offload has been enabled. Fill 1796 * the checksum field in the protocol header with the 1797 * pseudo-header checksum value. 1798 * 1799 * We accumulate the pseudo header checksum in cksum. 1800 * This is pretty hairy code, so watch close. One 1801 * thing to keep in mind is that UDP and TCP have 1802 * stored their respective datagram lengths in their 1803 * checksum fields. This lines things up real nice. 1804 */ 1805 cksum += (dst >> 16) + (dst & 0xFFFF) + 1806 (src >> 16) + (src & 0xFFFF); 1807 cksum += *(cksump); 1808 cksum = (cksum & 0xFFFF) + (cksum >> 16); 1809 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); 1810 1811 /* 1812 * Offsets are relative to beginning of IP header. 1813 */ 1814 DB_CKSUMSTART(mp) = ip_hdr_length; 1815 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha; 1816 DB_CKSUMEND(mp) = pktlen; 1817 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; 1818 1819 ipha->ipha_hdr_checksum = 0; 1820 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1821 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1822 } else { 1823 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1824 } 1825 return (B_TRUE); 1826 } 1827 /* Hardware capabilities include neither full nor partial IPv4 */ 1828 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1829 } 1830 1831 /* 1832 * ire_sendfn for offlink and onlink destinations. 1833 * Also called from the multicast, broadcast, multirt send functions. 1834 * 1835 * Assumes that the caller has a hold on the ire. 1836 * 1837 * This function doesn't care if the IRE just became condemned since that 1838 * can happen at any time. 1839 */ 1840 /* ARGSUSED */ 1841 int 1842 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1843 ip_xmit_attr_t *ixa, uint32_t *identp) 1844 { 1845 ip_stack_t *ipst = ixa->ixa_ipst; 1846 ipha_t *ipha = (ipha_t *)iph_arg; 1847 iaflags_t ixaflags = ixa->ixa_flags; 1848 ill_t *ill; 1849 1850 ASSERT(ixa->ixa_nce != NULL); 1851 ill = ixa->ixa_nce->nce_ill; 1852 1853 if (ixaflags & IXAF_DONTROUTE) 1854 ipha->ipha_ttl = 1; 1855 1856 /* 1857 * Assign an ident value for this packet. There could be other 1858 * threads targeting the same destination, so we have to arrange 1859 * for a atomic increment. Note that we use a 32-bit atomic add 1860 * because it has better performance than its 16-bit sibling. 1861 * 1862 * Normally ixa_extra_ident is 0, but in the case of LSO it will 1863 * be the number of TCP segments that the driver/hardware will 1864 * extraly construct. 1865 * 1866 * If running in cluster mode and if the source address 1867 * belongs to a replicated service then vector through 1868 * cl_inet_ipident vector to allocate ip identifier 1869 * NOTE: This is a contract private interface with the 1870 * clustering group. 1871 */ 1872 if (cl_inet_ipident != NULL) { 1873 ipaddr_t src = ipha->ipha_src; 1874 ipaddr_t dst = ipha->ipha_dst; 1875 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; 1876 1877 ASSERT(cl_inet_isclusterwide != NULL); 1878 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, 1879 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { 1880 /* 1881 * Note: not correct with LSO since we can't allocate 1882 * ixa_extra_ident+1 consecutive values. 1883 */ 1884 ipha->ipha_ident = (*cl_inet_ipident)(stack_id, 1885 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, 1886 (uint8_t *)(uintptr_t)dst, NULL); 1887 } else { 1888 ipha->ipha_ident = atomic_add_32_nv(identp, 1889 ixa->ixa_extra_ident + 1); 1890 } 1891 } else { 1892 ipha->ipha_ident = atomic_add_32_nv(identp, 1893 ixa->ixa_extra_ident + 1); 1894 } 1895 #ifndef _BIG_ENDIAN 1896 ipha->ipha_ident = htons(ipha->ipha_ident); 1897 #endif 1898 1899 /* 1900 * This might set b_band, thus the IPsec and fragmentation 1901 * code in IP ensures that b_band is updated in the first mblk. 1902 */ 1903 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 1904 /* ip_process translates an IS_UNDER_IPMP */ 1905 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); 1906 if (mp == NULL) { 1907 /* ip_drop_packet and MIB done */ 1908 return (0); /* Might just be delayed */ 1909 } 1910 } 1911 1912 /* 1913 * Verify any IPv4 options. 1914 * 1915 * The presence of IP options also forces the network stack to 1916 * calculate the checksum in software. This is because: 1917 * 1918 * Wrap around: certain partial-checksum NICs (eri, ce) limit 1919 * the size of "start offset" width to 6-bit. This effectively 1920 * sets the largest value of the offset to 64-bytes, starting 1921 * from the MAC header. When the cumulative MAC and IP headers 1922 * exceed such limit, the offset will wrap around. This causes 1923 * the checksum to be calculated at the wrong place. 1924 * 1925 * IPv4 source routing: none of the full-checksum capable NICs 1926 * is capable of correctly handling the IPv4 source-routing 1927 * option for purposes of calculating the pseudo-header; the 1928 * actual destination is different from the destination in the 1929 * header which is that of the next-hop. (This case may not be 1930 * true for NICs which can parse IPv6 extension headers, but 1931 * we choose to simplify the implementation by not offloading 1932 * checksum when they are present.) 1933 */ 1934 if (!IS_SIMPLE_IPH(ipha)) { 1935 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM; 1936 /* An IS_UNDER_IPMP ill is ok here */ 1937 if (ip_output_options(mp, ipha, ixa, ill)) { 1938 /* Packet has been consumed and ICMP error sent */ 1939 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1940 return (EINVAL); 1941 } 1942 } 1943 1944 /* 1945 * To handle IPsec/iptun's labeling needs we need to tag packets 1946 * while we still have ixa_tsl 1947 */ 1948 if (is_system_labeled() && ixa->ixa_tsl != NULL && 1949 (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || 1950 ill->ill_mactype == DL_IPV6)) { 1951 cred_t *newcr; 1952 1953 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, 1954 KM_NOSLEEP); 1955 if (newcr == NULL) { 1956 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1957 ip_drop_output("ipIfStatsOutDiscards - newcr", 1958 mp, ill); 1959 freemsg(mp); 1960 return (ENOBUFS); 1961 } 1962 mblk_setcred(mp, newcr, NOPID); 1963 crfree(newcr); /* mblk_setcred did its own crhold */ 1964 } 1965 1966 if (ixa->ixa_pktlen > ixa->ixa_fragsize || 1967 (ixaflags & IXAF_IPSEC_SECURE)) { 1968 uint32_t pktlen; 1969 1970 pktlen = ixa->ixa_pktlen; 1971 if (ixaflags & IXAF_IPSEC_SECURE) 1972 pktlen += ipsec_out_extra_length(ixa); 1973 1974 if (pktlen > IP_MAXPACKET) 1975 return (EMSGSIZE); 1976 1977 if (ixaflags & IXAF_SET_ULP_CKSUM) { 1978 /* 1979 * Compute ULP checksum and IP header checksum 1980 * using software 1981 */ 1982 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) { 1983 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1984 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 1985 freemsg(mp); 1986 return (EINVAL); 1987 } 1988 } else { 1989 /* Calculate IPv4 header checksum */ 1990 ipha->ipha_hdr_checksum = 0; 1991 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1992 } 1993 1994 /* 1995 * If this packet would generate a icmp_frag_needed 1996 * message, we need to handle it before we do the IPsec 1997 * processing. Otherwise, we need to strip the IPsec 1998 * headers before we send up the message to the ULPs 1999 * which becomes messy and difficult. 2000 * 2001 * We check using IXAF_DONTFRAG. The DF bit in the header 2002 * is not inspected - it will be copied to any generated 2003 * fragments. 2004 */ 2005 if ((pktlen > ixa->ixa_fragsize) && 2006 (ixaflags & IXAF_DONTFRAG)) { 2007 /* Generate ICMP and return error */ 2008 ip_recv_attr_t iras; 2009 2010 DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen, 2011 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2012 uint_t, ixa->ixa_pmtu); 2013 2014 bzero(&iras, sizeof (iras)); 2015 /* Map ixa to ira including IPsec policies */ 2016 ipsec_out_to_in(ixa, ill, &iras); 2017 2018 ip_drop_output("ICMP_FRAG_NEEDED", mp, ill); 2019 icmp_frag_needed(mp, ixa->ixa_fragsize, &iras); 2020 /* We moved any IPsec refs from ixa to iras */ 2021 ira_cleanup(&iras, B_FALSE); 2022 return (EMSGSIZE); 2023 } 2024 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen, 2025 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2026 uint_t, ixa->ixa_pmtu); 2027 2028 if (ixaflags & IXAF_IPSEC_SECURE) { 2029 /* 2030 * Pass in sufficient information so that 2031 * IPsec can determine whether to fragment, and 2032 * which function to call after fragmentation. 2033 */ 2034 return (ipsec_out_process(mp, ixa)); 2035 } 2036 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags, 2037 ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, 2038 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, 2039 ixa->ixa_postfragfn, &ixa->ixa_cookie)); 2040 } 2041 if (ixaflags & IXAF_SET_ULP_CKSUM) { 2042 /* Compute ULP checksum and IP header checksum */ 2043 /* An IS_UNDER_IPMP ill is ok here */ 2044 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) { 2045 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2046 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2047 freemsg(mp); 2048 return (EINVAL); 2049 } 2050 } else { 2051 /* Calculate IPv4 header checksum */ 2052 ipha->ipha_hdr_checksum = 0; 2053 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2054 } 2055 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, 2056 ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, 2057 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); 2058 } 2059 2060 /* 2061 * Send mp into ip_input 2062 * Common for IPv4 and IPv6 2063 */ 2064 void 2065 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2066 uint_t pkt_len, zoneid_t nolzid) 2067 { 2068 rtc_t rtc; 2069 ill_t *ill = nce->nce_ill; 2070 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 2071 ncec_t *ncec; 2072 2073 ncec = nce->nce_common; 2074 iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM | 2075 IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; 2076 if (ncec->ncec_flags & NCE_F_BCAST) 2077 iras.ira_flags |= IRAF_L2DST_BROADCAST; 2078 else if (ncec->ncec_flags & NCE_F_MCAST) 2079 iras.ira_flags |= IRAF_L2DST_MULTICAST; 2080 2081 iras.ira_free_flags = 0; 2082 iras.ira_cred = NULL; 2083 iras.ira_cpid = NOPID; 2084 iras.ira_tsl = NULL; 2085 iras.ira_zoneid = ALL_ZONES; 2086 iras.ira_pktlen = pkt_len; 2087 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen); 2088 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 2089 2090 if (ixaflags & IXAF_IS_IPV4) 2091 iras.ira_flags |= IRAF_IS_IPV4; 2092 2093 iras.ira_ill = iras.ira_rill = ill; 2094 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2095 iras.ira_rifindex = iras.ira_ruifindex; 2096 iras.ira_mhip = NULL; 2097 2098 iras.ira_flags |= ixaflags & IAF_MASK; 2099 iras.ira_no_loop_zoneid = nolzid; 2100 2101 /* Broadcast and multicast doesn't care about the squeue */ 2102 iras.ira_sqp = NULL; 2103 2104 rtc.rtc_ire = NULL; 2105 if (ixaflags & IXAF_IS_IPV4) { 2106 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2107 2108 rtc.rtc_ipaddr = INADDR_ANY; 2109 2110 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 2111 if (rtc.rtc_ire != NULL) { 2112 ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 2113 ire_refrele(rtc.rtc_ire); 2114 } 2115 } else { 2116 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2117 2118 rtc.rtc_ip6addr = ipv6_all_zeros; 2119 2120 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); 2121 if (rtc.rtc_ire != NULL) { 2122 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); 2123 ire_refrele(rtc.rtc_ire); 2124 } 2125 } 2126 /* Any references to clean up? No hold on ira */ 2127 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 2128 ira_cleanup(&iras, B_FALSE); 2129 } 2130 2131 /* 2132 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which 2133 * looks at the IXAF_LOOPBACK_COPY flag. 2134 * Common for IPv4 and IPv6. 2135 * 2136 * If the loopback copy fails (due to no memory) but we send the packet out 2137 * on the wire we return no failure. Only in the case we supress the wire 2138 * sending do we take the loopback failure into account. 2139 * 2140 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. 2141 * Those operations are performed on this packet in ip_xmit() and it would 2142 * be odd to do it twice for the same packet. 2143 */ 2144 int 2145 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2146 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2147 uintptr_t *ixacookie) 2148 { 2149 ill_t *ill = nce->nce_ill; 2150 int error = 0; 2151 2152 /* 2153 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2154 * had looped it back 2155 */ 2156 if (ixaflags & IXAF_LOOPBACK_COPY) { 2157 mblk_t *mp1; 2158 2159 mp1 = copymsg(mp); 2160 if (mp1 == NULL) { 2161 /* Failed to deliver the loopback copy. */ 2162 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2163 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2164 error = ENOBUFS; 2165 } else { 2166 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2167 nolzid); 2168 } 2169 } 2170 2171 /* 2172 * If TTL = 0 then only do the loopback to this host i.e. we are 2173 * done. We are also done if this was the 2174 * loopback interface since it is sufficient 2175 * to loopback one copy of a multicast packet. 2176 */ 2177 if (ixaflags & IXAF_IS_IPV4) { 2178 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2179 2180 if (ipha->ipha_ttl == 0) { 2181 ip_drop_output("multicast ipha_ttl not sent to wire", 2182 mp, ill); 2183 freemsg(mp); 2184 return (error); 2185 } 2186 } else { 2187 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2188 2189 if (ip6h->ip6_hops == 0) { 2190 ip_drop_output("multicast ipha_ttl not sent to wire", 2191 mp, ill); 2192 freemsg(mp); 2193 return (error); 2194 } 2195 } 2196 if (nce->nce_ill->ill_wq == NULL) { 2197 /* Loopback interface */ 2198 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2199 freemsg(mp); 2200 return (error); 2201 } 2202 2203 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2204 ixacookie)); 2205 } 2206 2207 /* 2208 * Post fragmentation function for RTF_MULTIRT routes. 2209 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2210 * checks IXAF_LOOPBACK_COPY. 2211 * 2212 * If no packet is sent due to failures then we return an errno, but if at 2213 * least one succeeded we return zero. 2214 */ 2215 int 2216 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2217 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2218 uintptr_t *ixacookie) 2219 { 2220 irb_t *irb; 2221 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2222 ire_t *ire; 2223 ire_t *ire1; 2224 mblk_t *mp1; 2225 nce_t *nce1; 2226 ill_t *ill = nce->nce_ill; 2227 ill_t *ill1; 2228 ip_stack_t *ipst = ill->ill_ipst; 2229 int error = 0; 2230 int num_sent = 0; 2231 int err; 2232 uint_t ire_type; 2233 ipaddr_t nexthop; 2234 2235 ASSERT(ixaflags & IXAF_IS_IPV4); 2236 2237 /* Check for IXAF_LOOPBACK_COPY */ 2238 if (ixaflags & IXAF_LOOPBACK_COPY) { 2239 mblk_t *mp1; 2240 2241 mp1 = copymsg(mp); 2242 if (mp1 == NULL) { 2243 /* Failed to deliver the loopback copy. */ 2244 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2245 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2246 error = ENOBUFS; 2247 } else { 2248 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2249 nolzid); 2250 } 2251 } 2252 2253 /* 2254 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send 2255 * a copy to each one. 2256 * Use the nce (nexthop) and ipha_dst to find the ire. 2257 * 2258 * MULTIRT is not designed to work with shared-IP zones thus we don't 2259 * need to pass a zoneid or a label to the IRE lookup. 2260 */ 2261 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { 2262 /* Broadcast and multicast case */ 2263 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, 2264 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 2265 } else { 2266 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); 2267 2268 /* Unicast case */ 2269 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, 2270 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); 2271 } 2272 2273 if (ire == NULL || 2274 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2275 !(ire->ire_flags & RTF_MULTIRT)) { 2276 /* Drop */ 2277 ip_drop_output("ip_postfrag_multirt didn't find route", 2278 mp, nce->nce_ill); 2279 if (ire != NULL) 2280 ire_refrele(ire); 2281 return (ENETUNREACH); 2282 } 2283 2284 irb = ire->ire_bucket; 2285 irb_refhold(irb); 2286 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 2287 /* 2288 * For broadcast we can have a mixture of IRE_BROADCAST and 2289 * IRE_HOST due to the manually added IRE_HOSTs that are used 2290 * to trigger the creation of the special CGTP broadcast routes. 2291 * Thus we have to skip if ire_type doesn't match the original. 2292 */ 2293 if (IRE_IS_CONDEMNED(ire1) || 2294 !(ire1->ire_flags & RTF_MULTIRT) || 2295 ire1->ire_type != ire->ire_type) 2296 continue; 2297 2298 /* Do the ire argument one after the loop */ 2299 if (ire1 == ire) 2300 continue; 2301 2302 ill1 = ire_nexthop_ill(ire1); 2303 if (ill1 == NULL) { 2304 /* 2305 * This ire might not have been picked by 2306 * ire_route_recursive, in which case ire_dep might 2307 * not have been setup yet. 2308 * We kick ire_route_recursive to try to resolve 2309 * starting at ire1. 2310 */ 2311 ire_t *ire2; 2312 uint_t match_flags = MATCH_IRE_DSTONLY; 2313 2314 if (ire1->ire_ill != NULL) 2315 match_flags |= MATCH_IRE_ILL; 2316 ire2 = ire_route_recursive_impl_v4(ire1, 2317 ire1->ire_addr, ire1->ire_type, ire1->ire_ill, 2318 ire1->ire_zoneid, NULL, match_flags, 2319 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2320 if (ire2 != NULL) 2321 ire_refrele(ire2); 2322 ill1 = ire_nexthop_ill(ire1); 2323 } 2324 2325 if (ill1 == NULL) { 2326 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2327 ip_drop_output("ipIfStatsOutDiscards - no ill", 2328 mp, ill); 2329 error = ENETUNREACH; 2330 continue; 2331 } 2332 2333 /* Pick the addr and type to use for arp_nce_init */ 2334 if (nce->nce_common->ncec_flags & NCE_F_BCAST) { 2335 ire_type = IRE_BROADCAST; 2336 nexthop = ire1->ire_gateway_addr; 2337 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { 2338 ire_type = IRE_MULTICAST; 2339 nexthop = ipha->ipha_dst; 2340 } else { 2341 ire_type = ire1->ire_type; /* Doesn't matter */ 2342 nexthop = ire1->ire_gateway_addr; 2343 } 2344 2345 /* If IPMP meta or under, then we just drop */ 2346 if (ill1->ill_grp != NULL) { 2347 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2348 ip_drop_output("ipIfStatsOutDiscards - IPMP", 2349 mp, ill1); 2350 ill_refrele(ill1); 2351 error = ENETUNREACH; 2352 continue; 2353 } 2354 2355 nce1 = arp_nce_init(ill1, nexthop, ire_type); 2356 if (nce1 == NULL) { 2357 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2358 ip_drop_output("ipIfStatsOutDiscards - no nce", 2359 mp, ill1); 2360 ill_refrele(ill1); 2361 error = ENETUNREACH; 2362 continue; 2363 } 2364 mp1 = copymsg(mp); 2365 if (mp1 == NULL) { 2366 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2367 ip_drop_output("ipIfStatsOutDiscards", mp, ill1); 2368 nce_refrele(nce1); 2369 ill_refrele(ill1); 2370 error = ENOBUFS; 2371 continue; 2372 } 2373 /* Preserve HW checksum for this copy */ 2374 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 2375 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 2376 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 2377 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 2378 DB_LSOMSS(mp1) = DB_LSOMSS(mp); 2379 2380 ire1->ire_ob_pkt_count++; 2381 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, 2382 0, ixacookie); 2383 if (err == 0) 2384 num_sent++; 2385 else 2386 error = err; 2387 nce_refrele(nce1); 2388 ill_refrele(ill1); 2389 } 2390 irb_refrele(irb); 2391 ire_refrele(ire); 2392 /* Finally, the main one */ 2393 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2394 ixacookie); 2395 if (err == 0) 2396 num_sent++; 2397 else 2398 error = err; 2399 if (num_sent > 0) 2400 return (0); 2401 else 2402 return (error); 2403 } 2404 2405 /* 2406 * Verify local connectivity. This check is called by ULP fusion code. 2407 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if 2408 * the interface is brought down and back up. So we simply fail the local 2409 * process. The caller, TCP Fusion, should unfuse the connection. 2410 */ 2411 boolean_t 2412 ip_output_verify_local(ip_xmit_attr_t *ixa) 2413 { 2414 ire_t *ire = ixa->ixa_ire; 2415 2416 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) 2417 return (B_FALSE); 2418 2419 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); 2420 } 2421 2422 /* 2423 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. 2424 * 2425 * The caller must call ip_output_verify_local() first. This function handles 2426 * IPobs, FW_HOOKS, and/or IPsec cases sequentially. 2427 */ 2428 mblk_t * 2429 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out, 2430 boolean_t hooks_in, conn_t *peer_connp) 2431 { 2432 ill_t *ill = ixa->ixa_ire->ire_ill; 2433 ipha_t *ipha = NULL; 2434 ip6_t *ip6h = NULL; 2435 ip_stack_t *ipst = ixa->ixa_ipst; 2436 iaflags_t ixaflags = ixa->ixa_flags; 2437 ip_recv_attr_t iras; 2438 int error; 2439 2440 ASSERT(mp != NULL); 2441 2442 if (ixaflags & IXAF_IS_IPV4) { 2443 ipha = (ipha_t *)mp->b_rptr; 2444 2445 /* 2446 * If a callback is enabled then we need to know the 2447 * source and destination zoneids for the packet. We already 2448 * have those handy. 2449 */ 2450 if (ipst->ips_ip4_observe.he_interested) { 2451 zoneid_t szone, dzone; 2452 zoneid_t stackzoneid; 2453 2454 stackzoneid = netstackid_to_zoneid( 2455 ipst->ips_netstack->netstack_stackid); 2456 2457 if (stackzoneid == GLOBAL_ZONEID) { 2458 /* Shared-IP zone */ 2459 dzone = ixa->ixa_ire->ire_zoneid; 2460 szone = ixa->ixa_zoneid; 2461 } else { 2462 szone = dzone = stackzoneid; 2463 } 2464 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2465 ipst); 2466 } 2467 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2468 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2469 NULL, int, 1); 2470 2471 /* FW_HOOKS: LOOPBACK_OUT */ 2472 if (hooks_out) { 2473 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 2474 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 2475 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 2476 ipst->ips_ipv4firewall_loopback_out, 2477 NULL, ill, ipha, mp, mp, 0, ipst, error); 2478 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 2479 } 2480 if (mp == NULL) 2481 return (NULL); 2482 2483 /* FW_HOOKS: LOOPBACK_IN */ 2484 if (hooks_in) { 2485 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 2486 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 2487 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 2488 ipst->ips_ipv4firewall_loopback_in, 2489 ill, NULL, ipha, mp, mp, 0, ipst, error); 2490 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 2491 } 2492 if (mp == NULL) 2493 return (NULL); 2494 2495 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2496 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2497 NULL, int, 1); 2498 2499 /* Inbound IPsec polocies */ 2500 if (peer_connp != NULL) { 2501 /* Map ixa to ira including IPsec policies. */ 2502 ipsec_out_to_in(ixa, ill, &iras); 2503 mp = ipsec_check_inbound_policy(mp, peer_connp, ipha, 2504 NULL, &iras); 2505 } 2506 } else { 2507 ip6h = (ip6_t *)mp->b_rptr; 2508 2509 /* 2510 * If a callback is enabled then we need to know the 2511 * source and destination zoneids for the packet. We already 2512 * have those handy. 2513 */ 2514 if (ipst->ips_ip6_observe.he_interested) { 2515 zoneid_t szone, dzone; 2516 zoneid_t stackzoneid; 2517 2518 stackzoneid = netstackid_to_zoneid( 2519 ipst->ips_netstack->netstack_stackid); 2520 2521 if (stackzoneid == GLOBAL_ZONEID) { 2522 /* Shared-IP zone */ 2523 dzone = ixa->ixa_ire->ire_zoneid; 2524 szone = ixa->ixa_zoneid; 2525 } else { 2526 szone = dzone = stackzoneid; 2527 } 2528 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2529 ipst); 2530 } 2531 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2532 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2533 ip6h, int, 1); 2534 2535 /* FW_HOOKS: LOOPBACK_OUT */ 2536 if (hooks_out) { 2537 DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, 2538 ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); 2539 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 2540 ipst->ips_ipv6firewall_loopback_out, 2541 NULL, ill, ip6h, mp, mp, 0, ipst, error); 2542 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); 2543 } 2544 if (mp == NULL) 2545 return (NULL); 2546 2547 /* FW_HOOKS: LOOPBACK_IN */ 2548 if (hooks_in) { 2549 DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, 2550 ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); 2551 FW_HOOKS6(ipst->ips_ip6_loopback_in_event, 2552 ipst->ips_ipv6firewall_loopback_in, 2553 ill, NULL, ip6h, mp, mp, 0, ipst, error); 2554 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); 2555 } 2556 if (mp == NULL) 2557 return (NULL); 2558 2559 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2560 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2561 ip6h, int, 1); 2562 2563 /* Inbound IPsec polocies */ 2564 if (peer_connp != NULL) { 2565 /* Map ixa to ira including IPsec policies. */ 2566 ipsec_out_to_in(ixa, ill, &iras); 2567 mp = ipsec_check_inbound_policy(mp, peer_connp, NULL, 2568 ip6h, &iras); 2569 } 2570 } 2571 2572 if (mp == NULL) { 2573 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2574 ip_drop_input("ipIfStatsInDiscards", NULL, ill); 2575 } 2576 2577 return (mp); 2578 } 2579