1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsubr.h> 31 #include <sys/dlpi.h> 32 #include <sys/strsun.h> 33 #include <sys/zone.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/debug.h> 38 #include <sys/atomic.h> 39 40 #include <sys/systm.h> 41 #include <sys/param.h> 42 #include <sys/kmem.h> 43 #include <sys/sdt.h> 44 #include <sys/socket.h> 45 #include <sys/mac.h> 46 #include <net/if.h> 47 #include <net/if_arp.h> 48 #include <net/route.h> 49 #include <sys/sockio.h> 50 #include <netinet/in.h> 51 #include <net/if_dl.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/nd.h> 57 #include <inet/arp.h> 58 #include <inet/snmpcom.h> 59 #include <inet/kstatcom.h> 60 61 #include <netinet/igmp_var.h> 62 #include <netinet/ip6.h> 63 #include <netinet/icmp6.h> 64 #include <netinet/sctp.h> 65 66 #include <inet/ip.h> 67 #include <inet/ip_impl.h> 68 #include <inet/ip6.h> 69 #include <inet/ip6_asp.h> 70 #include <inet/tcp.h> 71 #include <inet/ip_multi.h> 72 #include <inet/ip_if.h> 73 #include <inet/ip_ire.h> 74 #include <inet/ip_ftable.h> 75 #include <inet/ip_rts.h> 76 #include <inet/optcom.h> 77 #include <inet/ip_ndp.h> 78 #include <inet/ip_listutils.h> 79 #include <netinet/igmp.h> 80 #include <netinet/ip_mroute.h> 81 #include <inet/ipp_common.h> 82 83 #include <net/pfkeyv2.h> 84 #include <inet/sadb.h> 85 #include <inet/ipsec_impl.h> 86 #include <inet/ipdrop.h> 87 #include <inet/ip_netinfo.h> 88 89 #include <sys/pattr.h> 90 #include <inet/ipclassifier.h> 91 #include <inet/sctp_ip.h> 92 #include <inet/sctp/sctp_impl.h> 93 #include <inet/udp_impl.h> 94 #include <sys/sunddi.h> 95 96 #include <sys/tsol/label.h> 97 #include <sys/tsol/tnet.h> 98 99 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 100 101 #ifdef DEBUG 102 extern boolean_t skip_sctp_cksum; 103 #endif 104 105 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 106 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 107 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); 108 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 109 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 110 111 /* 112 * There are two types of output functions for IP used for different 113 * purposes: 114 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 115 * is no context in the form of a conn_t. However, there is a 116 * ip_xmit_attr_t that the callers use to influence interface selection 117 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 118 * 119 * - conn_ip_output() is used when sending packets with a conn_t and 120 * ip_set_destination has been called to cache information. In that case 121 * various socket options are recorded in the ip_xmit_attr_t and should 122 * be taken into account. 123 */ 124 125 /* 126 * The caller *must* have called conn_connect() or ip_attr_connect() 127 * before calling conn_ip_output(). The caller needs to redo that each time 128 * the destination IP address or port changes, as well as each time there is 129 * a change to any socket option that would modify how packets are routed out 130 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). 131 * 132 * The ULP caller has to serialize the use of a single ip_xmit_attr_t. 133 * We assert for that here. 134 */ 135 int 136 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa) 137 { 138 iaflags_t ixaflags = ixa->ixa_flags; 139 ire_t *ire; 140 nce_t *nce; 141 dce_t *dce; 142 ill_t *ill; 143 ip_stack_t *ipst = ixa->ixa_ipst; 144 int error; 145 146 /* We defer ipIfStatsHCOutRequests until an error or we have an ill */ 147 148 ASSERT(ixa->ixa_ire != NULL); 149 /* Note there is no ixa_nce when reject and blackhole routes */ 150 ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */ 151 152 #ifdef DEBUG 153 ASSERT(ixa->ixa_curthread == NULL); 154 ixa->ixa_curthread = curthread; 155 #endif 156 157 /* 158 * Even on labeled systems we can have a NULL ixa_tsl e.g., 159 * for IGMP/MLD traffic. 160 */ 161 162 ire = ixa->ixa_ire; 163 164 /* 165 * If the ULP says the (old) IRE resulted in reachability we 166 * record this before determine whether to use a new IRE. 167 * No locking for performance reasons. 168 */ 169 if (ixaflags & IXAF_REACH_CONF) 170 ire->ire_badcnt = 0; 171 172 /* 173 * Has routing changed since we cached the results of the lookup? 174 * 175 * This check captures all of: 176 * - the cached ire being deleted (by means of the special 177 * IRE_GENERATION_CONDEMNED) 178 * - A potentially better ire being added (ire_generation being 179 * increased) 180 * - A deletion of the nexthop ire that was used when we did the 181 * lookup. 182 * - An addition of a potentially better nexthop ire. 183 * The last two are handled by walking and increasing the generation 184 * number on all dependant IREs in ire_flush_cache(). 185 * 186 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE 187 * since we ensure that each time we set ixa_ire to such an IRE we 188 * make sure the ixa_ire_generation does not match (by using 189 * IRE_GENERATION_VERIFY). 190 */ 191 if (ire->ire_generation != ixa->ixa_ire_generation) { 192 error = ip_verify_ire(mp, ixa); 193 if (error != 0) { 194 ip_drop_output("ipIfStatsOutDiscards - verify ire", 195 mp, NULL); 196 goto drop; 197 } 198 ire = ixa->ixa_ire; 199 ASSERT(ire != NULL); 200 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 201 #ifdef DEBUG 202 ASSERT(ixa->ixa_curthread == curthread); 203 ixa->ixa_curthread = NULL; 204 #endif 205 ire->ire_ob_pkt_count++; 206 /* ixa_dce might be condemned; use default one */ 207 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, 208 &ipst->ips_dce_default->dce_ident)); 209 } 210 /* 211 * If the ncec changed then ip_verify_ire already set 212 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 213 * so we can recheck the interface mtu. 214 */ 215 216 /* 217 * Note that ire->ire_generation could already have changed. 218 * We catch that next time we send a packet. 219 */ 220 } 221 222 /* 223 * No need to lock access to ixa_nce since the ip_xmit_attr usage 224 * is single threaded. 225 */ 226 ASSERT(ixa->ixa_nce != NULL); 227 nce = ixa->ixa_nce; 228 if (nce->nce_is_condemned) { 229 error = ip_verify_nce(mp, ixa); 230 /* 231 * In case ZEROCOPY capability become not available, we 232 * copy the message and free the original one. We might 233 * be copying more data than needed but it doesn't hurt 234 * since such change rarely happens. 235 */ 236 switch (error) { 237 case 0: 238 break; 239 case ENOTSUP: { /* ZEROCOPY */ 240 mblk_t *nmp; 241 242 if ((nmp = copymsg(mp)) != NULL) { 243 freemsg(mp); 244 mp = nmp; 245 246 break; 247 } 248 /* FALLTHROUGH */ 249 } 250 default: 251 ip_drop_output("ipIfStatsOutDiscards - verify nce", 252 mp, NULL); 253 goto drop; 254 } 255 ire = ixa->ixa_ire; 256 ASSERT(ire != NULL); 257 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 258 #ifdef DEBUG 259 ASSERT(ixa->ixa_curthread == curthread); 260 ixa->ixa_curthread = NULL; 261 #endif 262 ire->ire_ob_pkt_count++; 263 /* ixa_dce might be condemned; use default one */ 264 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, 265 ixa, &ipst->ips_dce_default->dce_ident)); 266 } 267 ASSERT(ixa->ixa_nce != NULL); 268 nce = ixa->ixa_nce; 269 270 /* 271 * Note that some other event could already have made 272 * the new nce condemned. We catch that next time we 273 * try to send a packet. 274 */ 275 } 276 /* 277 * If there is no per-destination dce_t then we have a reference to 278 * the default dce_t (which merely contains the dce_ipid). 279 * The generation check captures both the introduction of a 280 * per-destination dce_t (e.g., due to ICMP packet too big) and 281 * any change to the per-destination dce (including it becoming 282 * condemned by use of the special DCE_GENERATION_CONDEMNED). 283 */ 284 dce = ixa->ixa_dce; 285 286 /* 287 * To avoid a periodic timer to increase the path MTU we 288 * look at dce_last_change_time each time we send a packet. 289 */ 290 if (dce->dce_flags & DCEF_PMTU) { 291 int64_t now = LBOLT_FASTPATH64; 292 293 if ((TICK_TO_SEC(now) - dce->dce_last_change_time > 294 ipst->ips_ip_pathmtu_interval)) { 295 /* 296 * Older than 20 minutes. Drop the path MTU information. 297 * Since the path MTU changes as a result of this, 298 * twiddle ixa_dce_generation to make us go through the 299 * dce verification code in conn_ip_output. 300 */ 301 mutex_enter(&dce->dce_lock); 302 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 303 dce->dce_last_change_time = TICK_TO_SEC(now); 304 mutex_exit(&dce->dce_lock); 305 dce_increment_generation(dce); 306 } 307 } 308 309 if (dce->dce_generation != ixa->ixa_dce_generation) { 310 error = ip_verify_dce(mp, ixa); 311 if (error != 0) { 312 ip_drop_output("ipIfStatsOutDiscards - verify dce", 313 mp, NULL); 314 goto drop; 315 } 316 dce = ixa->ixa_dce; 317 318 /* 319 * Note that some other event could already have made the 320 * new dce's generation number change. 321 * We catch that next time we try to send a packet. 322 */ 323 } 324 325 ill = nce->nce_ill; 326 327 /* 328 * An initial ixa_fragsize was set in ip_set_destination 329 * and we update it if any routing changes above. 330 * A change to ill_mtu with ifconfig will increase all dce_generation 331 * so that we will detect that with the generation check. 332 */ 333 334 /* 335 * Caller needs to make sure IXAF_VERIFY_SRC is not set if 336 * conn_unspec_src. 337 */ 338 if ((ixaflags & IXAF_VERIFY_SOURCE) && 339 ixa->ixa_src_generation != ipst->ips_src_generation) { 340 /* Check if the IP source is still assigned to the host. */ 341 uint_t gen; 342 343 if (!ip_verify_src(mp, ixa, &gen)) { 344 /* Don't send a packet with a source that isn't ours */ 345 error = EADDRNOTAVAIL; 346 ip_drop_output("ipIfStatsOutDiscards - invalid src", 347 mp, NULL); 348 goto drop; 349 } 350 /* The source is still valid - update the generation number */ 351 ixa->ixa_src_generation = gen; 352 } 353 354 /* 355 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 356 * can only count the use prior to fragmentation. However the MIB 357 * counters on the ill will be incremented in post fragmentation. 358 */ 359 ire->ire_ob_pkt_count++; 360 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 361 362 /* 363 * Based on ire_type and ire_flags call one of: 364 * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK 365 * ire_send_multirt_v* - if RTF_MULTIRT 366 * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE 367 * ire_send_multicast_v* - for IRE_MULTICAST 368 * ire_send_broadcast_v4 - for IRE_BROADCAST 369 * ire_send_wire_v* - for the rest. 370 */ 371 #ifdef DEBUG 372 ASSERT(ixa->ixa_curthread == curthread); 373 ixa->ixa_curthread = NULL; 374 #endif 375 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident)); 376 377 drop: 378 if (ixaflags & IXAF_IS_IPV4) { 379 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 380 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 381 } else { 382 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests); 383 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 384 } 385 freemsg(mp); 386 #ifdef DEBUG 387 ASSERT(ixa->ixa_curthread == curthread); 388 ixa->ixa_curthread = NULL; 389 #endif 390 return (error); 391 } 392 393 /* 394 * Handle both IPv4 and IPv6. Sets the generation number 395 * to allow the caller to know when to call us again. 396 * Returns true if the source address in the packet is a valid source. 397 * We handle callers which try to send with a zero address (since we only 398 * get here if UNSPEC_SRC is not set). 399 */ 400 boolean_t 401 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp) 402 { 403 ip_stack_t *ipst = ixa->ixa_ipst; 404 405 /* 406 * Need to grab the generation number before we check to 407 * avoid a race with a change to the set of local addresses. 408 * No lock needed since the thread which updates the set of local 409 * addresses use ipif/ill locks and exit those (hence a store memory 410 * barrier) before doing the atomic increase of ips_src_generation. 411 */ 412 if (generationp != NULL) 413 *generationp = ipst->ips_src_generation; 414 415 if (ixa->ixa_flags & IXAF_IS_IPV4) { 416 ipha_t *ipha = (ipha_t *)mp->b_rptr; 417 418 if (ipha->ipha_src == INADDR_ANY) 419 return (B_FALSE); 420 421 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 422 ipst, B_FALSE) != IPVL_BAD); 423 } else { 424 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 425 uint_t scopeid; 426 427 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) 428 return (B_FALSE); 429 430 if (ixa->ixa_flags & IXAF_SCOPEID_SET) 431 scopeid = ixa->ixa_scopeid; 432 else 433 scopeid = 0; 434 435 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid, 436 ipst, B_FALSE, scopeid) != IPVL_BAD); 437 } 438 } 439 440 /* 441 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use. 442 */ 443 int 444 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa) 445 { 446 uint_t gen; 447 ire_t *ire; 448 nce_t *nce; 449 int error; 450 boolean_t multirt = B_FALSE; 451 452 /* 453 * Redo ip_select_route. 454 * Need to grab generation number as part of the lookup to 455 * avoid race. 456 */ 457 error = 0; 458 ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt); 459 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 460 if (error != 0) { 461 ire_refrele(ire); 462 return (error); 463 } 464 465 if (ixa->ixa_ire != NULL) 466 ire_refrele_notr(ixa->ixa_ire); 467 #ifdef DEBUG 468 ire_refhold_notr(ire); 469 ire_refrele(ire); 470 #endif 471 ixa->ixa_ire = ire; 472 ixa->ixa_ire_generation = gen; 473 if (multirt) { 474 if (ixa->ixa_flags & IXAF_IS_IPV4) 475 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 476 else 477 ixa->ixa_postfragfn = ip_postfrag_multirt_v6; 478 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 479 } else { 480 ixa->ixa_postfragfn = ire->ire_postfragfn; 481 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 482 } 483 484 /* 485 * Don't look for an nce for reject or blackhole. 486 * They have ire_generation set to IRE_GENERATION_VERIFY which 487 * makes conn_ip_output avoid references to ixa_nce. 488 */ 489 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 490 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY); 491 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 492 return (0); 493 } 494 495 /* The NCE could now be different */ 496 nce = ire_to_nce_pkt(ire, mp); 497 if (nce == NULL) { 498 /* 499 * Allocation failure. Make sure we redo ire/nce selection 500 * next time we send. 501 */ 502 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 503 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 504 return (ENOBUFS); 505 } 506 if (nce == ixa->ixa_nce) { 507 /* No change */ 508 nce_refrele(nce); 509 return (0); 510 } 511 512 /* 513 * Since the path MTU might change as a result of this 514 * route change, we twiddle ixa_dce_generation to 515 * make conn_ip_output go through the ip_verify_dce code. 516 */ 517 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 518 519 if (ixa->ixa_nce != NULL) 520 nce_refrele(ixa->ixa_nce); 521 ixa->ixa_nce = nce; 522 return (0); 523 } 524 525 /* 526 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use. 527 */ 528 static int 529 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa) 530 { 531 ire_t *ire = ixa->ixa_ire; 532 nce_t *nce; 533 int error = 0; 534 ipha_t *ipha = NULL; 535 ip6_t *ip6h = NULL; 536 537 if (ire->ire_ipversion == IPV4_VERSION) 538 ipha = (ipha_t *)mp->b_rptr; 539 else 540 ip6h = (ip6_t *)mp->b_rptr; 541 542 nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE); 543 if (nce == NULL) { 544 /* Try to find a better ire */ 545 return (ip_verify_ire(mp, ixa)); 546 } 547 548 /* 549 * The hardware offloading capabilities, for example LSO, of the 550 * interface might have changed, so do sanity verification here. 551 */ 552 if (ixa->ixa_flags & IXAF_VERIFY_LSO) { 553 if (!ip_verify_lso(nce->nce_ill, ixa)) { 554 ASSERT(ixa->ixa_notify != NULL); 555 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 556 IXAN_LSO, 0); 557 error = ENOTSUP; 558 } 559 } 560 561 /* 562 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with 563 * any ZEROCOPY changes. In case ZEROCOPY capability is not available 564 * any more, return error so that conn_ip_output() can take care of 565 * the ZEROCOPY message properly. It's safe to continue send the 566 * message when ZEROCOPY newly become available. 567 */ 568 if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) { 569 if (!ip_verify_zcopy(nce->nce_ill, ixa)) { 570 ASSERT(ixa->ixa_notify != NULL); 571 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 572 IXAN_ZCOPY, 0); 573 if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0) 574 error = ENOTSUP; 575 } 576 } 577 578 /* 579 * Since the path MTU might change as a result of this 580 * change, we twiddle ixa_dce_generation to 581 * make conn_ip_output go through the ip_verify_dce code. 582 */ 583 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 584 585 nce_refrele(ixa->ixa_nce); 586 ixa->ixa_nce = nce; 587 return (error); 588 } 589 590 /* 591 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use. 592 */ 593 static int 594 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa) 595 { 596 dce_t *dce; 597 uint_t gen; 598 uint_t pmtu; 599 600 dce = dce_lookup_pkt(mp, ixa, &gen); 601 ASSERT(dce != NULL); 602 603 dce_refrele_notr(ixa->ixa_dce); 604 #ifdef DEBUG 605 dce_refhold_notr(dce); 606 dce_refrele(dce); 607 #endif 608 ixa->ixa_dce = dce; 609 ixa->ixa_dce_generation = gen; 610 611 /* Extract the (path) mtu from the dce, ncec_ill etc */ 612 pmtu = ip_get_pmtu(ixa); 613 614 /* 615 * Tell ULP about PMTU changes - increase or decrease - by returning 616 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update 617 * both ixa_pmtu and ixa_fragsize appropriately. 618 * 619 * If ULP doesn't set that flag then we need to update ixa_fragsize 620 * since routing could have changed the ill after after ixa_fragsize 621 * was set previously in the conn_ip_output path or in 622 * ip_set_destination. 623 * 624 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu. 625 * 626 * In the case of a path MTU increase we send the packet after the 627 * notify to the ULP. 628 */ 629 if (ixa->ixa_flags & IXAF_VERIFY_PMTU) { 630 if (ixa->ixa_pmtu != pmtu) { 631 uint_t oldmtu = ixa->ixa_pmtu; 632 633 DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu, 634 uint32_t, ixa->ixa_pmtu); 635 ASSERT(ixa->ixa_notify != NULL); 636 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 637 IXAN_PMTU, pmtu); 638 if (pmtu < oldmtu) 639 return (EMSGSIZE); 640 } 641 } else { 642 ixa->ixa_fragsize = pmtu; 643 } 644 return (0); 645 } 646 647 /* 648 * Verify LSO usability. Keep the return value simple to indicate whether 649 * the LSO capability has changed. Handle both IPv4 and IPv6. 650 */ 651 static boolean_t 652 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) 653 { 654 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 655 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab; 656 657 if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 658 /* 659 * Not unsable any more. 660 */ 661 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 662 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 663 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 664 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 665 !ILL_LSO_TCP_IPV4_USABLE(ill) : 666 !ILL_LSO_TCP_IPV6_USABLE(ill))) { 667 ixa->ixa_flags &= ~IXAF_LSO_CAPAB; 668 669 return (B_FALSE); 670 } 671 672 /* 673 * Capability has changed, refresh the copy in ixa. 674 */ 675 if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) { 676 *lsoc = *new_lsoc; 677 678 return (B_FALSE); 679 } 680 } else { /* Was not usable */ 681 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 682 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 683 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 684 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 685 ILL_LSO_TCP_IPV4_USABLE(ill) : 686 ILL_LSO_TCP_IPV6_USABLE(ill))) { 687 *lsoc = *new_lsoc; 688 ixa->ixa_flags |= IXAF_LSO_CAPAB; 689 690 return (B_FALSE); 691 } 692 } 693 694 return (B_TRUE); 695 } 696 697 /* 698 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether 699 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6. 700 */ 701 static boolean_t 702 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa) 703 { 704 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) { 705 /* 706 * Not unsable any more. 707 */ 708 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 709 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 710 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 711 !ILL_ZCOPY_USABLE(ill)) { 712 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; 713 714 return (B_FALSE); 715 } 716 } else { /* Was not usable */ 717 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 718 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 719 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 720 ILL_ZCOPY_USABLE(ill)) { 721 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; 722 723 return (B_FALSE); 724 } 725 } 726 727 return (B_TRUE); 728 } 729 730 731 /* 732 * When there is no conn_t context, this will send a packet. 733 * The caller must *not* have called conn_connect() or ip_attr_connect() 734 * before calling ip_output_simple(). 735 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH. 736 * Honors IXAF_SET_SOURCE. 737 * 738 * We acquire the ire and after calling ire_sendfn we release 739 * the hold on the ire. Ditto for the nce and dce. 740 * 741 * This assumes that the caller has set the following in ip_xmit_attr_t: 742 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. 743 * If ixa_ifindex is non-zero it means send out that ill. (If it is 744 * an upper IPMP ill we load balance across the group; if a lower we send 745 * on that lower ill without load balancing.) 746 * IXAF_IS_IPV4 must be set correctly. 747 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. 748 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. 749 * If neither of those two are set we do an IPsec policy lookup. 750 * 751 * We handle setting things like 752 * ixa_pktlen 753 * ixa_ip_hdr_length 754 * ixa->ixa_protocol 755 * 756 * The caller may set ixa_xmit_hint, which is used for ECMP selection and 757 * transmit ring selecting in GLD. 758 * 759 * The caller must do an ixa_cleanup() to release any IPsec references 760 * after we return. 761 */ 762 int 763 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) 764 { 765 ts_label_t *effective_tsl = NULL; 766 int err; 767 768 ASSERT(ixa->ixa_ipst != NULL); 769 770 if (is_system_labeled()) { 771 ip_stack_t *ipst = ixa->ixa_ipst; 772 773 if (ixa->ixa_flags & IXAF_IS_IPV4) { 774 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, 775 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 776 &effective_tsl); 777 } else { 778 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, 779 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 780 &effective_tsl); 781 } 782 if (err != 0) { 783 ip2dbg(("tsol_check: label check failed (%d)\n", err)); 784 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 785 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 786 ip_drop_output("tsol_check_label", mp, NULL); 787 freemsg(mp); 788 return (err); 789 } 790 if (effective_tsl != NULL) { 791 /* Update the label */ 792 ip_xmit_attr_replace_tsl(ixa, effective_tsl); 793 } 794 } 795 796 if (ixa->ixa_flags & IXAF_IS_IPV4) 797 return (ip_output_simple_v4(mp, ixa)); 798 else 799 return (ip_output_simple_v6(mp, ixa)); 800 } 801 802 int 803 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) 804 { 805 ipha_t *ipha; 806 ipaddr_t firsthop; /* In IP header */ 807 ipaddr_t dst; /* End of source route, or ipha_dst if none */ 808 ire_t *ire; 809 ipaddr_t setsrc; /* RTF_SETSRC */ 810 int error; 811 ill_t *ill = NULL; 812 dce_t *dce = NULL; 813 nce_t *nce; 814 iaflags_t ixaflags = ixa->ixa_flags; 815 ip_stack_t *ipst = ixa->ixa_ipst; 816 boolean_t repeat = B_FALSE; 817 boolean_t multirt = B_FALSE; 818 int64_t now; 819 820 ipha = (ipha_t *)mp->b_rptr; 821 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 822 823 /* 824 * Even on labeled systems we can have a NULL ixa_tsl e.g., 825 * for IGMP/MLD traffic. 826 */ 827 828 /* Caller already set flags */ 829 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 830 831 ASSERT(ixa->ixa_nce == NULL); 832 833 ixa->ixa_pktlen = ntohs(ipha->ipha_length); 834 ASSERT(ixa->ixa_pktlen == msgdsize(mp)); 835 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 836 ixa->ixa_protocol = ipha->ipha_protocol; 837 838 /* 839 * Assumes that source routed packets have already been massaged by 840 * the ULP (ip_massage_options) and as a result ipha_dst is the next 841 * hop in the source route. The final destination is used for IPsec 842 * policy and DCE lookup. 843 */ 844 firsthop = ipha->ipha_dst; 845 dst = ip_get_dst(ipha); 846 847 repeat_ire: 848 error = 0; 849 setsrc = INADDR_ANY; 850 ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error, 851 &multirt); 852 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 853 if (error != 0) { 854 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 855 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 856 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); 857 freemsg(mp); 858 goto done; 859 } 860 861 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { 862 /* ire_ill might be NULL hence need to skip some code */ 863 if (ixaflags & IXAF_SET_SOURCE) 864 ipha->ipha_src = htonl(INADDR_LOOPBACK); 865 ixa->ixa_fragsize = IP_MAXPACKET; 866 ill = NULL; 867 nce = NULL; 868 ire->ire_ob_pkt_count++; 869 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 870 /* No dce yet; use default one */ 871 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, 872 &ipst->ips_dce_default->dce_ident); 873 goto done; 874 } 875 876 /* Note that ipha_dst is only used for IRE_MULTICAST */ 877 nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 878 if (nce == NULL) { 879 /* Allocation failure? */ 880 ip_drop_output("ire_to_nce", mp, ill); 881 freemsg(mp); 882 error = ENOBUFS; 883 goto done; 884 } 885 if (nce->nce_is_condemned) { 886 nce_t *nce1; 887 888 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); 889 nce_refrele(nce); 890 if (nce1 == NULL) { 891 if (!repeat) { 892 /* Try finding a better IRE */ 893 repeat = B_TRUE; 894 ire_refrele(ire); 895 goto repeat_ire; 896 } 897 /* Tried twice - drop packet */ 898 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 899 ip_drop_output("No nce", mp, ill); 900 freemsg(mp); 901 error = ENOBUFS; 902 goto done; 903 } 904 nce = nce1; 905 } 906 907 /* 908 * For multicast with multirt we have a flag passed back from 909 * ire_lookup_multi_ill_v4 since we don't have an IRE for each 910 * possible multicast address. 911 * We also need a flag for multicast since we can't check 912 * whether RTF_MULTIRT is set in ixa_ire for multicast. 913 */ 914 if (multirt) { 915 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 916 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 917 } else { 918 ixa->ixa_postfragfn = ire->ire_postfragfn; 919 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 920 } 921 ASSERT(ixa->ixa_nce == NULL); 922 ixa->ixa_nce = nce; 923 924 /* 925 * Check for a dce_t with a path mtu. 926 */ 927 dce = dce_lookup_v4(dst, ipst, NULL); 928 ASSERT(dce != NULL); 929 930 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { 931 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 932 } else if (dce->dce_flags & DCEF_PMTU) { 933 /* 934 * To avoid a periodic timer to increase the path MTU we 935 * look at dce_last_change_time each time we send a packet. 936 */ 937 now = ddi_get_lbolt64(); 938 if (TICK_TO_SEC(now) - dce->dce_last_change_time > 939 ipst->ips_ip_pathmtu_interval) { 940 /* 941 * Older than 20 minutes. Drop the path MTU information. 942 */ 943 mutex_enter(&dce->dce_lock); 944 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 945 dce->dce_last_change_time = TICK_TO_SEC(now); 946 mutex_exit(&dce->dce_lock); 947 dce_increment_generation(dce); 948 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 949 } else { 950 uint_t fragsize; 951 952 fragsize = ip_get_base_mtu(nce->nce_ill, ire); 953 if (fragsize > dce->dce_pmtu) 954 fragsize = dce->dce_pmtu; 955 ixa->ixa_fragsize = fragsize; 956 } 957 } else { 958 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 959 } 960 961 /* 962 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp 963 * interface for source address selection. 964 */ 965 ill = ire_nexthop_ill(ire); 966 967 if (ixaflags & IXAF_SET_SOURCE) { 968 ipaddr_t src; 969 970 /* 971 * We use the final destination to get 972 * correct selection for source routed packets 973 */ 974 975 /* If unreachable we have no ill but need some source */ 976 if (ill == NULL) { 977 src = htonl(INADDR_LOOPBACK); 978 error = 0; 979 } else { 980 error = ip_select_source_v4(ill, setsrc, dst, 981 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, 982 &src, NULL, NULL); 983 } 984 if (error != 0) { 985 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 986 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 987 ip_drop_output("ipIfStatsOutDiscards - no source", 988 mp, ill); 989 freemsg(mp); 990 goto done; 991 } 992 ipha->ipha_src = src; 993 } else if (ixaflags & IXAF_VERIFY_SOURCE) { 994 /* Check if the IP source is assigned to the host. */ 995 if (!ip_verify_src(mp, ixa, NULL)) { 996 /* Don't send a packet with a source that isn't ours */ 997 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 998 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 999 ip_drop_output("ipIfStatsOutDiscards - invalid source", 1000 mp, ill); 1001 freemsg(mp); 1002 error = EADDRNOTAVAIL; 1003 goto done; 1004 } 1005 } 1006 1007 1008 /* 1009 * Check against global IPsec policy to set the AH/ESP attributes. 1010 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 1011 */ 1012 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1013 ASSERT(ixa->ixa_ipsec_policy == NULL); 1014 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); 1015 if (mp == NULL) { 1016 /* MIB and ip_drop_packet already done */ 1017 return (EHOSTUNREACH); /* IPsec policy failure */ 1018 } 1019 } 1020 1021 if (ill != NULL) { 1022 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 1023 } else { 1024 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1025 } 1026 1027 /* 1028 * We update the statistics on the most specific IRE i.e., the first 1029 * one we found. 1030 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 1031 * can only count the use prior to fragmentation. However the MIB 1032 * counters on the ill will be incremented in post fragmentation. 1033 */ 1034 ire->ire_ob_pkt_count++; 1035 1036 /* 1037 * Based on ire_type and ire_flags call one of: 1038 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK 1039 * ire_send_multirt_v4 - if RTF_MULTIRT 1040 * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 1041 * ire_send_multicast_v4 - for IRE_MULTICAST 1042 * ire_send_broadcast_v4 - for IRE_BROADCAST 1043 * ire_send_wire_v4 - for the rest. 1044 */ 1045 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); 1046 done: 1047 ire_refrele(ire); 1048 if (dce != NULL) 1049 dce_refrele(dce); 1050 if (ill != NULL) 1051 ill_refrele(ill); 1052 if (ixa->ixa_nce != NULL) 1053 nce_refrele(ixa->ixa_nce); 1054 ixa->ixa_nce = NULL; 1055 return (error); 1056 } 1057 1058 /* 1059 * ire_sendfn() functions. 1060 * These functions use the following xmit_attr: 1061 * - ixa_fragsize - read to determine whether or not to fragment 1062 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec 1063 * - ixa_ipsec_* are used inside IPsec 1064 * - IXAF_SET_SOURCE - replace IP source in broadcast case. 1065 * - IXAF_LOOPBACK_COPY - for multicast and broadcast 1066 */ 1067 1068 1069 /* 1070 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK 1071 * 1072 * The checks for restrict_interzone_loopback are done in ire_route_recursive. 1073 */ 1074 /* ARGSUSED4 */ 1075 int 1076 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1077 ip_xmit_attr_t *ixa, uint32_t *identp) 1078 { 1079 ipha_t *ipha = (ipha_t *)iph_arg; 1080 ip_stack_t *ipst = ixa->ixa_ipst; 1081 ill_t *ill = ire->ire_ill; 1082 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 1083 uint_t pktlen = ixa->ixa_pktlen; 1084 1085 /* 1086 * No fragmentation, no nce, no application of IPsec, 1087 * and no ipha_ident assignment. 1088 * 1089 * Note different order between IP provider and FW_HOOKS than in 1090 * send_wire case. 1091 */ 1092 1093 /* 1094 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1095 * send probe, but not the receive probe. 1096 */ 1097 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1098 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1099 int, 1); 1100 1101 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1102 int error; 1103 1104 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1105 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1106 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1107 ipst->ips_ipv4firewall_loopback_out, 1108 NULL, ill, ipha, mp, mp, 0, ipst, error); 1109 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1110 if (mp == NULL) 1111 return (error); 1112 1113 /* 1114 * Even if the destination was changed by the filter we use the 1115 * forwarding decision that was made based on the address 1116 * in ip_output/ip_set_destination. 1117 */ 1118 /* Length could be different */ 1119 ipha = (ipha_t *)mp->b_rptr; 1120 pktlen = ntohs(ipha->ipha_length); 1121 } 1122 1123 /* 1124 * If a callback is enabled then we need to know the 1125 * source and destination zoneids for the packet. We already 1126 * have those handy. 1127 */ 1128 if (ipst->ips_ip4_observe.he_interested) { 1129 zoneid_t szone, dzone; 1130 zoneid_t stackzoneid; 1131 1132 stackzoneid = netstackid_to_zoneid( 1133 ipst->ips_netstack->netstack_stackid); 1134 1135 if (stackzoneid == GLOBAL_ZONEID) { 1136 /* Shared-IP zone */ 1137 dzone = ire->ire_zoneid; 1138 szone = ixa->ixa_zoneid; 1139 } else { 1140 szone = dzone = stackzoneid; 1141 } 1142 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1143 } 1144 1145 /* Handle lo0 stats */ 1146 ipst->ips_loopback_packets++; 1147 1148 /* Map ixa to ira including IPsec policies */ 1149 ipsec_out_to_in(ixa, ill, &iras); 1150 iras.ira_pktlen = pktlen; 1151 1152 if (!IS_SIMPLE_IPH(ipha)) { 1153 ip_output_local_options(ipha, ipst); 1154 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1155 } 1156 1157 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1158 int error; 1159 1160 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1161 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1162 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1163 ipst->ips_ipv4firewall_loopback_in, 1164 ill, NULL, ipha, mp, mp, 0, ipst, error); 1165 1166 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1167 if (mp == NULL) { 1168 ira_cleanup(&iras, B_FALSE); 1169 return (error); 1170 } 1171 /* 1172 * Even if the destination was changed by the filter we use the 1173 * forwarding decision that was made based on the address 1174 * in ip_output/ip_set_destination. 1175 */ 1176 /* Length could be different */ 1177 ipha = (ipha_t *)mp->b_rptr; 1178 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1179 } 1180 1181 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1182 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1183 int, 1); 1184 1185 ire->ire_ib_pkt_count++; 1186 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1187 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1188 1189 /* Destined to ire_zoneid - use that for fanout */ 1190 iras.ira_zoneid = ire->ire_zoneid; 1191 1192 if (is_system_labeled()) { 1193 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1194 1195 /* 1196 * This updates ira_cred, ira_tsl and ira_free_flags based 1197 * on the label. We don't expect this to ever fail for 1198 * loopback packets, so we silently drop the packet should it 1199 * fail. 1200 */ 1201 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1202 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1203 ip_drop_input("tsol_get_pkt_label", mp, ill); 1204 freemsg(mp); 1205 return (0); 1206 } 1207 ASSERT(iras.ira_tsl != NULL); 1208 1209 /* tsol_get_pkt_label sometimes does pullupmsg */ 1210 ipha = (ipha_t *)mp->b_rptr; 1211 } 1212 1213 ip_fanout_v4(mp, ipha, &iras); 1214 1215 /* We moved any IPsec refs from ixa to iras */ 1216 ira_cleanup(&iras, B_FALSE); 1217 return (0); 1218 } 1219 1220 /* 1221 * ire_sendfn for IRE_BROADCAST 1222 * If the broadcast address is present on multiple ills and ixa_ifindex 1223 * isn't set, then we generate 1224 * a separate datagram (potentially with different source address) for 1225 * those ills. In any case, only one copy is looped back to ip_input_v4. 1226 */ 1227 int 1228 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1229 ip_xmit_attr_t *ixa, uint32_t *identp) 1230 { 1231 ipha_t *ipha = (ipha_t *)iph_arg; 1232 ip_stack_t *ipst = ixa->ixa_ipst; 1233 irb_t *irb = ire->ire_bucket; 1234 ire_t *ire1; 1235 mblk_t *mp1; 1236 ipha_t *ipha1; 1237 iaflags_t ixaflags = ixa->ixa_flags; 1238 nce_t *nce1, *nce_orig; 1239 1240 /* 1241 * Unless ire_send_multirt_v4 already set a ttl, force the 1242 * ttl to a smallish value. 1243 */ 1244 if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { 1245 /* 1246 * To avoid broadcast storms, we usually set the TTL to 1 for 1247 * broadcasts. This can 1248 * be overridden stack-wide through the ip_broadcast_ttl 1249 * ndd tunable, or on a per-connection basis through the 1250 * IP_BROADCAST_TTL socket option. 1251 * 1252 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 1253 * will force ttl to one after we've set this. 1254 */ 1255 if (ixaflags & IXAF_BROADCAST_TTL_SET) 1256 ipha->ipha_ttl = ixa->ixa_broadcast_ttl; 1257 else 1258 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 1259 } 1260 /* 1261 * Make sure we get a loopback copy (after IPsec and frag) 1262 * Skip hardware checksum so that loopback copy is checksumed. 1263 */ 1264 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1265 1266 /* Do we need to potentially generate multiple copies? */ 1267 if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) 1268 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1269 1270 /* 1271 * Loop over all IRE_BROADCAST in the bucket (might only be one). 1272 * Note that everything in the bucket has the same destination address. 1273 */ 1274 irb_refhold(irb); 1275 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1276 /* We do the main IRE after the end of the loop */ 1277 if (ire1 == ire) 1278 continue; 1279 1280 /* 1281 * Only IREs for the same IP address should be in the same 1282 * bucket. 1283 * But could have IRE_HOSTs in the case of CGTP. 1284 * If we find any multirt routes we bail out of the loop 1285 * and just do the single packet at the end; ip_postfrag_multirt 1286 * will duplicate the packet. 1287 */ 1288 ASSERT(ire1->ire_addr == ire->ire_addr); 1289 if (!(ire1->ire_type & IRE_BROADCAST)) 1290 continue; 1291 1292 if (IRE_IS_CONDEMNED(ire1)) 1293 continue; 1294 1295 if (ixa->ixa_zoneid != ALL_ZONES && 1296 ire->ire_zoneid != ire1->ire_zoneid) 1297 continue; 1298 1299 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); 1300 1301 if (ire1->ire_flags & RTF_MULTIRT) 1302 break; 1303 1304 /* 1305 * For IPMP we only send for the ipmp_ill. arp_nce_init() will 1306 * ensure that this goes out on the cast_ill. 1307 */ 1308 if (IS_UNDER_IPMP(ire1->ire_ill)) 1309 continue; 1310 1311 mp1 = copymsg(mp); 1312 if (mp1 == NULL) { 1313 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1314 ipIfStatsOutDiscards); 1315 ip_drop_output("ipIfStatsOutDiscards", 1316 mp, ire1->ire_ill); 1317 continue; 1318 } 1319 1320 ipha1 = (ipha_t *)mp1->b_rptr; 1321 if (ixa->ixa_flags & IXAF_SET_SOURCE) { 1322 /* 1323 * Need to pick a different source address for each 1324 * interface. If we have a global IPsec policy and 1325 * no per-socket policy then we punt to 1326 * ip_output_simple_v4 using a separate ip_xmit_attr_t. 1327 */ 1328 if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { 1329 ip_output_simple_broadcast(ixa, mp1); 1330 continue; 1331 } 1332 /* Pick a new source address for each interface */ 1333 if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, 1334 ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, 1335 &ipha1->ipha_src, NULL, NULL) != 0) { 1336 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1337 ipIfStatsOutDiscards); 1338 ip_drop_output("ipIfStatsOutDiscards - select " 1339 "broadcast source", mp1, ire1->ire_ill); 1340 freemsg(mp1); 1341 continue; 1342 } 1343 /* 1344 * Check against global IPsec policy to set the AH/ESP 1345 * attributes. IPsec will set IXAF_IPSEC_* and 1346 * ixa_ipsec_* as appropriate. 1347 */ 1348 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1349 ASSERT(ixa->ixa_ipsec_policy == NULL); 1350 mp1 = ip_output_attach_policy(mp1, ipha, NULL, 1351 NULL, ixa); 1352 if (mp1 == NULL) { 1353 /* 1354 * MIB and ip_drop_packet already 1355 * done 1356 */ 1357 continue; 1358 } 1359 } 1360 } 1361 /* Make sure we have an NCE on this ill */ 1362 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, 1363 ire1->ire_type); 1364 if (nce1 == NULL) { 1365 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1366 ipIfStatsOutDiscards); 1367 ip_drop_output("ipIfStatsOutDiscards - broadcast nce", 1368 mp1, ire1->ire_ill); 1369 freemsg(mp1); 1370 continue; 1371 } 1372 nce_orig = ixa->ixa_nce; 1373 ixa->ixa_nce = nce1; 1374 1375 ire_refhold(ire1); 1376 /* 1377 * Ignore any errors here. We just collect the errno for 1378 * the main ire below 1379 */ 1380 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); 1381 ire_refrele(ire1); 1382 1383 ixa->ixa_nce = nce_orig; 1384 nce_refrele(nce1); 1385 1386 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; 1387 } 1388 irb_refrele(irb); 1389 /* Finally, the main one */ 1390 1391 /* 1392 * For IPMP we only send broadcasts on the ipmp_ill. 1393 */ 1394 if (IS_UNDER_IPMP(ire->ire_ill)) { 1395 freemsg(mp); 1396 return (0); 1397 } 1398 1399 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1400 } 1401 1402 /* 1403 * Send a packet using a different source address and different 1404 * IPsec policy. 1405 */ 1406 static void 1407 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) 1408 { 1409 ip_xmit_attr_t ixas; 1410 1411 bzero(&ixas, sizeof (ixas)); 1412 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 1413 ixas.ixa_zoneid = ixa->ixa_zoneid; 1414 ixas.ixa_ifindex = 0; 1415 ixas.ixa_ipst = ixa->ixa_ipst; 1416 ixas.ixa_cred = ixa->ixa_cred; 1417 ixas.ixa_cpid = ixa->ixa_cpid; 1418 ixas.ixa_tsl = ixa->ixa_tsl; 1419 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1420 1421 (void) ip_output_simple(mp, &ixas); 1422 ixa_cleanup(&ixas); 1423 } 1424 1425 1426 static void 1427 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) 1428 { 1429 ip_stack_t *ipst = ixa->ixa_ipst; 1430 1431 /* Limit the TTL on multirt packets */ 1432 if (ire->ire_type & IRE_MULTICAST) { 1433 if (ipha->ipha_ttl > 1) { 1434 ip2dbg(("ire_send_multirt_v4: forcing multicast " 1435 "multirt TTL to 1 (was %d), dst 0x%08x\n", 1436 ipha->ipha_ttl, ntohl(ire->ire_addr))); 1437 ipha->ipha_ttl = 1; 1438 } 1439 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1440 } else if ((ipst->ips_ip_multirt_ttl > 0) && 1441 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 1442 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 1443 /* 1444 * Need to ensure we don't increase the ttl should we go through 1445 * ire_send_broadcast or multicast. 1446 */ 1447 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1448 } 1449 } 1450 1451 /* 1452 * ire_sendfn for IRE_MULTICAST 1453 */ 1454 int 1455 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1456 ip_xmit_attr_t *ixa, uint32_t *identp) 1457 { 1458 ipha_t *ipha = (ipha_t *)iph_arg; 1459 ip_stack_t *ipst = ixa->ixa_ipst; 1460 ill_t *ill = ire->ire_ill; 1461 iaflags_t ixaflags = ixa->ixa_flags; 1462 1463 /* 1464 * The IRE_MULTICAST is the same whether or not multirt is in use. 1465 * Hence we need special-case code. 1466 */ 1467 if (ixaflags & IXAF_MULTIRT_MULTICAST) 1468 multirt_check_v4(ire, ipha, ixa); 1469 1470 /* 1471 * Check if anything in ip_input_v4 wants a copy of the transmitted 1472 * packet (after IPsec and fragmentation) 1473 * 1474 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set 1475 * RSVP and the rsvp daemon is an example of a 1476 * protocol and user level process that 1477 * handles it's own routing. Hence, it uses the 1478 * SO_DONTROUTE option to accomplish this. 1479 * 2. If the sender has set IP_MULTICAST_LOOP, then we just 1480 * check whether there are any receivers for the group on the ill 1481 * (ignoring the zoneid). 1482 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are 1483 * any members in other shared-IP zones. 1484 * If such members exist, then we indicate that the sending zone 1485 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP 1486 * behavior. 1487 * 1488 * When we loopback we skip hardware checksum to make sure loopback 1489 * copy is checksumed. 1490 * 1491 * Note that ire_ill is the upper in the case of IPMP. 1492 */ 1493 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); 1494 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && 1495 !(ixaflags & IXAF_DONTROUTE)) { 1496 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1497 } else if (ixaflags & IXAF_MULTICAST_LOOP) { 1498 /* 1499 * If this zone or any other zone has members then loopback 1500 * a copy. 1501 */ 1502 if (ill_hasmembers_v4(ill, ipha->ipha_dst)) 1503 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1504 } else if (ipst->ips_netstack->netstack_numzones > 1) { 1505 /* 1506 * This zone should not have a copy. But there are some other 1507 * zones which might have members. 1508 */ 1509 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1510 ixa->ixa_zoneid)) { 1511 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; 1512 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; 1513 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1514 } 1515 } 1516 1517 /* 1518 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, 1519 * force the ttl to the IP_MULTICAST_TTL value 1520 */ 1521 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { 1522 ipha->ipha_ttl = ixa->ixa_multicast_ttl; 1523 } 1524 1525 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1526 } 1527 1528 /* 1529 * ire_sendfn for IREs with RTF_MULTIRT 1530 */ 1531 int 1532 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1533 ip_xmit_attr_t *ixa, uint32_t *identp) 1534 { 1535 ipha_t *ipha = (ipha_t *)iph_arg; 1536 1537 multirt_check_v4(ire, ipha, ixa); 1538 1539 if (ire->ire_type & IRE_MULTICAST) 1540 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); 1541 else if (ire->ire_type & IRE_BROADCAST) 1542 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); 1543 else 1544 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1545 } 1546 1547 /* 1548 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE 1549 */ 1550 int 1551 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1552 ip_xmit_attr_t *ixa, uint32_t *identp) 1553 { 1554 ip_stack_t *ipst = ixa->ixa_ipst; 1555 ipha_t *ipha = (ipha_t *)iph_arg; 1556 ill_t *ill; 1557 ip_recv_attr_t iras; 1558 boolean_t dummy; 1559 1560 /* We assign an IP ident for nice errors */ 1561 ipha->ipha_ident = atomic_add_32_nv(identp, 1); 1562 1563 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 1564 1565 if (ire->ire_type & IRE_NOROUTE) { 1566 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ 1567 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, 1568 RTA_DST, ipst); 1569 } 1570 1571 if (ire->ire_flags & RTF_BLACKHOLE) { 1572 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); 1573 freemsg(mp); 1574 /* No error even for local senders - silent blackhole */ 1575 return (0); 1576 } 1577 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); 1578 1579 /* 1580 * We need an ill_t for the ip_recv_attr_t even though this packet 1581 * was never received and icmp_unreachable doesn't currently use 1582 * ira_ill. 1583 */ 1584 ill = ill_lookup_on_name("lo0", B_FALSE, 1585 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); 1586 if (ill == NULL) { 1587 freemsg(mp); 1588 return (EHOSTUNREACH); 1589 } 1590 1591 bzero(&iras, sizeof (iras)); 1592 /* Map ixa to ira including IPsec policies */ 1593 ipsec_out_to_in(ixa, ill, &iras); 1594 1595 if (ip_source_routed(ipha, ipst)) { 1596 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); 1597 } else { 1598 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 1599 } 1600 /* We moved any IPsec refs from ixa to iras */ 1601 ira_cleanup(&iras, B_FALSE); 1602 ill_refrele(ill); 1603 return (EHOSTUNREACH); 1604 } 1605 1606 /* 1607 * Calculate a checksum ignoring any hardware capabilities 1608 * 1609 * Returns B_FALSE if the packet was too short for the checksum. Caller 1610 * should free and do stats. 1611 */ 1612 static boolean_t 1613 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa) 1614 { 1615 ip_stack_t *ipst = ixa->ixa_ipst; 1616 uint_t pktlen = ixa->ixa_pktlen; 1617 uint16_t *cksump; 1618 uint32_t cksum; 1619 uint8_t protocol = ixa->ixa_protocol; 1620 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1621 ipaddr_t dst = ipha->ipha_dst; 1622 ipaddr_t src = ipha->ipha_src; 1623 1624 /* Just in case it contained garbage */ 1625 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1626 1627 /* 1628 * Calculate ULP checksum 1629 */ 1630 if (protocol == IPPROTO_TCP) { 1631 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1632 cksum = IP_TCP_CSUM_COMP; 1633 } else if (protocol == IPPROTO_UDP) { 1634 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1635 cksum = IP_UDP_CSUM_COMP; 1636 } else if (protocol == IPPROTO_SCTP) { 1637 sctp_hdr_t *sctph; 1638 1639 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1640 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1641 /* 1642 * Zero out the checksum field to ensure proper 1643 * checksum calculation. 1644 */ 1645 sctph->sh_chksum = 0; 1646 #ifdef DEBUG 1647 if (!skip_sctp_cksum) 1648 #endif 1649 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1650 goto ip_hdr_cksum; 1651 } else { 1652 goto ip_hdr_cksum; 1653 } 1654 1655 /* ULP puts the checksum field is in the first mblk */ 1656 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1657 1658 /* 1659 * We accumulate the pseudo header checksum in cksum. 1660 * This is pretty hairy code, so watch close. One 1661 * thing to keep in mind is that UDP and TCP have 1662 * stored their respective datagram lengths in their 1663 * checksum fields. This lines things up real nice. 1664 */ 1665 cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 1666 1667 cksum = IP_CSUM(mp, ip_hdr_length, cksum); 1668 /* 1669 * For UDP/IPv4 a zero means that the packets wasn't checksummed. 1670 * Change to 0xffff 1671 */ 1672 if (protocol == IPPROTO_UDP && cksum == 0) 1673 *cksump = ~cksum; 1674 else 1675 *cksump = cksum; 1676 1677 IP_STAT(ipst, ip_out_sw_cksum); 1678 IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen); 1679 1680 ip_hdr_cksum: 1681 /* Calculate IPv4 header checksum */ 1682 ipha->ipha_hdr_checksum = 0; 1683 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1684 return (B_TRUE); 1685 } 1686 1687 /* 1688 * Calculate the ULP checksum - try to use hardware. 1689 * In the case of MULTIRT, broadcast or multicast the 1690 * IXAF_NO_HW_CKSUM is set in which case we use software. 1691 * 1692 * If the hardware supports IP header checksum offload; then clear the 1693 * contents of IP header checksum field as expected by NIC. 1694 * Do this only if we offloaded either full or partial sum. 1695 * 1696 * Returns B_FALSE if the packet was too short for the checksum. Caller 1697 * should free and do stats. 1698 */ 1699 static boolean_t 1700 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, 1701 ip_xmit_attr_t *ixa, ill_t *ill) 1702 { 1703 uint_t pktlen = ixa->ixa_pktlen; 1704 uint16_t *cksump; 1705 uint16_t hck_flags; 1706 uint32_t cksum; 1707 uint8_t protocol = ixa->ixa_protocol; 1708 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1709 1710 if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || 1711 !dohwcksum) { 1712 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1713 } 1714 1715 /* 1716 * Calculate ULP checksum. Note that we don't use cksump and cksum 1717 * if the ill has FULL support. 1718 */ 1719 if (protocol == IPPROTO_TCP) { 1720 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1721 cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ 1722 } else if (protocol == IPPROTO_UDP) { 1723 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1724 cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ 1725 } else if (protocol == IPPROTO_SCTP) { 1726 sctp_hdr_t *sctph; 1727 1728 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1729 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1730 /* 1731 * Zero out the checksum field to ensure proper 1732 * checksum calculation. 1733 */ 1734 sctph->sh_chksum = 0; 1735 #ifdef DEBUG 1736 if (!skip_sctp_cksum) 1737 #endif 1738 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1739 goto ip_hdr_cksum; 1740 } else { 1741 ip_hdr_cksum: 1742 /* Calculate IPv4 header checksum */ 1743 ipha->ipha_hdr_checksum = 0; 1744 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1745 return (B_TRUE); 1746 } 1747 1748 /* ULP puts the checksum field is in the first mblk */ 1749 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1750 1751 /* 1752 * Underlying interface supports hardware checksum offload for 1753 * the payload; leave the payload checksum for the hardware to 1754 * calculate. N.B: We only need to set up checksum info on the 1755 * first mblk. 1756 */ 1757 hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; 1758 1759 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1760 if (hck_flags & HCKSUM_INET_FULL_V4) { 1761 /* 1762 * Hardware calculates pseudo-header, header and the 1763 * payload checksums, so clear the checksum field in 1764 * the protocol header. 1765 */ 1766 *cksump = 0; 1767 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; 1768 1769 ipha->ipha_hdr_checksum = 0; 1770 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1771 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1772 } else { 1773 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1774 } 1775 return (B_TRUE); 1776 } 1777 if ((hck_flags) & HCKSUM_INET_PARTIAL) { 1778 ipaddr_t dst = ipha->ipha_dst; 1779 ipaddr_t src = ipha->ipha_src; 1780 /* 1781 * Partial checksum offload has been enabled. Fill 1782 * the checksum field in the protocol header with the 1783 * pseudo-header checksum value. 1784 * 1785 * We accumulate the pseudo header checksum in cksum. 1786 * This is pretty hairy code, so watch close. One 1787 * thing to keep in mind is that UDP and TCP have 1788 * stored their respective datagram lengths in their 1789 * checksum fields. This lines things up real nice. 1790 */ 1791 cksum += (dst >> 16) + (dst & 0xFFFF) + 1792 (src >> 16) + (src & 0xFFFF); 1793 cksum += *(cksump); 1794 cksum = (cksum & 0xFFFF) + (cksum >> 16); 1795 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); 1796 1797 /* 1798 * Offsets are relative to beginning of IP header. 1799 */ 1800 DB_CKSUMSTART(mp) = ip_hdr_length; 1801 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha; 1802 DB_CKSUMEND(mp) = pktlen; 1803 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; 1804 1805 ipha->ipha_hdr_checksum = 0; 1806 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1807 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1808 } else { 1809 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1810 } 1811 return (B_TRUE); 1812 } 1813 /* Hardware capabilities include neither full nor partial IPv4 */ 1814 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1815 } 1816 1817 /* 1818 * ire_sendfn for offlink and onlink destinations. 1819 * Also called from the multicast, broadcast, multirt send functions. 1820 * 1821 * Assumes that the caller has a hold on the ire. 1822 * 1823 * This function doesn't care if the IRE just became condemned since that 1824 * can happen at any time. 1825 */ 1826 /* ARGSUSED */ 1827 int 1828 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1829 ip_xmit_attr_t *ixa, uint32_t *identp) 1830 { 1831 ip_stack_t *ipst = ixa->ixa_ipst; 1832 ipha_t *ipha = (ipha_t *)iph_arg; 1833 iaflags_t ixaflags = ixa->ixa_flags; 1834 ill_t *ill; 1835 1836 ASSERT(ixa->ixa_nce != NULL); 1837 ill = ixa->ixa_nce->nce_ill; 1838 1839 if (ixaflags & IXAF_DONTROUTE) 1840 ipha->ipha_ttl = 1; 1841 1842 /* 1843 * Assign an ident value for this packet. There could be other 1844 * threads targeting the same destination, so we have to arrange 1845 * for a atomic increment. Note that we use a 32-bit atomic add 1846 * because it has better performance than its 16-bit sibling. 1847 * 1848 * Normally ixa_extra_ident is 0, but in the case of LSO it will 1849 * be the number of TCP segments that the driver/hardware will 1850 * extraly construct. 1851 * 1852 * If running in cluster mode and if the source address 1853 * belongs to a replicated service then vector through 1854 * cl_inet_ipident vector to allocate ip identifier 1855 * NOTE: This is a contract private interface with the 1856 * clustering group. 1857 */ 1858 if (cl_inet_ipident != NULL) { 1859 ipaddr_t src = ipha->ipha_src; 1860 ipaddr_t dst = ipha->ipha_dst; 1861 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; 1862 1863 ASSERT(cl_inet_isclusterwide != NULL); 1864 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, 1865 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { 1866 /* 1867 * Note: not correct with LSO since we can't allocate 1868 * ixa_extra_ident+1 consecutive values. 1869 */ 1870 ipha->ipha_ident = (*cl_inet_ipident)(stack_id, 1871 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, 1872 (uint8_t *)(uintptr_t)dst, NULL); 1873 } else { 1874 ipha->ipha_ident = atomic_add_32_nv(identp, 1875 ixa->ixa_extra_ident + 1); 1876 } 1877 } else { 1878 ipha->ipha_ident = atomic_add_32_nv(identp, 1879 ixa->ixa_extra_ident + 1); 1880 } 1881 #ifndef _BIG_ENDIAN 1882 ipha->ipha_ident = htons(ipha->ipha_ident); 1883 #endif 1884 1885 /* 1886 * This might set b_band, thus the IPsec and fragmentation 1887 * code in IP ensures that b_band is updated in the first mblk. 1888 */ 1889 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 1890 /* ip_process translates an IS_UNDER_IPMP */ 1891 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); 1892 if (mp == NULL) { 1893 /* ip_drop_packet and MIB done */ 1894 return (0); /* Might just be delayed */ 1895 } 1896 } 1897 1898 /* 1899 * Verify any IPv4 options. 1900 * 1901 * The presense of IP options also forces the network stack to 1902 * calculate the checksum in software. This is because: 1903 * 1904 * Wrap around: certain partial-checksum NICs (eri, ce) limit 1905 * the size of "start offset" width to 6-bit. This effectively 1906 * sets the largest value of the offset to 64-bytes, starting 1907 * from the MAC header. When the cumulative MAC and IP headers 1908 * exceed such limit, the offset will wrap around. This causes 1909 * the checksum to be calculated at the wrong place. 1910 * 1911 * IPv4 source routing: none of the full-checksum capable NICs 1912 * is capable of correctly handling the IPv4 source-routing 1913 * option for purposes of calculating the pseudo-header; the 1914 * actual destination is different from the destination in the 1915 * header which is that of the next-hop. (This case may not be 1916 * true for NICs which can parse IPv6 extension headers, but 1917 * we choose to simplify the implementation by not offloading 1918 * checksum when they are present.) 1919 */ 1920 if (!IS_SIMPLE_IPH(ipha)) { 1921 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM; 1922 /* An IS_UNDER_IPMP ill is ok here */ 1923 if (ip_output_options(mp, ipha, ixa, ill)) { 1924 /* Packet has been consumed and ICMP error sent */ 1925 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1926 return (EINVAL); 1927 } 1928 } 1929 1930 /* 1931 * To handle IPsec/iptun's labeling needs we need to tag packets 1932 * while we still have ixa_tsl 1933 */ 1934 if (is_system_labeled() && ixa->ixa_tsl != NULL && 1935 (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || 1936 ill->ill_mactype == DL_IPV6)) { 1937 cred_t *newcr; 1938 1939 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, 1940 KM_NOSLEEP); 1941 if (newcr == NULL) { 1942 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1943 ip_drop_output("ipIfStatsOutDiscards - newcr", 1944 mp, ill); 1945 freemsg(mp); 1946 return (ENOBUFS); 1947 } 1948 mblk_setcred(mp, newcr, NOPID); 1949 crfree(newcr); /* mblk_setcred did its own crhold */ 1950 } 1951 1952 if (ixa->ixa_pktlen > ixa->ixa_fragsize || 1953 (ixaflags & IXAF_IPSEC_SECURE)) { 1954 uint32_t pktlen; 1955 1956 pktlen = ixa->ixa_pktlen; 1957 if (ixaflags & IXAF_IPSEC_SECURE) 1958 pktlen += ipsec_out_extra_length(ixa); 1959 1960 if (pktlen > IP_MAXPACKET) 1961 return (EMSGSIZE); 1962 1963 if (ixaflags & IXAF_SET_ULP_CKSUM) { 1964 /* 1965 * Compute ULP checksum and IP header checksum 1966 * using software 1967 */ 1968 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) { 1969 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1970 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 1971 freemsg(mp); 1972 return (EINVAL); 1973 } 1974 } else { 1975 /* Calculate IPv4 header checksum */ 1976 ipha->ipha_hdr_checksum = 0; 1977 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1978 } 1979 1980 /* 1981 * If this packet would generate a icmp_frag_needed 1982 * message, we need to handle it before we do the IPsec 1983 * processing. Otherwise, we need to strip the IPsec 1984 * headers before we send up the message to the ULPs 1985 * which becomes messy and difficult. 1986 * 1987 * We check using IXAF_DONTFRAG. The DF bit in the header 1988 * is not inspected - it will be copied to any generated 1989 * fragments. 1990 */ 1991 if ((pktlen > ixa->ixa_fragsize) && 1992 (ixaflags & IXAF_DONTFRAG)) { 1993 /* Generate ICMP and return error */ 1994 ip_recv_attr_t iras; 1995 1996 DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen, 1997 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 1998 uint_t, ixa->ixa_pmtu); 1999 2000 bzero(&iras, sizeof (iras)); 2001 /* Map ixa to ira including IPsec policies */ 2002 ipsec_out_to_in(ixa, ill, &iras); 2003 2004 ip_drop_output("ICMP_FRAG_NEEDED", mp, ill); 2005 icmp_frag_needed(mp, ixa->ixa_fragsize, &iras); 2006 /* We moved any IPsec refs from ixa to iras */ 2007 ira_cleanup(&iras, B_FALSE); 2008 return (EMSGSIZE); 2009 } 2010 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen, 2011 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2012 uint_t, ixa->ixa_pmtu); 2013 2014 if (ixaflags & IXAF_IPSEC_SECURE) { 2015 /* 2016 * Pass in sufficient information so that 2017 * IPsec can determine whether to fragment, and 2018 * which function to call after fragmentation. 2019 */ 2020 return (ipsec_out_process(mp, ixa)); 2021 } 2022 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags, 2023 ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, 2024 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, 2025 ixa->ixa_postfragfn, &ixa->ixa_cookie)); 2026 } 2027 if (ixaflags & IXAF_SET_ULP_CKSUM) { 2028 /* Compute ULP checksum and IP header checksum */ 2029 /* An IS_UNDER_IPMP ill is ok here */ 2030 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) { 2031 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2032 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2033 freemsg(mp); 2034 return (EINVAL); 2035 } 2036 } else { 2037 /* Calculate IPv4 header checksum */ 2038 ipha->ipha_hdr_checksum = 0; 2039 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2040 } 2041 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, 2042 ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, 2043 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); 2044 } 2045 2046 /* 2047 * Send mp into ip_input 2048 * Common for IPv4 and IPv6 2049 */ 2050 void 2051 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2052 uint_t pkt_len, zoneid_t nolzid) 2053 { 2054 rtc_t rtc; 2055 ill_t *ill = nce->nce_ill; 2056 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 2057 ncec_t *ncec; 2058 2059 ncec = nce->nce_common; 2060 iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM | 2061 IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; 2062 if (ncec->ncec_flags & NCE_F_BCAST) 2063 iras.ira_flags |= IRAF_L2DST_BROADCAST; 2064 else if (ncec->ncec_flags & NCE_F_MCAST) 2065 iras.ira_flags |= IRAF_L2DST_MULTICAST; 2066 2067 iras.ira_free_flags = 0; 2068 iras.ira_cred = NULL; 2069 iras.ira_cpid = NOPID; 2070 iras.ira_tsl = NULL; 2071 iras.ira_zoneid = ALL_ZONES; 2072 iras.ira_pktlen = pkt_len; 2073 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen); 2074 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 2075 2076 if (ixaflags & IXAF_IS_IPV4) 2077 iras.ira_flags |= IRAF_IS_IPV4; 2078 2079 iras.ira_ill = iras.ira_rill = ill; 2080 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2081 iras.ira_rifindex = iras.ira_ruifindex; 2082 iras.ira_mhip = NULL; 2083 2084 iras.ira_flags |= ixaflags & IAF_MASK; 2085 iras.ira_no_loop_zoneid = nolzid; 2086 2087 /* Broadcast and multicast doesn't care about the squeue */ 2088 iras.ira_sqp = NULL; 2089 2090 rtc.rtc_ire = NULL; 2091 if (ixaflags & IXAF_IS_IPV4) { 2092 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2093 2094 rtc.rtc_ipaddr = INADDR_ANY; 2095 2096 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 2097 if (rtc.rtc_ire != NULL) { 2098 ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 2099 ire_refrele(rtc.rtc_ire); 2100 } 2101 } else { 2102 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2103 2104 rtc.rtc_ip6addr = ipv6_all_zeros; 2105 2106 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); 2107 if (rtc.rtc_ire != NULL) { 2108 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); 2109 ire_refrele(rtc.rtc_ire); 2110 } 2111 } 2112 /* Any references to clean up? No hold on ira */ 2113 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 2114 ira_cleanup(&iras, B_FALSE); 2115 } 2116 2117 /* 2118 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which 2119 * looks at the IXAF_LOOPBACK_COPY flag. 2120 * Common for IPv4 and IPv6. 2121 * 2122 * If the loopback copy fails (due to no memory) but we send the packet out 2123 * on the wire we return no failure. Only in the case we supress the wire 2124 * sending do we take the loopback failure into account. 2125 * 2126 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. 2127 * Those operations are performed on this packet in ip_xmit() and it would 2128 * be odd to do it twice for the same packet. 2129 */ 2130 int 2131 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2132 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2133 uintptr_t *ixacookie) 2134 { 2135 ill_t *ill = nce->nce_ill; 2136 int error = 0; 2137 2138 /* 2139 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2140 * had looped it back 2141 */ 2142 if (ixaflags & IXAF_LOOPBACK_COPY) { 2143 mblk_t *mp1; 2144 2145 mp1 = copymsg(mp); 2146 if (mp1 == NULL) { 2147 /* Failed to deliver the loopback copy. */ 2148 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2149 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2150 error = ENOBUFS; 2151 } else { 2152 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2153 nolzid); 2154 } 2155 } 2156 2157 /* 2158 * If TTL = 0 then only do the loopback to this host i.e. we are 2159 * done. We are also done if this was the 2160 * loopback interface since it is sufficient 2161 * to loopback one copy of a multicast packet. 2162 */ 2163 if (ixaflags & IXAF_IS_IPV4) { 2164 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2165 2166 if (ipha->ipha_ttl == 0) { 2167 ip_drop_output("multicast ipha_ttl not sent to wire", 2168 mp, ill); 2169 freemsg(mp); 2170 return (error); 2171 } 2172 } else { 2173 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2174 2175 if (ip6h->ip6_hops == 0) { 2176 ip_drop_output("multicast ipha_ttl not sent to wire", 2177 mp, ill); 2178 freemsg(mp); 2179 return (error); 2180 } 2181 } 2182 if (nce->nce_ill->ill_wq == NULL) { 2183 /* Loopback interface */ 2184 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2185 freemsg(mp); 2186 return (error); 2187 } 2188 2189 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2190 ixacookie)); 2191 } 2192 2193 /* 2194 * Post fragmentation function for RTF_MULTIRT routes. 2195 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2196 * checks IXAF_LOOPBACK_COPY. 2197 * 2198 * If no packet is sent due to failures then we return an errno, but if at 2199 * least one succeeded we return zero. 2200 */ 2201 int 2202 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2203 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2204 uintptr_t *ixacookie) 2205 { 2206 irb_t *irb; 2207 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2208 ire_t *ire; 2209 ire_t *ire1; 2210 mblk_t *mp1; 2211 nce_t *nce1; 2212 ill_t *ill = nce->nce_ill; 2213 ill_t *ill1; 2214 ip_stack_t *ipst = ill->ill_ipst; 2215 int error = 0; 2216 int num_sent = 0; 2217 int err; 2218 uint_t ire_type; 2219 ipaddr_t nexthop; 2220 2221 ASSERT(ixaflags & IXAF_IS_IPV4); 2222 2223 /* Check for IXAF_LOOPBACK_COPY */ 2224 if (ixaflags & IXAF_LOOPBACK_COPY) { 2225 mblk_t *mp1; 2226 2227 mp1 = copymsg(mp); 2228 if (mp1 == NULL) { 2229 /* Failed to deliver the loopback copy. */ 2230 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2231 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2232 error = ENOBUFS; 2233 } else { 2234 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2235 nolzid); 2236 } 2237 } 2238 2239 /* 2240 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send 2241 * a copy to each one. 2242 * Use the nce (nexthop) and ipha_dst to find the ire. 2243 * 2244 * MULTIRT is not designed to work with shared-IP zones thus we don't 2245 * need to pass a zoneid or a label to the IRE lookup. 2246 */ 2247 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { 2248 /* Broadcast and multicast case */ 2249 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, 2250 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 2251 } else { 2252 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); 2253 2254 /* Unicast case */ 2255 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, 2256 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); 2257 } 2258 2259 if (ire == NULL || 2260 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2261 !(ire->ire_flags & RTF_MULTIRT)) { 2262 /* Drop */ 2263 ip_drop_output("ip_postfrag_multirt didn't find route", 2264 mp, nce->nce_ill); 2265 if (ire != NULL) 2266 ire_refrele(ire); 2267 return (ENETUNREACH); 2268 } 2269 2270 irb = ire->ire_bucket; 2271 irb_refhold(irb); 2272 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 2273 /* 2274 * For broadcast we can have a mixture of IRE_BROADCAST and 2275 * IRE_HOST due to the manually added IRE_HOSTs that are used 2276 * to trigger the creation of the special CGTP broadcast routes. 2277 * Thus we have to skip if ire_type doesn't match the original. 2278 */ 2279 if (IRE_IS_CONDEMNED(ire1) || 2280 !(ire1->ire_flags & RTF_MULTIRT) || 2281 ire1->ire_type != ire->ire_type) 2282 continue; 2283 2284 /* Do the ire argument one after the loop */ 2285 if (ire1 == ire) 2286 continue; 2287 2288 ill1 = ire_nexthop_ill(ire1); 2289 if (ill1 == NULL) { 2290 /* 2291 * This ire might not have been picked by 2292 * ire_route_recursive, in which case ire_dep might 2293 * not have been setup yet. 2294 * We kick ire_route_recursive to try to resolve 2295 * starting at ire1. 2296 */ 2297 ire_t *ire2; 2298 2299 ire2 = ire_route_recursive_impl_v4(ire1, 2300 ire1->ire_addr, ire1->ire_type, ire1->ire_ill, 2301 ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY, 2302 B_TRUE, 0, ipst, NULL, NULL, NULL); 2303 if (ire2 != NULL) 2304 ire_refrele(ire2); 2305 ill1 = ire_nexthop_ill(ire1); 2306 } 2307 2308 if (ill1 == NULL) { 2309 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2310 ip_drop_output("ipIfStatsOutDiscards - no ill", 2311 mp, ill); 2312 error = ENETUNREACH; 2313 continue; 2314 } 2315 2316 /* Pick the addr and type to use for arp_nce_init */ 2317 if (nce->nce_common->ncec_flags & NCE_F_BCAST) { 2318 ire_type = IRE_BROADCAST; 2319 nexthop = ire1->ire_gateway_addr; 2320 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { 2321 ire_type = IRE_MULTICAST; 2322 nexthop = ipha->ipha_dst; 2323 } else { 2324 ire_type = ire1->ire_type; /* Doesn't matter */ 2325 nexthop = ire1->ire_gateway_addr; 2326 } 2327 2328 /* If IPMP meta or under, then we just drop */ 2329 if (ill1->ill_grp != NULL) { 2330 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2331 ip_drop_output("ipIfStatsOutDiscards - IPMP", 2332 mp, ill1); 2333 ill_refrele(ill1); 2334 error = ENETUNREACH; 2335 continue; 2336 } 2337 2338 nce1 = arp_nce_init(ill1, nexthop, ire_type); 2339 if (nce1 == NULL) { 2340 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2341 ip_drop_output("ipIfStatsOutDiscards - no nce", 2342 mp, ill1); 2343 ill_refrele(ill1); 2344 error = ENETUNREACH; 2345 continue; 2346 } 2347 mp1 = copymsg(mp); 2348 if (mp1 == NULL) { 2349 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2350 ip_drop_output("ipIfStatsOutDiscards", mp, ill1); 2351 nce_refrele(nce1); 2352 ill_refrele(ill1); 2353 error = ENOBUFS; 2354 continue; 2355 } 2356 /* Preserve HW checksum for this copy */ 2357 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 2358 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 2359 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 2360 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 2361 DB_LSOMSS(mp1) = DB_LSOMSS(mp); 2362 2363 ire1->ire_ob_pkt_count++; 2364 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, 2365 0, ixacookie); 2366 if (err == 0) 2367 num_sent++; 2368 else 2369 error = err; 2370 nce_refrele(nce1); 2371 ill_refrele(ill1); 2372 } 2373 irb_refrele(irb); 2374 ire_refrele(ire); 2375 /* Finally, the main one */ 2376 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2377 ixacookie); 2378 if (err == 0) 2379 num_sent++; 2380 else 2381 error = err; 2382 if (num_sent > 0) 2383 return (0); 2384 else 2385 return (error); 2386 } 2387 2388 /* 2389 * Verify local connectivity. This check is called by ULP fusion code. 2390 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if 2391 * the interface is brought down and back up. So we simply fail the local 2392 * process. The caller, TCP Fusion, should unfuse the connection. 2393 */ 2394 boolean_t 2395 ip_output_verify_local(ip_xmit_attr_t *ixa) 2396 { 2397 ire_t *ire = ixa->ixa_ire; 2398 2399 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) 2400 return (B_FALSE); 2401 2402 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); 2403 } 2404 2405 /* 2406 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. 2407 * 2408 * The caller must call ip_output_verify_local() first. This function handles 2409 * IPobs, FW_HOOKS, and/or IPsec cases sequentially. 2410 */ 2411 mblk_t * 2412 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out, 2413 boolean_t hooks_in, conn_t *peer_connp) 2414 { 2415 ill_t *ill = ixa->ixa_ire->ire_ill; 2416 ipha_t *ipha = NULL; 2417 ip6_t *ip6h = NULL; 2418 ip_stack_t *ipst = ixa->ixa_ipst; 2419 iaflags_t ixaflags = ixa->ixa_flags; 2420 ip_recv_attr_t iras; 2421 int error; 2422 2423 ASSERT(mp != NULL); 2424 2425 if (ixaflags & IXAF_IS_IPV4) { 2426 ipha = (ipha_t *)mp->b_rptr; 2427 2428 /* 2429 * If a callback is enabled then we need to know the 2430 * source and destination zoneids for the packet. We already 2431 * have those handy. 2432 */ 2433 if (ipst->ips_ip4_observe.he_interested) { 2434 zoneid_t szone, dzone; 2435 zoneid_t stackzoneid; 2436 2437 stackzoneid = netstackid_to_zoneid( 2438 ipst->ips_netstack->netstack_stackid); 2439 2440 if (stackzoneid == GLOBAL_ZONEID) { 2441 /* Shared-IP zone */ 2442 dzone = ixa->ixa_ire->ire_zoneid; 2443 szone = ixa->ixa_zoneid; 2444 } else { 2445 szone = dzone = stackzoneid; 2446 } 2447 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2448 ipst); 2449 } 2450 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2451 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2452 NULL, int, 1); 2453 2454 /* FW_HOOKS: LOOPBACK_OUT */ 2455 if (hooks_out) { 2456 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 2457 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 2458 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 2459 ipst->ips_ipv4firewall_loopback_out, 2460 NULL, ill, ipha, mp, mp, 0, ipst, error); 2461 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 2462 } 2463 if (mp == NULL) 2464 return (NULL); 2465 2466 /* FW_HOOKS: LOOPBACK_IN */ 2467 if (hooks_in) { 2468 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 2469 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 2470 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 2471 ipst->ips_ipv4firewall_loopback_in, 2472 ill, NULL, ipha, mp, mp, 0, ipst, error); 2473 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 2474 } 2475 if (mp == NULL) 2476 return (NULL); 2477 2478 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2479 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2480 NULL, int, 1); 2481 2482 /* Inbound IPsec polocies */ 2483 if (peer_connp != NULL) { 2484 /* Map ixa to ira including IPsec policies. */ 2485 ipsec_out_to_in(ixa, ill, &iras); 2486 mp = ipsec_check_inbound_policy(mp, peer_connp, ipha, 2487 NULL, &iras); 2488 } 2489 } else { 2490 ip6h = (ip6_t *)mp->b_rptr; 2491 2492 /* 2493 * If a callback is enabled then we need to know the 2494 * source and destination zoneids for the packet. We already 2495 * have those handy. 2496 */ 2497 if (ipst->ips_ip6_observe.he_interested) { 2498 zoneid_t szone, dzone; 2499 zoneid_t stackzoneid; 2500 2501 stackzoneid = netstackid_to_zoneid( 2502 ipst->ips_netstack->netstack_stackid); 2503 2504 if (stackzoneid == GLOBAL_ZONEID) { 2505 /* Shared-IP zone */ 2506 dzone = ixa->ixa_ire->ire_zoneid; 2507 szone = ixa->ixa_zoneid; 2508 } else { 2509 szone = dzone = stackzoneid; 2510 } 2511 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2512 ipst); 2513 } 2514 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2515 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2516 ip6h, int, 1); 2517 2518 /* FW_HOOKS: LOOPBACK_OUT */ 2519 if (hooks_out) { 2520 DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, 2521 ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); 2522 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 2523 ipst->ips_ipv6firewall_loopback_out, 2524 NULL, ill, ip6h, mp, mp, 0, ipst, error); 2525 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); 2526 } 2527 if (mp == NULL) 2528 return (NULL); 2529 2530 /* FW_HOOKS: LOOPBACK_IN */ 2531 if (hooks_in) { 2532 DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, 2533 ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); 2534 FW_HOOKS6(ipst->ips_ip6_loopback_in_event, 2535 ipst->ips_ipv6firewall_loopback_in, 2536 ill, NULL, ip6h, mp, mp, 0, ipst, error); 2537 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); 2538 } 2539 if (mp == NULL) 2540 return (NULL); 2541 2542 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2543 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2544 ip6h, int, 1); 2545 2546 /* Inbound IPsec polocies */ 2547 if (peer_connp != NULL) { 2548 /* Map ixa to ira including IPsec policies. */ 2549 ipsec_out_to_in(ixa, ill, &iras); 2550 mp = ipsec_check_inbound_policy(mp, peer_connp, NULL, 2551 ip6h, &iras); 2552 } 2553 } 2554 2555 if (mp == NULL) { 2556 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2557 ip_drop_input("ipIfStatsInDiscards", NULL, ill); 2558 } 2559 2560 return (mp); 2561 } 2562