1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/strsubr.h> 30 #include <sys/dlpi.h> 31 #include <sys/strsun.h> 32 #include <sys/zone.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/cmn_err.h> 36 #include <sys/debug.h> 37 #include <sys/atomic.h> 38 39 #include <sys/systm.h> 40 #include <sys/param.h> 41 #include <sys/kmem.h> 42 #include <sys/sdt.h> 43 #include <sys/socket.h> 44 #include <sys/mac.h> 45 #include <net/if.h> 46 #include <net/if_arp.h> 47 #include <net/route.h> 48 #include <sys/sockio.h> 49 #include <netinet/in.h> 50 #include <net/if_dl.h> 51 52 #include <inet/common.h> 53 #include <inet/mi.h> 54 #include <inet/mib2.h> 55 #include <inet/nd.h> 56 #include <inet/arp.h> 57 #include <inet/snmpcom.h> 58 #include <inet/kstatcom.h> 59 60 #include <netinet/igmp_var.h> 61 #include <netinet/ip6.h> 62 #include <netinet/icmp6.h> 63 #include <netinet/sctp.h> 64 65 #include <inet/ip.h> 66 #include <inet/ip_impl.h> 67 #include <inet/ip6.h> 68 #include <inet/ip6_asp.h> 69 #include <inet/tcp.h> 70 #include <inet/ip_multi.h> 71 #include <inet/ip_if.h> 72 #include <inet/ip_ire.h> 73 #include <inet/ip_ftable.h> 74 #include <inet/ip_rts.h> 75 #include <inet/optcom.h> 76 #include <inet/ip_ndp.h> 77 #include <inet/ip_listutils.h> 78 #include <netinet/igmp.h> 79 #include <netinet/ip_mroute.h> 80 #include <inet/ipp_common.h> 81 82 #include <net/pfkeyv2.h> 83 #include <inet/sadb.h> 84 #include <inet/ipsec_impl.h> 85 #include <inet/ipdrop.h> 86 #include <inet/ip_netinfo.h> 87 88 #include <sys/pattr.h> 89 #include <inet/ipclassifier.h> 90 #include <inet/sctp_ip.h> 91 #include <inet/sctp/sctp_impl.h> 92 #include <inet/udp_impl.h> 93 #include <sys/sunddi.h> 94 95 #include <sys/tsol/label.h> 96 #include <sys/tsol/tnet.h> 97 98 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 99 100 #ifdef DEBUG 101 extern boolean_t skip_sctp_cksum; 102 #endif 103 104 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 105 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 106 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); 107 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 108 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 109 110 /* 111 * There are two types of output functions for IP used for different 112 * purposes: 113 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 114 * is no context in the form of a conn_t. However, there is a 115 * ip_xmit_attr_t that the callers use to influence interface selection 116 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 117 * 118 * - conn_ip_output() is used when sending packets with a conn_t and 119 * ip_set_destination has been called to cache information. In that case 120 * various socket options are recorded in the ip_xmit_attr_t and should 121 * be taken into account. 122 */ 123 124 /* 125 * The caller *must* have called conn_connect() or ip_attr_connect() 126 * before calling conn_ip_output(). The caller needs to redo that each time 127 * the destination IP address or port changes, as well as each time there is 128 * a change to any socket option that would modify how packets are routed out 129 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). 130 * 131 * The ULP caller has to serialize the use of a single ip_xmit_attr_t. 132 * We assert for that here. 133 */ 134 int 135 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa) 136 { 137 iaflags_t ixaflags = ixa->ixa_flags; 138 ire_t *ire; 139 nce_t *nce; 140 dce_t *dce; 141 ill_t *ill; 142 ip_stack_t *ipst = ixa->ixa_ipst; 143 int error; 144 145 /* We defer ipIfStatsHCOutRequests until an error or we have an ill */ 146 147 ASSERT(ixa->ixa_ire != NULL); 148 /* Note there is no ixa_nce when reject and blackhole routes */ 149 ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */ 150 151 #ifdef DEBUG 152 ASSERT(ixa->ixa_curthread == NULL); 153 ixa->ixa_curthread = curthread; 154 #endif 155 156 /* 157 * Even on labeled systems we can have a NULL ixa_tsl e.g., 158 * for IGMP/MLD traffic. 159 */ 160 161 ire = ixa->ixa_ire; 162 163 /* 164 * If the ULP says the (old) IRE resulted in reachability we 165 * record this before determine whether to use a new IRE. 166 * No locking for performance reasons. 167 */ 168 if (ixaflags & IXAF_REACH_CONF) 169 ire->ire_badcnt = 0; 170 171 /* 172 * Has routing changed since we cached the results of the lookup? 173 * 174 * This check captures all of: 175 * - the cached ire being deleted (by means of the special 176 * IRE_GENERATION_CONDEMNED) 177 * - A potentially better ire being added (ire_generation being 178 * increased) 179 * - A deletion of the nexthop ire that was used when we did the 180 * lookup. 181 * - An addition of a potentially better nexthop ire. 182 * The last two are handled by walking and increasing the generation 183 * number on all dependant IREs in ire_flush_cache(). 184 * 185 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE 186 * since we ensure that each time we set ixa_ire to such an IRE we 187 * make sure the ixa_ire_generation does not match (by using 188 * IRE_GENERATION_VERIFY). 189 */ 190 if (ire->ire_generation != ixa->ixa_ire_generation) { 191 error = ip_verify_ire(mp, ixa); 192 if (error != 0) { 193 ip_drop_output("ipIfStatsOutDiscards - verify ire", 194 mp, NULL); 195 goto drop; 196 } 197 ire = ixa->ixa_ire; 198 ASSERT(ire != NULL); 199 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 200 #ifdef DEBUG 201 ASSERT(ixa->ixa_curthread == curthread); 202 ixa->ixa_curthread = NULL; 203 #endif 204 ire->ire_ob_pkt_count++; 205 /* ixa_dce might be condemned; use default one */ 206 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, 207 &ipst->ips_dce_default->dce_ident)); 208 } 209 /* 210 * If the ncec changed then ip_verify_ire already set 211 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 212 * so we can recheck the interface mtu. 213 */ 214 215 /* 216 * Note that ire->ire_generation could already have changed. 217 * We catch that next time we send a packet. 218 */ 219 } 220 221 /* 222 * No need to lock access to ixa_nce since the ip_xmit_attr usage 223 * is single threaded. 224 */ 225 ASSERT(ixa->ixa_nce != NULL); 226 nce = ixa->ixa_nce; 227 if (nce->nce_is_condemned) { 228 error = ip_verify_nce(mp, ixa); 229 /* 230 * In case ZEROCOPY capability become not available, we 231 * copy the message and free the original one. We might 232 * be copying more data than needed but it doesn't hurt 233 * since such change rarely happens. 234 */ 235 switch (error) { 236 case 0: 237 break; 238 case ENOTSUP: { /* ZEROCOPY */ 239 mblk_t *nmp; 240 241 if ((nmp = copymsg(mp)) != NULL) { 242 freemsg(mp); 243 mp = nmp; 244 245 break; 246 } 247 } 248 /* FALLTHROUGH */ 249 default: 250 ip_drop_output("ipIfStatsOutDiscards - verify nce", 251 mp, NULL); 252 goto drop; 253 } 254 ire = ixa->ixa_ire; 255 ASSERT(ire != NULL); 256 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 257 #ifdef DEBUG 258 ASSERT(ixa->ixa_curthread == curthread); 259 ixa->ixa_curthread = NULL; 260 #endif 261 ire->ire_ob_pkt_count++; 262 /* ixa_dce might be condemned; use default one */ 263 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, 264 ixa, &ipst->ips_dce_default->dce_ident)); 265 } 266 ASSERT(ixa->ixa_nce != NULL); 267 nce = ixa->ixa_nce; 268 269 /* 270 * Note that some other event could already have made 271 * the new nce condemned. We catch that next time we 272 * try to send a packet. 273 */ 274 } 275 /* 276 * If there is no per-destination dce_t then we have a reference to 277 * the default dce_t (which merely contains the dce_ipid). 278 * The generation check captures both the introduction of a 279 * per-destination dce_t (e.g., due to ICMP packet too big) and 280 * any change to the per-destination dce (including it becoming 281 * condemned by use of the special DCE_GENERATION_CONDEMNED). 282 */ 283 dce = ixa->ixa_dce; 284 285 /* 286 * To avoid a periodic timer to increase the path MTU we 287 * look at dce_last_change_time each time we send a packet. 288 */ 289 if (dce->dce_flags & DCEF_PMTU) { 290 int64_t now = LBOLT_FASTPATH64; 291 292 if ((TICK_TO_SEC(now) - dce->dce_last_change_time > 293 ipst->ips_ip_pathmtu_interval)) { 294 /* 295 * Older than 20 minutes. Drop the path MTU information. 296 * Since the path MTU changes as a result of this, 297 * twiddle ixa_dce_generation to make us go through the 298 * dce verification code in conn_ip_output. 299 */ 300 mutex_enter(&dce->dce_lock); 301 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 302 dce->dce_last_change_time = TICK_TO_SEC(now); 303 mutex_exit(&dce->dce_lock); 304 dce_increment_generation(dce); 305 } 306 } 307 308 if (dce->dce_generation != ixa->ixa_dce_generation) { 309 error = ip_verify_dce(mp, ixa); 310 if (error != 0) { 311 ip_drop_output("ipIfStatsOutDiscards - verify dce", 312 mp, NULL); 313 goto drop; 314 } 315 dce = ixa->ixa_dce; 316 317 /* 318 * Note that some other event could already have made the 319 * new dce's generation number change. 320 * We catch that next time we try to send a packet. 321 */ 322 } 323 324 ill = nce->nce_ill; 325 326 /* 327 * An initial ixa_fragsize was set in ip_set_destination 328 * and we update it if any routing changes above. 329 * A change to ill_mtu with ifconfig will increase all dce_generation 330 * so that we will detect that with the generation check. Ditto for 331 * ill_mc_mtu. 332 */ 333 334 /* 335 * Caller needs to make sure IXAF_VERIFY_SRC is not set if 336 * conn_unspec_src. 337 */ 338 if ((ixaflags & IXAF_VERIFY_SOURCE) && 339 ixa->ixa_src_generation != ipst->ips_src_generation) { 340 /* Check if the IP source is still assigned to the host. */ 341 uint_t gen; 342 343 if (!ip_verify_src(mp, ixa, &gen)) { 344 /* Don't send a packet with a source that isn't ours */ 345 error = EADDRNOTAVAIL; 346 ip_drop_output("ipIfStatsOutDiscards - invalid src", 347 mp, NULL); 348 goto drop; 349 } 350 /* The source is still valid - update the generation number */ 351 ixa->ixa_src_generation = gen; 352 } 353 354 /* 355 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 356 * can only count the use prior to fragmentation. However the MIB 357 * counters on the ill will be incremented in post fragmentation. 358 */ 359 ire->ire_ob_pkt_count++; 360 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 361 362 /* 363 * Based on ire_type and ire_flags call one of: 364 * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK 365 * ire_send_multirt_v* - if RTF_MULTIRT 366 * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE 367 * ire_send_multicast_v* - for IRE_MULTICAST 368 * ire_send_broadcast_v4 - for IRE_BROADCAST 369 * ire_send_wire_v* - for the rest. 370 */ 371 #ifdef DEBUG 372 ASSERT(ixa->ixa_curthread == curthread); 373 ixa->ixa_curthread = NULL; 374 #endif 375 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident)); 376 377 drop: 378 if (ixaflags & IXAF_IS_IPV4) { 379 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 380 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 381 } else { 382 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests); 383 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 384 } 385 freemsg(mp); 386 #ifdef DEBUG 387 ASSERT(ixa->ixa_curthread == curthread); 388 ixa->ixa_curthread = NULL; 389 #endif 390 return (error); 391 } 392 393 /* 394 * Handle both IPv4 and IPv6. Sets the generation number 395 * to allow the caller to know when to call us again. 396 * Returns true if the source address in the packet is a valid source. 397 * We handle callers which try to send with a zero address (since we only 398 * get here if UNSPEC_SRC is not set). 399 */ 400 boolean_t 401 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp) 402 { 403 ip_stack_t *ipst = ixa->ixa_ipst; 404 405 /* 406 * Need to grab the generation number before we check to 407 * avoid a race with a change to the set of local addresses. 408 * No lock needed since the thread which updates the set of local 409 * addresses use ipif/ill locks and exit those (hence a store memory 410 * barrier) before doing the atomic increase of ips_src_generation. 411 */ 412 if (generationp != NULL) 413 *generationp = ipst->ips_src_generation; 414 415 if (ixa->ixa_flags & IXAF_IS_IPV4) { 416 ipha_t *ipha = (ipha_t *)mp->b_rptr; 417 418 if (ipha->ipha_src == INADDR_ANY) 419 return (B_FALSE); 420 421 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 422 ipst, B_FALSE) != IPVL_BAD); 423 } else { 424 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 425 uint_t scopeid; 426 427 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) 428 return (B_FALSE); 429 430 if (ixa->ixa_flags & IXAF_SCOPEID_SET) 431 scopeid = ixa->ixa_scopeid; 432 else 433 scopeid = 0; 434 435 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid, 436 ipst, B_FALSE, scopeid) != IPVL_BAD); 437 } 438 } 439 440 /* 441 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use. 442 */ 443 int 444 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa) 445 { 446 uint_t gen; 447 ire_t *ire; 448 nce_t *nce; 449 int error; 450 boolean_t multirt = B_FALSE; 451 452 /* 453 * Redo ip_select_route. 454 * Need to grab generation number as part of the lookup to 455 * avoid race. 456 */ 457 error = 0; 458 ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt); 459 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 460 if (error != 0) { 461 ire_refrele(ire); 462 return (error); 463 } 464 465 if (ixa->ixa_ire != NULL) 466 ire_refrele_notr(ixa->ixa_ire); 467 #ifdef DEBUG 468 ire_refhold_notr(ire); 469 ire_refrele(ire); 470 #endif 471 ixa->ixa_ire = ire; 472 ixa->ixa_ire_generation = gen; 473 if (multirt) { 474 if (ixa->ixa_flags & IXAF_IS_IPV4) 475 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 476 else 477 ixa->ixa_postfragfn = ip_postfrag_multirt_v6; 478 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 479 } else { 480 ixa->ixa_postfragfn = ire->ire_postfragfn; 481 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 482 } 483 484 /* 485 * Don't look for an nce for reject or blackhole. 486 * They have ire_generation set to IRE_GENERATION_VERIFY which 487 * makes conn_ip_output avoid references to ixa_nce. 488 */ 489 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 490 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY); 491 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 492 return (0); 493 } 494 495 /* The NCE could now be different */ 496 nce = ire_to_nce_pkt(ire, mp); 497 if (nce == NULL) { 498 /* 499 * Allocation failure. Make sure we redo ire/nce selection 500 * next time we send. 501 */ 502 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 503 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 504 return (ENOBUFS); 505 } 506 if (nce == ixa->ixa_nce) { 507 /* No change */ 508 nce_refrele(nce); 509 return (0); 510 } 511 512 /* 513 * Since the path MTU might change as a result of this 514 * route change, we twiddle ixa_dce_generation to 515 * make conn_ip_output go through the ip_verify_dce code. 516 */ 517 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 518 519 if (ixa->ixa_nce != NULL) 520 nce_refrele(ixa->ixa_nce); 521 ixa->ixa_nce = nce; 522 return (0); 523 } 524 525 /* 526 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use. 527 */ 528 static int 529 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa) 530 { 531 ire_t *ire = ixa->ixa_ire; 532 nce_t *nce; 533 int error = 0; 534 ipha_t *ipha = NULL; 535 ip6_t *ip6h = NULL; 536 537 if (ire->ire_ipversion == IPV4_VERSION) 538 ipha = (ipha_t *)mp->b_rptr; 539 else 540 ip6h = (ip6_t *)mp->b_rptr; 541 542 nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE); 543 if (nce == NULL) { 544 /* Try to find a better ire */ 545 return (ip_verify_ire(mp, ixa)); 546 } 547 548 /* 549 * The hardware offloading capabilities, for example LSO, of the 550 * interface might have changed, so do sanity verification here. 551 */ 552 if (ixa->ixa_flags & IXAF_VERIFY_LSO) { 553 if (!ip_verify_lso(nce->nce_ill, ixa)) { 554 ASSERT(ixa->ixa_notify != NULL); 555 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 556 IXAN_LSO, 0); 557 error = ENOTSUP; 558 } 559 } 560 561 /* 562 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with 563 * any ZEROCOPY changes. In case ZEROCOPY capability is not available 564 * any more, return error so that conn_ip_output() can take care of 565 * the ZEROCOPY message properly. It's safe to continue send the 566 * message when ZEROCOPY newly become available. 567 */ 568 if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) { 569 if (!ip_verify_zcopy(nce->nce_ill, ixa)) { 570 ASSERT(ixa->ixa_notify != NULL); 571 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 572 IXAN_ZCOPY, 0); 573 if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0) 574 error = ENOTSUP; 575 } 576 } 577 578 /* 579 * Since the path MTU might change as a result of this 580 * change, we twiddle ixa_dce_generation to 581 * make conn_ip_output go through the ip_verify_dce code. 582 */ 583 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 584 585 nce_refrele(ixa->ixa_nce); 586 ixa->ixa_nce = nce; 587 return (error); 588 } 589 590 /* 591 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use. 592 */ 593 static int 594 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa) 595 { 596 dce_t *dce; 597 uint_t gen; 598 uint_t pmtu; 599 600 dce = dce_lookup_pkt(mp, ixa, &gen); 601 ASSERT(dce != NULL); 602 603 dce_refrele_notr(ixa->ixa_dce); 604 #ifdef DEBUG 605 dce_refhold_notr(dce); 606 dce_refrele(dce); 607 #endif 608 ixa->ixa_dce = dce; 609 ixa->ixa_dce_generation = gen; 610 611 /* Extract the (path) mtu from the dce, ncec_ill etc */ 612 pmtu = ip_get_pmtu(ixa); 613 614 /* 615 * Tell ULP about PMTU changes - increase or decrease - by returning 616 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update 617 * both ixa_pmtu and ixa_fragsize appropriately. 618 * 619 * If ULP doesn't set that flag then we need to update ixa_fragsize 620 * since routing could have changed the ill after after ixa_fragsize 621 * was set previously in the conn_ip_output path or in 622 * ip_set_destination. 623 * 624 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu. 625 * 626 * In the case of a path MTU increase we send the packet after the 627 * notify to the ULP. 628 */ 629 if (ixa->ixa_flags & IXAF_VERIFY_PMTU) { 630 if (ixa->ixa_pmtu != pmtu) { 631 uint_t oldmtu = ixa->ixa_pmtu; 632 633 DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu, 634 uint32_t, ixa->ixa_pmtu); 635 ASSERT(ixa->ixa_notify != NULL); 636 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 637 IXAN_PMTU, pmtu); 638 if (pmtu < oldmtu) 639 return (EMSGSIZE); 640 } 641 } else { 642 ixa->ixa_fragsize = pmtu; 643 } 644 return (0); 645 } 646 647 /* 648 * Verify LSO usability. Keep the return value simple to indicate whether 649 * the LSO capability has changed. Handle both IPv4 and IPv6. 650 */ 651 static boolean_t 652 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) 653 { 654 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 655 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab; 656 657 if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 658 /* 659 * Not unsable any more. 660 */ 661 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 662 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 663 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 664 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 665 !ILL_LSO_TCP_IPV4_USABLE(ill) : 666 !ILL_LSO_TCP_IPV6_USABLE(ill))) { 667 ixa->ixa_flags &= ~IXAF_LSO_CAPAB; 668 669 return (B_FALSE); 670 } 671 672 /* 673 * Capability has changed, refresh the copy in ixa. 674 */ 675 if (lsoc->ill_lso_max_tcpv4 != new_lsoc->ill_lso_max_tcpv4 || 676 lsoc->ill_lso_max_tcpv6 != new_lsoc->ill_lso_max_tcpv6) { 677 *lsoc = *new_lsoc; 678 679 return (B_FALSE); 680 } 681 } else { /* Was not usable */ 682 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 683 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 684 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 685 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 686 ILL_LSO_TCP_IPV4_USABLE(ill) : 687 ILL_LSO_TCP_IPV6_USABLE(ill))) { 688 *lsoc = *new_lsoc; 689 ixa->ixa_flags |= IXAF_LSO_CAPAB; 690 691 return (B_FALSE); 692 } 693 } 694 695 return (B_TRUE); 696 } 697 698 /* 699 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether 700 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6. 701 */ 702 static boolean_t 703 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa) 704 { 705 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) { 706 /* 707 * Not unsable any more. 708 */ 709 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 710 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 711 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 712 !ILL_ZCOPY_USABLE(ill)) { 713 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; 714 715 return (B_FALSE); 716 } 717 } else { /* Was not usable */ 718 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 719 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 720 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 721 ILL_ZCOPY_USABLE(ill)) { 722 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; 723 724 return (B_FALSE); 725 } 726 } 727 728 return (B_TRUE); 729 } 730 731 732 /* 733 * When there is no conn_t context, this will send a packet. 734 * The caller must *not* have called conn_connect() or ip_attr_connect() 735 * before calling ip_output_simple(). 736 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH. 737 * Honors IXAF_SET_SOURCE. 738 * 739 * We acquire the ire and after calling ire_sendfn we release 740 * the hold on the ire. Ditto for the nce and dce. 741 * 742 * This assumes that the caller has set the following in ip_xmit_attr_t: 743 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. 744 * If ixa_ifindex is non-zero it means send out that ill. (If it is 745 * an upper IPMP ill we load balance across the group; if a lower we send 746 * on that lower ill without load balancing.) 747 * IXAF_IS_IPV4 must be set correctly. 748 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. 749 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. 750 * If neither of those two are set we do an IPsec policy lookup. 751 * 752 * We handle setting things like 753 * ixa_pktlen 754 * ixa_ip_hdr_length 755 * ixa->ixa_protocol 756 * 757 * The caller may set ixa_xmit_hint, which is used for ECMP selection and 758 * transmit ring selecting in GLD. 759 * 760 * The caller must do an ixa_cleanup() to release any IPsec references 761 * after we return. 762 */ 763 int 764 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) 765 { 766 ts_label_t *effective_tsl = NULL; 767 int err; 768 769 ASSERT(ixa->ixa_ipst != NULL); 770 771 if (is_system_labeled()) { 772 ip_stack_t *ipst = ixa->ixa_ipst; 773 774 if (ixa->ixa_flags & IXAF_IS_IPV4) { 775 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, 776 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 777 &effective_tsl); 778 } else { 779 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, 780 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 781 &effective_tsl); 782 } 783 if (err != 0) { 784 ip2dbg(("tsol_check: label check failed (%d)\n", err)); 785 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 786 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 787 ip_drop_output("tsol_check_label", mp, NULL); 788 freemsg(mp); 789 return (err); 790 } 791 if (effective_tsl != NULL) { 792 /* Update the label */ 793 ip_xmit_attr_replace_tsl(ixa, effective_tsl); 794 } 795 } 796 797 if (ixa->ixa_flags & IXAF_IS_IPV4) 798 return (ip_output_simple_v4(mp, ixa)); 799 else 800 return (ip_output_simple_v6(mp, ixa)); 801 } 802 803 int 804 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) 805 { 806 ipha_t *ipha; 807 ipaddr_t firsthop; /* In IP header */ 808 ipaddr_t dst; /* End of source route, or ipha_dst if none */ 809 ire_t *ire; 810 ipaddr_t setsrc; /* RTF_SETSRC */ 811 int error; 812 ill_t *ill = NULL; 813 dce_t *dce = NULL; 814 nce_t *nce; 815 iaflags_t ixaflags = ixa->ixa_flags; 816 ip_stack_t *ipst = ixa->ixa_ipst; 817 boolean_t repeat = B_FALSE; 818 boolean_t multirt = B_FALSE; 819 int64_t now; 820 821 ipha = (ipha_t *)mp->b_rptr; 822 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 823 824 /* 825 * Even on labeled systems we can have a NULL ixa_tsl e.g., 826 * for IGMP/MLD traffic. 827 */ 828 829 /* Caller already set flags */ 830 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 831 832 ASSERT(ixa->ixa_nce == NULL); 833 834 ixa->ixa_pktlen = ntohs(ipha->ipha_length); 835 ASSERT(ixa->ixa_pktlen == msgdsize(mp)); 836 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 837 ixa->ixa_protocol = ipha->ipha_protocol; 838 839 /* 840 * Assumes that source routed packets have already been massaged by 841 * the ULP (ip_massage_options) and as a result ipha_dst is the next 842 * hop in the source route. The final destination is used for IPsec 843 * policy and DCE lookup. 844 */ 845 firsthop = ipha->ipha_dst; 846 dst = ip_get_dst(ipha); 847 848 repeat_ire: 849 error = 0; 850 setsrc = INADDR_ANY; 851 ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL, 852 &setsrc, &error, &multirt); 853 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 854 if (error != 0) { 855 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 856 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 857 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); 858 freemsg(mp); 859 goto done; 860 } 861 862 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { 863 /* ire_ill might be NULL hence need to skip some code */ 864 if (ixaflags & IXAF_SET_SOURCE) 865 ipha->ipha_src = htonl(INADDR_LOOPBACK); 866 ixa->ixa_fragsize = IP_MAXPACKET; 867 ill = NULL; 868 nce = NULL; 869 ire->ire_ob_pkt_count++; 870 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 871 /* No dce yet; use default one */ 872 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, 873 &ipst->ips_dce_default->dce_ident); 874 goto done; 875 } 876 877 /* Note that ipha_dst is only used for IRE_MULTICAST */ 878 nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 879 if (nce == NULL) { 880 /* Allocation failure? */ 881 ip_drop_output("ire_to_nce", mp, ill); 882 freemsg(mp); 883 error = ENOBUFS; 884 goto done; 885 } 886 if (nce->nce_is_condemned) { 887 nce_t *nce1; 888 889 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); 890 nce_refrele(nce); 891 if (nce1 == NULL) { 892 if (!repeat) { 893 /* Try finding a better IRE */ 894 repeat = B_TRUE; 895 ire_refrele(ire); 896 goto repeat_ire; 897 } 898 /* Tried twice - drop packet */ 899 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 900 ip_drop_output("No nce", mp, ill); 901 freemsg(mp); 902 error = ENOBUFS; 903 goto done; 904 } 905 nce = nce1; 906 } 907 908 /* 909 * For multicast with multirt we have a flag passed back from 910 * ire_lookup_multi_ill_v4 since we don't have an IRE for each 911 * possible multicast address. 912 * We also need a flag for multicast since we can't check 913 * whether RTF_MULTIRT is set in ixa_ire for multicast. 914 */ 915 if (multirt) { 916 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 917 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 918 } else { 919 ixa->ixa_postfragfn = ire->ire_postfragfn; 920 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 921 } 922 ASSERT(ixa->ixa_nce == NULL); 923 ixa->ixa_nce = nce; 924 925 /* 926 * Check for a dce_t with a path mtu. 927 */ 928 dce = dce_lookup_v4(dst, ipst, NULL); 929 ASSERT(dce != NULL); 930 931 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { 932 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 933 } else if (dce->dce_flags & DCEF_PMTU) { 934 /* 935 * To avoid a periodic timer to increase the path MTU we 936 * look at dce_last_change_time each time we send a packet. 937 */ 938 now = ddi_get_lbolt64(); 939 if (TICK_TO_SEC(now) - dce->dce_last_change_time > 940 ipst->ips_ip_pathmtu_interval) { 941 /* 942 * Older than 20 minutes. Drop the path MTU information. 943 */ 944 mutex_enter(&dce->dce_lock); 945 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 946 dce->dce_last_change_time = TICK_TO_SEC(now); 947 mutex_exit(&dce->dce_lock); 948 dce_increment_generation(dce); 949 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 950 } else { 951 uint_t fragsize; 952 953 fragsize = ip_get_base_mtu(nce->nce_ill, ire); 954 if (fragsize > dce->dce_pmtu) 955 fragsize = dce->dce_pmtu; 956 ixa->ixa_fragsize = fragsize; 957 } 958 } else { 959 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 960 } 961 962 /* 963 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp 964 * interface for source address selection. 965 */ 966 ill = ire_nexthop_ill(ire); 967 968 if (ixaflags & IXAF_SET_SOURCE) { 969 ipaddr_t src; 970 971 /* 972 * We use the final destination to get 973 * correct selection for source routed packets 974 */ 975 976 /* If unreachable we have no ill but need some source */ 977 if (ill == NULL) { 978 src = htonl(INADDR_LOOPBACK); 979 error = 0; 980 } else { 981 error = ip_select_source_v4(ill, setsrc, dst, 982 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, 983 &src, NULL, NULL); 984 } 985 if (error != 0) { 986 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 987 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 988 ip_drop_output("ipIfStatsOutDiscards - no source", 989 mp, ill); 990 freemsg(mp); 991 goto done; 992 } 993 ipha->ipha_src = src; 994 } else if (ixaflags & IXAF_VERIFY_SOURCE) { 995 /* Check if the IP source is assigned to the host. */ 996 if (!ip_verify_src(mp, ixa, NULL)) { 997 /* Don't send a packet with a source that isn't ours */ 998 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 999 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 1000 ip_drop_output("ipIfStatsOutDiscards - invalid source", 1001 mp, ill); 1002 freemsg(mp); 1003 error = EADDRNOTAVAIL; 1004 goto done; 1005 } 1006 } 1007 1008 1009 /* 1010 * Check against global IPsec policy to set the AH/ESP attributes. 1011 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 1012 */ 1013 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1014 ASSERT(ixa->ixa_ipsec_policy == NULL); 1015 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); 1016 if (mp == NULL) { 1017 /* MIB and ip_drop_packet already done */ 1018 return (EHOSTUNREACH); /* IPsec policy failure */ 1019 } 1020 } 1021 1022 if (ill != NULL) { 1023 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 1024 } else { 1025 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1026 } 1027 1028 /* 1029 * We update the statistics on the most specific IRE i.e., the first 1030 * one we found. 1031 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 1032 * can only count the use prior to fragmentation. However the MIB 1033 * counters on the ill will be incremented in post fragmentation. 1034 */ 1035 ire->ire_ob_pkt_count++; 1036 1037 /* 1038 * Based on ire_type and ire_flags call one of: 1039 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK 1040 * ire_send_multirt_v4 - if RTF_MULTIRT 1041 * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 1042 * ire_send_multicast_v4 - for IRE_MULTICAST 1043 * ire_send_broadcast_v4 - for IRE_BROADCAST 1044 * ire_send_wire_v4 - for the rest. 1045 */ 1046 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); 1047 done: 1048 ire_refrele(ire); 1049 if (dce != NULL) 1050 dce_refrele(dce); 1051 if (ill != NULL) 1052 ill_refrele(ill); 1053 if (ixa->ixa_nce != NULL) 1054 nce_refrele(ixa->ixa_nce); 1055 ixa->ixa_nce = NULL; 1056 return (error); 1057 } 1058 1059 /* 1060 * ire_sendfn() functions. 1061 * These functions use the following xmit_attr: 1062 * - ixa_fragsize - read to determine whether or not to fragment 1063 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec 1064 * - ixa_ipsec_* are used inside IPsec 1065 * - IXAF_SET_SOURCE - replace IP source in broadcast case. 1066 * - IXAF_LOOPBACK_COPY - for multicast and broadcast 1067 */ 1068 1069 1070 /* 1071 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK 1072 * 1073 * The checks for restrict_interzone_loopback are done in ire_route_recursive. 1074 */ 1075 /* ARGSUSED4 */ 1076 int 1077 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1078 ip_xmit_attr_t *ixa, uint32_t *identp) 1079 { 1080 ipha_t *ipha = (ipha_t *)iph_arg; 1081 ip_stack_t *ipst = ixa->ixa_ipst; 1082 ill_t *ill = ire->ire_ill; 1083 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 1084 uint_t pktlen = ixa->ixa_pktlen; 1085 1086 /* 1087 * No fragmentation, no nce, no application of IPsec, 1088 * and no ipha_ident assignment. 1089 * 1090 * Note different order between IP provider and FW_HOOKS than in 1091 * send_wire case. 1092 */ 1093 1094 /* 1095 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1096 * send probe, but not the receive probe. 1097 */ 1098 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1099 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1100 int, 1); 1101 1102 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1103 int error = 0; 1104 1105 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1106 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1107 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1108 ipst->ips_ipv4firewall_loopback_out, 1109 NULL, ill, ipha, mp, mp, 0, ipst, error); 1110 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1111 if (mp == NULL) 1112 return (error); 1113 1114 /* 1115 * Even if the destination was changed by the filter we use the 1116 * forwarding decision that was made based on the address 1117 * in ip_output/ip_set_destination. 1118 */ 1119 /* Length could be different */ 1120 ipha = (ipha_t *)mp->b_rptr; 1121 pktlen = ntohs(ipha->ipha_length); 1122 } 1123 1124 /* 1125 * If a callback is enabled then we need to know the 1126 * source and destination zoneids for the packet. We already 1127 * have those handy. 1128 */ 1129 if (ipst->ips_ip4_observe.he_interested) { 1130 zoneid_t szone, dzone; 1131 zoneid_t stackzoneid; 1132 1133 stackzoneid = netstackid_to_zoneid( 1134 ipst->ips_netstack->netstack_stackid); 1135 1136 if (stackzoneid == GLOBAL_ZONEID) { 1137 /* Shared-IP zone */ 1138 dzone = ire->ire_zoneid; 1139 szone = ixa->ixa_zoneid; 1140 } else { 1141 szone = dzone = stackzoneid; 1142 } 1143 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1144 } 1145 1146 /* Handle lo0 stats */ 1147 ipst->ips_loopback_packets++; 1148 1149 /* Map ixa to ira including IPsec policies */ 1150 ipsec_out_to_in(ixa, ill, &iras); 1151 iras.ira_pktlen = pktlen; 1152 1153 if (!IS_SIMPLE_IPH(ipha)) { 1154 ip_output_local_options(ipha, ipst); 1155 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1156 } 1157 1158 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1159 int error = 0; 1160 1161 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1162 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1163 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1164 ipst->ips_ipv4firewall_loopback_in, 1165 ill, NULL, ipha, mp, mp, 0, ipst, error); 1166 1167 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1168 if (mp == NULL) { 1169 ira_cleanup(&iras, B_FALSE); 1170 return (error); 1171 } 1172 /* 1173 * Even if the destination was changed by the filter we use the 1174 * forwarding decision that was made based on the address 1175 * in ip_output/ip_set_destination. 1176 */ 1177 /* Length could be different */ 1178 ipha = (ipha_t *)mp->b_rptr; 1179 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1180 } 1181 1182 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1183 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1184 int, 1); 1185 1186 ire->ire_ib_pkt_count++; 1187 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1188 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1189 1190 /* Destined to ire_zoneid - use that for fanout */ 1191 iras.ira_zoneid = ire->ire_zoneid; 1192 1193 if (is_system_labeled()) { 1194 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1195 1196 /* 1197 * This updates ira_cred, ira_tsl and ira_free_flags based 1198 * on the label. We don't expect this to ever fail for 1199 * loopback packets, so we silently drop the packet should it 1200 * fail. 1201 */ 1202 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1203 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1204 ip_drop_input("tsol_get_pkt_label", mp, ill); 1205 freemsg(mp); 1206 return (0); 1207 } 1208 ASSERT(iras.ira_tsl != NULL); 1209 1210 /* tsol_get_pkt_label sometimes does pullupmsg */ 1211 ipha = (ipha_t *)mp->b_rptr; 1212 } 1213 1214 ip_fanout_v4(mp, ipha, &iras); 1215 1216 /* We moved any IPsec refs from ixa to iras */ 1217 ira_cleanup(&iras, B_FALSE); 1218 return (0); 1219 } 1220 1221 /* 1222 * ire_sendfn for IRE_BROADCAST 1223 * If the broadcast address is present on multiple ills and ixa_ifindex 1224 * isn't set, then we generate 1225 * a separate datagram (potentially with different source address) for 1226 * those ills. In any case, only one copy is looped back to ip_input_v4. 1227 */ 1228 int 1229 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1230 ip_xmit_attr_t *ixa, uint32_t *identp) 1231 { 1232 ipha_t *ipha = (ipha_t *)iph_arg; 1233 ip_stack_t *ipst = ixa->ixa_ipst; 1234 irb_t *irb = ire->ire_bucket; 1235 ire_t *ire1; 1236 mblk_t *mp1; 1237 ipha_t *ipha1; 1238 iaflags_t ixaflags = ixa->ixa_flags; 1239 nce_t *nce1, *nce_orig; 1240 1241 /* 1242 * Unless ire_send_multirt_v4 already set a ttl, force the 1243 * ttl to a smallish value. 1244 */ 1245 if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { 1246 /* 1247 * To avoid broadcast storms, we usually set the TTL to 1 for 1248 * broadcasts. This can 1249 * be overridden stack-wide through the ip_broadcast_ttl 1250 * ndd tunable, or on a per-connection basis through the 1251 * IP_BROADCAST_TTL socket option. 1252 * 1253 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 1254 * will force ttl to one after we've set this. 1255 */ 1256 if (ixaflags & IXAF_BROADCAST_TTL_SET) 1257 ipha->ipha_ttl = ixa->ixa_broadcast_ttl; 1258 else 1259 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 1260 } 1261 /* 1262 * Make sure we get a loopback copy (after IPsec and frag) 1263 * Skip hardware checksum so that loopback copy is checksumed. 1264 */ 1265 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1266 1267 /* Do we need to potentially generate multiple copies? */ 1268 if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) 1269 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1270 1271 /* 1272 * Loop over all IRE_BROADCAST in the bucket (might only be one). 1273 * Note that everything in the bucket has the same destination address. 1274 */ 1275 irb_refhold(irb); 1276 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1277 /* We do the main IRE after the end of the loop */ 1278 if (ire1 == ire) 1279 continue; 1280 1281 /* 1282 * Only IREs for the same IP address should be in the same 1283 * bucket. 1284 * But could have IRE_HOSTs in the case of CGTP. 1285 * If we find any multirt routes we bail out of the loop 1286 * and just do the single packet at the end; ip_postfrag_multirt 1287 * will duplicate the packet. 1288 */ 1289 ASSERT(ire1->ire_addr == ire->ire_addr); 1290 if (!(ire1->ire_type & IRE_BROADCAST)) 1291 continue; 1292 1293 if (IRE_IS_CONDEMNED(ire1)) 1294 continue; 1295 1296 if (ixa->ixa_zoneid != ALL_ZONES && 1297 ire->ire_zoneid != ire1->ire_zoneid) 1298 continue; 1299 1300 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); 1301 1302 if (ire1->ire_flags & RTF_MULTIRT) 1303 break; 1304 1305 /* 1306 * For IPMP we only send for the ipmp_ill. arp_nce_init() will 1307 * ensure that this goes out on the cast_ill. 1308 */ 1309 if (IS_UNDER_IPMP(ire1->ire_ill)) 1310 continue; 1311 1312 mp1 = copymsg(mp); 1313 if (mp1 == NULL) { 1314 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1315 ipIfStatsOutDiscards); 1316 ip_drop_output("ipIfStatsOutDiscards", 1317 mp, ire1->ire_ill); 1318 continue; 1319 } 1320 1321 ipha1 = (ipha_t *)mp1->b_rptr; 1322 if (ixa->ixa_flags & IXAF_SET_SOURCE) { 1323 /* 1324 * Need to pick a different source address for each 1325 * interface. If we have a global IPsec policy and 1326 * no per-socket policy then we punt to 1327 * ip_output_simple_v4 using a separate ip_xmit_attr_t. 1328 */ 1329 if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { 1330 ip_output_simple_broadcast(ixa, mp1); 1331 continue; 1332 } 1333 /* Pick a new source address for each interface */ 1334 if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, 1335 ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, 1336 &ipha1->ipha_src, NULL, NULL) != 0) { 1337 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1338 ipIfStatsOutDiscards); 1339 ip_drop_output("ipIfStatsOutDiscards - select " 1340 "broadcast source", mp1, ire1->ire_ill); 1341 freemsg(mp1); 1342 continue; 1343 } 1344 /* 1345 * Check against global IPsec policy to set the AH/ESP 1346 * attributes. IPsec will set IXAF_IPSEC_* and 1347 * ixa_ipsec_* as appropriate. 1348 */ 1349 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1350 ASSERT(ixa->ixa_ipsec_policy == NULL); 1351 mp1 = ip_output_attach_policy(mp1, ipha, NULL, 1352 NULL, ixa); 1353 if (mp1 == NULL) { 1354 /* 1355 * MIB and ip_drop_packet already 1356 * done 1357 */ 1358 continue; 1359 } 1360 } 1361 } 1362 /* Make sure we have an NCE on this ill */ 1363 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, 1364 ire1->ire_type); 1365 if (nce1 == NULL) { 1366 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1367 ipIfStatsOutDiscards); 1368 ip_drop_output("ipIfStatsOutDiscards - broadcast nce", 1369 mp1, ire1->ire_ill); 1370 freemsg(mp1); 1371 continue; 1372 } 1373 nce_orig = ixa->ixa_nce; 1374 ixa->ixa_nce = nce1; 1375 1376 ire_refhold(ire1); 1377 /* 1378 * Ignore any errors here. We just collect the errno for 1379 * the main ire below 1380 */ 1381 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); 1382 ire_refrele(ire1); 1383 1384 ixa->ixa_nce = nce_orig; 1385 nce_refrele(nce1); 1386 1387 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; 1388 } 1389 irb_refrele(irb); 1390 /* Finally, the main one */ 1391 1392 /* 1393 * For IPMP we only send broadcasts on the ipmp_ill. 1394 */ 1395 if (IS_UNDER_IPMP(ire->ire_ill)) { 1396 freemsg(mp); 1397 return (0); 1398 } 1399 1400 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1401 } 1402 1403 /* 1404 * Send a packet using a different source address and different 1405 * IPsec policy. 1406 */ 1407 static void 1408 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) 1409 { 1410 ip_xmit_attr_t ixas; 1411 1412 bzero(&ixas, sizeof (ixas)); 1413 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 1414 ixas.ixa_zoneid = ixa->ixa_zoneid; 1415 ixas.ixa_ifindex = 0; 1416 ixas.ixa_ipst = ixa->ixa_ipst; 1417 ixas.ixa_cred = ixa->ixa_cred; 1418 ixas.ixa_cpid = ixa->ixa_cpid; 1419 ixas.ixa_tsl = ixa->ixa_tsl; 1420 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1421 1422 (void) ip_output_simple(mp, &ixas); 1423 ixa_cleanup(&ixas); 1424 } 1425 1426 1427 static void 1428 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) 1429 { 1430 ip_stack_t *ipst = ixa->ixa_ipst; 1431 1432 /* Limit the TTL on multirt packets */ 1433 if (ire->ire_type & IRE_MULTICAST) { 1434 if (ipha->ipha_ttl > 1) { 1435 ip2dbg(("ire_send_multirt_v4: forcing multicast " 1436 "multirt TTL to 1 (was %d), dst 0x%08x\n", 1437 ipha->ipha_ttl, ntohl(ire->ire_addr))); 1438 ipha->ipha_ttl = 1; 1439 } 1440 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1441 } else if ((ipst->ips_ip_multirt_ttl > 0) && 1442 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 1443 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 1444 /* 1445 * Need to ensure we don't increase the ttl should we go through 1446 * ire_send_broadcast or multicast. 1447 */ 1448 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1449 } 1450 } 1451 1452 /* 1453 * ire_sendfn for IRE_MULTICAST 1454 */ 1455 int 1456 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1457 ip_xmit_attr_t *ixa, uint32_t *identp) 1458 { 1459 ipha_t *ipha = (ipha_t *)iph_arg; 1460 ip_stack_t *ipst = ixa->ixa_ipst; 1461 ill_t *ill = ire->ire_ill; 1462 iaflags_t ixaflags = ixa->ixa_flags; 1463 1464 /* 1465 * The IRE_MULTICAST is the same whether or not multirt is in use. 1466 * Hence we need special-case code. 1467 */ 1468 if (ixaflags & IXAF_MULTIRT_MULTICAST) 1469 multirt_check_v4(ire, ipha, ixa); 1470 1471 /* 1472 * Check if anything in ip_input_v4 wants a copy of the transmitted 1473 * packet (after IPsec and fragmentation) 1474 * 1475 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set 1476 * RSVP and the rsvp daemon is an example of a 1477 * protocol and user level process that 1478 * handles it's own routing. Hence, it uses the 1479 * SO_DONTROUTE option to accomplish this. 1480 * 2. If the sender has set IP_MULTICAST_LOOP, then we just 1481 * check whether there are any receivers for the group on the ill 1482 * (ignoring the zoneid). 1483 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are 1484 * any members in other shared-IP zones. 1485 * If such members exist, then we indicate that the sending zone 1486 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP 1487 * behavior. 1488 * 1489 * When we loopback we skip hardware checksum to make sure loopback 1490 * copy is checksumed. 1491 * 1492 * Note that ire_ill is the upper in the case of IPMP. 1493 */ 1494 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); 1495 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && 1496 !(ixaflags & IXAF_DONTROUTE)) { 1497 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1498 } else if (ixaflags & IXAF_MULTICAST_LOOP) { 1499 /* 1500 * If this zone or any other zone has members then loopback 1501 * a copy. 1502 */ 1503 if (ill_hasmembers_v4(ill, ipha->ipha_dst)) 1504 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1505 } else if (ipst->ips_netstack->netstack_numzones > 1) { 1506 /* 1507 * This zone should not have a copy. But there are some other 1508 * zones which might have members. 1509 */ 1510 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1511 ixa->ixa_zoneid)) { 1512 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; 1513 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; 1514 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1515 } 1516 } 1517 1518 /* 1519 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, 1520 * force the ttl to the IP_MULTICAST_TTL value 1521 */ 1522 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { 1523 ipha->ipha_ttl = ixa->ixa_multicast_ttl; 1524 } 1525 1526 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1527 } 1528 1529 /* 1530 * ire_sendfn for IREs with RTF_MULTIRT 1531 */ 1532 int 1533 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1534 ip_xmit_attr_t *ixa, uint32_t *identp) 1535 { 1536 ipha_t *ipha = (ipha_t *)iph_arg; 1537 1538 multirt_check_v4(ire, ipha, ixa); 1539 1540 if (ire->ire_type & IRE_MULTICAST) 1541 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); 1542 else if (ire->ire_type & IRE_BROADCAST) 1543 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); 1544 else 1545 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1546 } 1547 1548 /* 1549 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE 1550 */ 1551 int 1552 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1553 ip_xmit_attr_t *ixa, uint32_t *identp) 1554 { 1555 ip_stack_t *ipst = ixa->ixa_ipst; 1556 ipha_t *ipha = (ipha_t *)iph_arg; 1557 ill_t *ill; 1558 ip_recv_attr_t iras; 1559 boolean_t dummy; 1560 1561 /* We assign an IP ident for nice errors */ 1562 ipha->ipha_ident = atomic_inc_32_nv(identp); 1563 1564 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 1565 1566 if (ire->ire_type & IRE_NOROUTE) { 1567 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ 1568 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, 1569 RTA_DST, ipst); 1570 } 1571 1572 if (ire->ire_flags & RTF_BLACKHOLE) { 1573 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); 1574 freemsg(mp); 1575 /* No error even for local senders - silent blackhole */ 1576 return (0); 1577 } 1578 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); 1579 1580 /* 1581 * We need an ill_t for the ip_recv_attr_t even though this packet 1582 * was never received and icmp_unreachable doesn't currently use 1583 * ira_ill. 1584 */ 1585 ill = ill_lookup_on_name("lo0", B_FALSE, 1586 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); 1587 if (ill == NULL) { 1588 freemsg(mp); 1589 return (EHOSTUNREACH); 1590 } 1591 1592 bzero(&iras, sizeof (iras)); 1593 /* Map ixa to ira including IPsec policies */ 1594 ipsec_out_to_in(ixa, ill, &iras); 1595 1596 if (ip_source_routed(ipha, ipst)) { 1597 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); 1598 } else { 1599 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 1600 } 1601 /* We moved any IPsec refs from ixa to iras */ 1602 ira_cleanup(&iras, B_FALSE); 1603 ill_refrele(ill); 1604 return (EHOSTUNREACH); 1605 } 1606 1607 /* 1608 * Calculate a checksum ignoring any hardware capabilities 1609 * 1610 * Returns B_FALSE if the packet was too short for the checksum. Caller 1611 * should free and do stats. 1612 */ 1613 static boolean_t 1614 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa) 1615 { 1616 ip_stack_t *ipst = ixa->ixa_ipst; 1617 uint_t pktlen = ixa->ixa_pktlen; 1618 uint16_t *cksump; 1619 uint32_t cksum; 1620 uint8_t protocol = ixa->ixa_protocol; 1621 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1622 ipaddr_t dst = ipha->ipha_dst; 1623 ipaddr_t src = ipha->ipha_src; 1624 1625 /* Just in case it contained garbage */ 1626 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1627 1628 /* 1629 * Calculate ULP checksum 1630 */ 1631 if (protocol == IPPROTO_TCP) { 1632 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1633 cksum = IP_TCP_CSUM_COMP; 1634 } else if (protocol == IPPROTO_UDP) { 1635 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1636 cksum = IP_UDP_CSUM_COMP; 1637 } else if (protocol == IPPROTO_SCTP) { 1638 sctp_hdr_t *sctph; 1639 1640 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1641 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1642 /* 1643 * Zero out the checksum field to ensure proper 1644 * checksum calculation. 1645 */ 1646 sctph->sh_chksum = 0; 1647 #ifdef DEBUG 1648 if (!skip_sctp_cksum) 1649 #endif 1650 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1651 goto ip_hdr_cksum; 1652 } else { 1653 goto ip_hdr_cksum; 1654 } 1655 1656 /* ULP puts the checksum field is in the first mblk */ 1657 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1658 1659 /* 1660 * We accumulate the pseudo header checksum in cksum. 1661 * This is pretty hairy code, so watch close. One 1662 * thing to keep in mind is that UDP and TCP have 1663 * stored their respective datagram lengths in their 1664 * checksum fields. This lines things up real nice. 1665 */ 1666 cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 1667 1668 cksum = IP_CSUM(mp, ip_hdr_length, cksum); 1669 /* 1670 * For UDP/IPv4 a zero means that the packets wasn't checksummed. 1671 * Change to 0xffff 1672 */ 1673 if (protocol == IPPROTO_UDP && cksum == 0) 1674 *cksump = ~cksum; 1675 else 1676 *cksump = cksum; 1677 1678 IP_STAT(ipst, ip_out_sw_cksum); 1679 IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen); 1680 1681 ip_hdr_cksum: 1682 /* Calculate IPv4 header checksum */ 1683 ipha->ipha_hdr_checksum = 0; 1684 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1685 return (B_TRUE); 1686 } 1687 1688 /* 1689 * Calculate the ULP checksum - try to use hardware. 1690 * In the case of MULTIRT, broadcast or multicast the 1691 * IXAF_NO_HW_CKSUM is set in which case we use software. 1692 * 1693 * If the hardware supports IP header checksum offload; then clear the 1694 * contents of IP header checksum field as expected by NIC. 1695 * Do this only if we offloaded either full or partial sum. 1696 * 1697 * Returns B_FALSE if the packet was too short for the checksum. Caller 1698 * should free and do stats. 1699 */ 1700 static boolean_t 1701 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, 1702 ip_xmit_attr_t *ixa, ill_t *ill) 1703 { 1704 uint_t pktlen = ixa->ixa_pktlen; 1705 uint16_t *cksump; 1706 uint16_t hck_flags; 1707 uint32_t cksum; 1708 uint8_t protocol = ixa->ixa_protocol; 1709 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1710 1711 if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || 1712 !dohwcksum) { 1713 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1714 } 1715 1716 /* 1717 * Calculate ULP checksum. Note that we don't use cksump and cksum 1718 * if the ill has FULL support. 1719 */ 1720 if (protocol == IPPROTO_TCP) { 1721 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1722 cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ 1723 } else if (protocol == IPPROTO_UDP) { 1724 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1725 cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ 1726 } else if (protocol == IPPROTO_SCTP) { 1727 sctp_hdr_t *sctph; 1728 1729 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1730 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1731 /* 1732 * Zero out the checksum field to ensure proper 1733 * checksum calculation. 1734 */ 1735 sctph->sh_chksum = 0; 1736 #ifdef DEBUG 1737 if (!skip_sctp_cksum) 1738 #endif 1739 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1740 goto ip_hdr_cksum; 1741 } else { 1742 ip_hdr_cksum: 1743 /* Calculate IPv4 header checksum */ 1744 ipha->ipha_hdr_checksum = 0; 1745 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1746 return (B_TRUE); 1747 } 1748 1749 /* ULP puts the checksum field is in the first mblk */ 1750 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1751 1752 /* 1753 * Underlying interface supports hardware checksum offload for 1754 * the payload; leave the payload checksum for the hardware to 1755 * calculate. N.B: We only need to set up checksum info on the 1756 * first mblk. 1757 */ 1758 hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; 1759 1760 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1761 if (hck_flags & HCKSUM_INET_FULL_V4) { 1762 /* 1763 * Hardware calculates pseudo-header, header and the 1764 * payload checksums, so clear the checksum field in 1765 * the protocol header. 1766 */ 1767 *cksump = 0; 1768 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; 1769 1770 ipha->ipha_hdr_checksum = 0; 1771 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1772 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1773 } else { 1774 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1775 } 1776 return (B_TRUE); 1777 } 1778 if ((hck_flags) & HCKSUM_INET_PARTIAL) { 1779 ipaddr_t dst = ipha->ipha_dst; 1780 ipaddr_t src = ipha->ipha_src; 1781 /* 1782 * Partial checksum offload has been enabled. Fill 1783 * the checksum field in the protocol header with the 1784 * pseudo-header checksum value. 1785 * 1786 * We accumulate the pseudo header checksum in cksum. 1787 * This is pretty hairy code, so watch close. One 1788 * thing to keep in mind is that UDP and TCP have 1789 * stored their respective datagram lengths in their 1790 * checksum fields. This lines things up real nice. 1791 */ 1792 cksum += (dst >> 16) + (dst & 0xFFFF) + 1793 (src >> 16) + (src & 0xFFFF); 1794 cksum += *(cksump); 1795 cksum = (cksum & 0xFFFF) + (cksum >> 16); 1796 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); 1797 1798 /* 1799 * Offsets are relative to beginning of IP header. 1800 */ 1801 DB_CKSUMSTART(mp) = ip_hdr_length; 1802 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha; 1803 DB_CKSUMEND(mp) = pktlen; 1804 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; 1805 1806 ipha->ipha_hdr_checksum = 0; 1807 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1808 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1809 } else { 1810 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1811 } 1812 return (B_TRUE); 1813 } 1814 /* Hardware capabilities include neither full nor partial IPv4 */ 1815 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1816 } 1817 1818 /* 1819 * ire_sendfn for offlink and onlink destinations. 1820 * Also called from the multicast, broadcast, multirt send functions. 1821 * 1822 * Assumes that the caller has a hold on the ire. 1823 * 1824 * This function doesn't care if the IRE just became condemned since that 1825 * can happen at any time. 1826 */ 1827 /* ARGSUSED */ 1828 int 1829 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1830 ip_xmit_attr_t *ixa, uint32_t *identp) 1831 { 1832 ip_stack_t *ipst = ixa->ixa_ipst; 1833 ipha_t *ipha = (ipha_t *)iph_arg; 1834 iaflags_t ixaflags = ixa->ixa_flags; 1835 ill_t *ill; 1836 1837 ASSERT(ixa->ixa_nce != NULL); 1838 ill = ixa->ixa_nce->nce_ill; 1839 1840 if (ixaflags & IXAF_DONTROUTE) 1841 ipha->ipha_ttl = 1; 1842 1843 /* 1844 * Assign an ident value for this packet. There could be other 1845 * threads targeting the same destination, so we have to arrange 1846 * for a atomic increment. Note that we use a 32-bit atomic add 1847 * because it has better performance than its 16-bit sibling. 1848 * 1849 * Normally ixa_extra_ident is 0, but in the case of LSO it will 1850 * be the number of TCP segments that the driver/hardware will 1851 * extraly construct. 1852 * 1853 * If running in cluster mode and if the source address 1854 * belongs to a replicated service then vector through 1855 * cl_inet_ipident vector to allocate ip identifier 1856 * NOTE: This is a contract private interface with the 1857 * clustering group. 1858 */ 1859 if (cl_inet_ipident != NULL) { 1860 ipaddr_t src = ipha->ipha_src; 1861 ipaddr_t dst = ipha->ipha_dst; 1862 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; 1863 1864 ASSERT(cl_inet_isclusterwide != NULL); 1865 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, 1866 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { 1867 /* 1868 * Note: not correct with LSO since we can't allocate 1869 * ixa_extra_ident+1 consecutive values. 1870 */ 1871 ipha->ipha_ident = (*cl_inet_ipident)(stack_id, 1872 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, 1873 (uint8_t *)(uintptr_t)dst, NULL); 1874 } else { 1875 ipha->ipha_ident = atomic_add_32_nv(identp, 1876 ixa->ixa_extra_ident + 1); 1877 } 1878 } else { 1879 ipha->ipha_ident = atomic_add_32_nv(identp, 1880 ixa->ixa_extra_ident + 1); 1881 } 1882 #ifndef _BIG_ENDIAN 1883 ipha->ipha_ident = htons(ipha->ipha_ident); 1884 #endif 1885 1886 /* 1887 * This might set b_band, thus the IPsec and fragmentation 1888 * code in IP ensures that b_band is updated in the first mblk. 1889 */ 1890 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 1891 /* ip_process translates an IS_UNDER_IPMP */ 1892 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); 1893 if (mp == NULL) { 1894 /* ip_drop_packet and MIB done */ 1895 return (0); /* Might just be delayed */ 1896 } 1897 } 1898 1899 /* 1900 * Verify any IPv4 options. 1901 * 1902 * The presense of IP options also forces the network stack to 1903 * calculate the checksum in software. This is because: 1904 * 1905 * Wrap around: certain partial-checksum NICs (eri, ce) limit 1906 * the size of "start offset" width to 6-bit. This effectively 1907 * sets the largest value of the offset to 64-bytes, starting 1908 * from the MAC header. When the cumulative MAC and IP headers 1909 * exceed such limit, the offset will wrap around. This causes 1910 * the checksum to be calculated at the wrong place. 1911 * 1912 * IPv4 source routing: none of the full-checksum capable NICs 1913 * is capable of correctly handling the IPv4 source-routing 1914 * option for purposes of calculating the pseudo-header; the 1915 * actual destination is different from the destination in the 1916 * header which is that of the next-hop. (This case may not be 1917 * true for NICs which can parse IPv6 extension headers, but 1918 * we choose to simplify the implementation by not offloading 1919 * checksum when they are present.) 1920 */ 1921 if (!IS_SIMPLE_IPH(ipha)) { 1922 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM; 1923 /* An IS_UNDER_IPMP ill is ok here */ 1924 if (ip_output_options(mp, ipha, ixa, ill)) { 1925 /* Packet has been consumed and ICMP error sent */ 1926 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1927 return (EINVAL); 1928 } 1929 } 1930 1931 /* 1932 * To handle IPsec/iptun's labeling needs we need to tag packets 1933 * while we still have ixa_tsl 1934 */ 1935 if (is_system_labeled() && ixa->ixa_tsl != NULL && 1936 (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || 1937 ill->ill_mactype == DL_IPV6)) { 1938 cred_t *newcr; 1939 1940 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, 1941 KM_NOSLEEP); 1942 if (newcr == NULL) { 1943 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1944 ip_drop_output("ipIfStatsOutDiscards - newcr", 1945 mp, ill); 1946 freemsg(mp); 1947 return (ENOBUFS); 1948 } 1949 mblk_setcred(mp, newcr, NOPID); 1950 crfree(newcr); /* mblk_setcred did its own crhold */ 1951 } 1952 1953 if (ixa->ixa_pktlen > ixa->ixa_fragsize || 1954 (ixaflags & IXAF_IPSEC_SECURE)) { 1955 uint32_t pktlen; 1956 1957 pktlen = ixa->ixa_pktlen; 1958 if (ixaflags & IXAF_IPSEC_SECURE) 1959 pktlen += ipsec_out_extra_length(ixa); 1960 1961 if (pktlen > IP_MAXPACKET) 1962 return (EMSGSIZE); 1963 1964 if (ixaflags & IXAF_SET_ULP_CKSUM) { 1965 /* 1966 * Compute ULP checksum and IP header checksum 1967 * using software 1968 */ 1969 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) { 1970 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1971 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 1972 freemsg(mp); 1973 return (EINVAL); 1974 } 1975 } else { 1976 /* Calculate IPv4 header checksum */ 1977 ipha->ipha_hdr_checksum = 0; 1978 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1979 } 1980 1981 /* 1982 * If this packet would generate a icmp_frag_needed 1983 * message, we need to handle it before we do the IPsec 1984 * processing. Otherwise, we need to strip the IPsec 1985 * headers before we send up the message to the ULPs 1986 * which becomes messy and difficult. 1987 * 1988 * We check using IXAF_DONTFRAG. The DF bit in the header 1989 * is not inspected - it will be copied to any generated 1990 * fragments. 1991 */ 1992 if ((pktlen > ixa->ixa_fragsize) && 1993 (ixaflags & IXAF_DONTFRAG)) { 1994 /* Generate ICMP and return error */ 1995 ip_recv_attr_t iras; 1996 1997 DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen, 1998 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 1999 uint_t, ixa->ixa_pmtu); 2000 2001 bzero(&iras, sizeof (iras)); 2002 /* Map ixa to ira including IPsec policies */ 2003 ipsec_out_to_in(ixa, ill, &iras); 2004 2005 ip_drop_output("ICMP_FRAG_NEEDED", mp, ill); 2006 icmp_frag_needed(mp, ixa->ixa_fragsize, &iras); 2007 /* We moved any IPsec refs from ixa to iras */ 2008 ira_cleanup(&iras, B_FALSE); 2009 return (EMSGSIZE); 2010 } 2011 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen, 2012 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2013 uint_t, ixa->ixa_pmtu); 2014 2015 if (ixaflags & IXAF_IPSEC_SECURE) { 2016 /* 2017 * Pass in sufficient information so that 2018 * IPsec can determine whether to fragment, and 2019 * which function to call after fragmentation. 2020 */ 2021 return (ipsec_out_process(mp, ixa)); 2022 } 2023 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags, 2024 ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, 2025 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, 2026 ixa->ixa_postfragfn, &ixa->ixa_cookie)); 2027 } 2028 if (ixaflags & IXAF_SET_ULP_CKSUM) { 2029 /* Compute ULP checksum and IP header checksum */ 2030 /* An IS_UNDER_IPMP ill is ok here */ 2031 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) { 2032 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2033 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2034 freemsg(mp); 2035 return (EINVAL); 2036 } 2037 } else { 2038 /* Calculate IPv4 header checksum */ 2039 ipha->ipha_hdr_checksum = 0; 2040 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2041 } 2042 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, 2043 ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, 2044 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); 2045 } 2046 2047 /* 2048 * Send mp into ip_input 2049 * Common for IPv4 and IPv6 2050 */ 2051 void 2052 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2053 uint_t pkt_len, zoneid_t nolzid) 2054 { 2055 rtc_t rtc; 2056 ill_t *ill = nce->nce_ill; 2057 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 2058 ncec_t *ncec; 2059 2060 ncec = nce->nce_common; 2061 iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM | 2062 IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; 2063 if (ncec->ncec_flags & NCE_F_BCAST) 2064 iras.ira_flags |= IRAF_L2DST_BROADCAST; 2065 else if (ncec->ncec_flags & NCE_F_MCAST) 2066 iras.ira_flags |= IRAF_L2DST_MULTICAST; 2067 2068 iras.ira_free_flags = 0; 2069 iras.ira_cred = NULL; 2070 iras.ira_cpid = NOPID; 2071 iras.ira_tsl = NULL; 2072 iras.ira_zoneid = ALL_ZONES; 2073 iras.ira_pktlen = pkt_len; 2074 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen); 2075 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 2076 2077 if (ixaflags & IXAF_IS_IPV4) 2078 iras.ira_flags |= IRAF_IS_IPV4; 2079 2080 iras.ira_ill = iras.ira_rill = ill; 2081 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2082 iras.ira_rifindex = iras.ira_ruifindex; 2083 iras.ira_mhip = NULL; 2084 2085 iras.ira_flags |= ixaflags & IAF_MASK; 2086 iras.ira_no_loop_zoneid = nolzid; 2087 2088 /* Broadcast and multicast doesn't care about the squeue */ 2089 iras.ira_sqp = NULL; 2090 2091 rtc.rtc_ire = NULL; 2092 if (ixaflags & IXAF_IS_IPV4) { 2093 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2094 2095 rtc.rtc_ipaddr = INADDR_ANY; 2096 2097 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 2098 if (rtc.rtc_ire != NULL) { 2099 ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 2100 ire_refrele(rtc.rtc_ire); 2101 } 2102 } else { 2103 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2104 2105 rtc.rtc_ip6addr = ipv6_all_zeros; 2106 2107 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); 2108 if (rtc.rtc_ire != NULL) { 2109 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); 2110 ire_refrele(rtc.rtc_ire); 2111 } 2112 } 2113 /* Any references to clean up? No hold on ira */ 2114 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 2115 ira_cleanup(&iras, B_FALSE); 2116 } 2117 2118 /* 2119 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which 2120 * looks at the IXAF_LOOPBACK_COPY flag. 2121 * Common for IPv4 and IPv6. 2122 * 2123 * If the loopback copy fails (due to no memory) but we send the packet out 2124 * on the wire we return no failure. Only in the case we supress the wire 2125 * sending do we take the loopback failure into account. 2126 * 2127 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. 2128 * Those operations are performed on this packet in ip_xmit() and it would 2129 * be odd to do it twice for the same packet. 2130 */ 2131 int 2132 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2133 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2134 uintptr_t *ixacookie) 2135 { 2136 ill_t *ill = nce->nce_ill; 2137 int error = 0; 2138 2139 /* 2140 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2141 * had looped it back 2142 */ 2143 if (ixaflags & IXAF_LOOPBACK_COPY) { 2144 mblk_t *mp1; 2145 2146 mp1 = copymsg(mp); 2147 if (mp1 == NULL) { 2148 /* Failed to deliver the loopback copy. */ 2149 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2150 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2151 error = ENOBUFS; 2152 } else { 2153 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2154 nolzid); 2155 } 2156 } 2157 2158 /* 2159 * If TTL = 0 then only do the loopback to this host i.e. we are 2160 * done. We are also done if this was the 2161 * loopback interface since it is sufficient 2162 * to loopback one copy of a multicast packet. 2163 */ 2164 if (ixaflags & IXAF_IS_IPV4) { 2165 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2166 2167 if (ipha->ipha_ttl == 0) { 2168 ip_drop_output("multicast ipha_ttl not sent to wire", 2169 mp, ill); 2170 freemsg(mp); 2171 return (error); 2172 } 2173 } else { 2174 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2175 2176 if (ip6h->ip6_hops == 0) { 2177 ip_drop_output("multicast ipha_ttl not sent to wire", 2178 mp, ill); 2179 freemsg(mp); 2180 return (error); 2181 } 2182 } 2183 if (nce->nce_ill->ill_wq == NULL) { 2184 /* Loopback interface */ 2185 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2186 freemsg(mp); 2187 return (error); 2188 } 2189 2190 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2191 ixacookie)); 2192 } 2193 2194 /* 2195 * Post fragmentation function for RTF_MULTIRT routes. 2196 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2197 * checks IXAF_LOOPBACK_COPY. 2198 * 2199 * If no packet is sent due to failures then we return an errno, but if at 2200 * least one succeeded we return zero. 2201 */ 2202 int 2203 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2204 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2205 uintptr_t *ixacookie) 2206 { 2207 irb_t *irb; 2208 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2209 ire_t *ire; 2210 ire_t *ire1; 2211 mblk_t *mp1; 2212 nce_t *nce1; 2213 ill_t *ill = nce->nce_ill; 2214 ill_t *ill1; 2215 ip_stack_t *ipst = ill->ill_ipst; 2216 int error = 0; 2217 int num_sent = 0; 2218 int err; 2219 uint_t ire_type; 2220 ipaddr_t nexthop; 2221 2222 ASSERT(ixaflags & IXAF_IS_IPV4); 2223 2224 /* Check for IXAF_LOOPBACK_COPY */ 2225 if (ixaflags & IXAF_LOOPBACK_COPY) { 2226 mblk_t *mp1; 2227 2228 mp1 = copymsg(mp); 2229 if (mp1 == NULL) { 2230 /* Failed to deliver the loopback copy. */ 2231 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2232 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2233 error = ENOBUFS; 2234 } else { 2235 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2236 nolzid); 2237 } 2238 } 2239 2240 /* 2241 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send 2242 * a copy to each one. 2243 * Use the nce (nexthop) and ipha_dst to find the ire. 2244 * 2245 * MULTIRT is not designed to work with shared-IP zones thus we don't 2246 * need to pass a zoneid or a label to the IRE lookup. 2247 */ 2248 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { 2249 /* Broadcast and multicast case */ 2250 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, 2251 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 2252 } else { 2253 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); 2254 2255 /* Unicast case */ 2256 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, 2257 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); 2258 } 2259 2260 if (ire == NULL || 2261 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2262 !(ire->ire_flags & RTF_MULTIRT)) { 2263 /* Drop */ 2264 ip_drop_output("ip_postfrag_multirt didn't find route", 2265 mp, nce->nce_ill); 2266 if (ire != NULL) 2267 ire_refrele(ire); 2268 return (ENETUNREACH); 2269 } 2270 2271 irb = ire->ire_bucket; 2272 irb_refhold(irb); 2273 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 2274 /* 2275 * For broadcast we can have a mixture of IRE_BROADCAST and 2276 * IRE_HOST due to the manually added IRE_HOSTs that are used 2277 * to trigger the creation of the special CGTP broadcast routes. 2278 * Thus we have to skip if ire_type doesn't match the original. 2279 */ 2280 if (IRE_IS_CONDEMNED(ire1) || 2281 !(ire1->ire_flags & RTF_MULTIRT) || 2282 ire1->ire_type != ire->ire_type) 2283 continue; 2284 2285 /* Do the ire argument one after the loop */ 2286 if (ire1 == ire) 2287 continue; 2288 2289 ill1 = ire_nexthop_ill(ire1); 2290 if (ill1 == NULL) { 2291 /* 2292 * This ire might not have been picked by 2293 * ire_route_recursive, in which case ire_dep might 2294 * not have been setup yet. 2295 * We kick ire_route_recursive to try to resolve 2296 * starting at ire1. 2297 */ 2298 ire_t *ire2; 2299 uint_t match_flags = MATCH_IRE_DSTONLY; 2300 2301 if (ire1->ire_ill != NULL) 2302 match_flags |= MATCH_IRE_ILL; 2303 ire2 = ire_route_recursive_impl_v4(ire1, 2304 ire1->ire_addr, ire1->ire_type, ire1->ire_ill, 2305 ire1->ire_zoneid, NULL, match_flags, 2306 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2307 if (ire2 != NULL) 2308 ire_refrele(ire2); 2309 ill1 = ire_nexthop_ill(ire1); 2310 } 2311 2312 if (ill1 == NULL) { 2313 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2314 ip_drop_output("ipIfStatsOutDiscards - no ill", 2315 mp, ill); 2316 error = ENETUNREACH; 2317 continue; 2318 } 2319 2320 /* Pick the addr and type to use for arp_nce_init */ 2321 if (nce->nce_common->ncec_flags & NCE_F_BCAST) { 2322 ire_type = IRE_BROADCAST; 2323 nexthop = ire1->ire_gateway_addr; 2324 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { 2325 ire_type = IRE_MULTICAST; 2326 nexthop = ipha->ipha_dst; 2327 } else { 2328 ire_type = ire1->ire_type; /* Doesn't matter */ 2329 nexthop = ire1->ire_gateway_addr; 2330 } 2331 2332 /* If IPMP meta or under, then we just drop */ 2333 if (ill1->ill_grp != NULL) { 2334 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2335 ip_drop_output("ipIfStatsOutDiscards - IPMP", 2336 mp, ill1); 2337 ill_refrele(ill1); 2338 error = ENETUNREACH; 2339 continue; 2340 } 2341 2342 nce1 = arp_nce_init(ill1, nexthop, ire_type); 2343 if (nce1 == NULL) { 2344 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2345 ip_drop_output("ipIfStatsOutDiscards - no nce", 2346 mp, ill1); 2347 ill_refrele(ill1); 2348 error = ENETUNREACH; 2349 continue; 2350 } 2351 mp1 = copymsg(mp); 2352 if (mp1 == NULL) { 2353 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2354 ip_drop_output("ipIfStatsOutDiscards", mp, ill1); 2355 nce_refrele(nce1); 2356 ill_refrele(ill1); 2357 error = ENOBUFS; 2358 continue; 2359 } 2360 /* Preserve HW checksum for this copy */ 2361 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 2362 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 2363 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 2364 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 2365 DB_LSOMSS(mp1) = DB_LSOMSS(mp); 2366 2367 ire1->ire_ob_pkt_count++; 2368 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, 2369 0, ixacookie); 2370 if (err == 0) 2371 num_sent++; 2372 else 2373 error = err; 2374 nce_refrele(nce1); 2375 ill_refrele(ill1); 2376 } 2377 irb_refrele(irb); 2378 ire_refrele(ire); 2379 /* Finally, the main one */ 2380 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2381 ixacookie); 2382 if (err == 0) 2383 num_sent++; 2384 else 2385 error = err; 2386 if (num_sent > 0) 2387 return (0); 2388 else 2389 return (error); 2390 } 2391 2392 /* 2393 * Verify local connectivity. This check is called by ULP fusion code. 2394 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if 2395 * the interface is brought down and back up. So we simply fail the local 2396 * process. The caller, TCP Fusion, should unfuse the connection. 2397 */ 2398 boolean_t 2399 ip_output_verify_local(ip_xmit_attr_t *ixa) 2400 { 2401 ire_t *ire = ixa->ixa_ire; 2402 2403 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) 2404 return (B_FALSE); 2405 2406 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); 2407 } 2408 2409 /* 2410 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. 2411 * 2412 * The caller must call ip_output_verify_local() first. This function handles 2413 * IPobs, FW_HOOKS, and/or IPsec cases sequentially. 2414 */ 2415 mblk_t * 2416 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out, 2417 boolean_t hooks_in, conn_t *peer_connp) 2418 { 2419 ill_t *ill = ixa->ixa_ire->ire_ill; 2420 ipha_t *ipha = NULL; 2421 ip6_t *ip6h = NULL; 2422 ip_stack_t *ipst = ixa->ixa_ipst; 2423 iaflags_t ixaflags = ixa->ixa_flags; 2424 ip_recv_attr_t iras; 2425 int error; 2426 2427 ASSERT(mp != NULL); 2428 2429 if (ixaflags & IXAF_IS_IPV4) { 2430 ipha = (ipha_t *)mp->b_rptr; 2431 2432 /* 2433 * If a callback is enabled then we need to know the 2434 * source and destination zoneids for the packet. We already 2435 * have those handy. 2436 */ 2437 if (ipst->ips_ip4_observe.he_interested) { 2438 zoneid_t szone, dzone; 2439 zoneid_t stackzoneid; 2440 2441 stackzoneid = netstackid_to_zoneid( 2442 ipst->ips_netstack->netstack_stackid); 2443 2444 if (stackzoneid == GLOBAL_ZONEID) { 2445 /* Shared-IP zone */ 2446 dzone = ixa->ixa_ire->ire_zoneid; 2447 szone = ixa->ixa_zoneid; 2448 } else { 2449 szone = dzone = stackzoneid; 2450 } 2451 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2452 ipst); 2453 } 2454 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2455 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2456 NULL, int, 1); 2457 2458 /* FW_HOOKS: LOOPBACK_OUT */ 2459 if (hooks_out) { 2460 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 2461 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 2462 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 2463 ipst->ips_ipv4firewall_loopback_out, 2464 NULL, ill, ipha, mp, mp, 0, ipst, error); 2465 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 2466 } 2467 if (mp == NULL) 2468 return (NULL); 2469 2470 /* FW_HOOKS: LOOPBACK_IN */ 2471 if (hooks_in) { 2472 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 2473 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 2474 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 2475 ipst->ips_ipv4firewall_loopback_in, 2476 ill, NULL, ipha, mp, mp, 0, ipst, error); 2477 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 2478 } 2479 if (mp == NULL) 2480 return (NULL); 2481 2482 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2483 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2484 NULL, int, 1); 2485 2486 /* Inbound IPsec polocies */ 2487 if (peer_connp != NULL) { 2488 /* Map ixa to ira including IPsec policies. */ 2489 ipsec_out_to_in(ixa, ill, &iras); 2490 mp = ipsec_check_inbound_policy(mp, peer_connp, ipha, 2491 NULL, &iras); 2492 } 2493 } else { 2494 ip6h = (ip6_t *)mp->b_rptr; 2495 2496 /* 2497 * If a callback is enabled then we need to know the 2498 * source and destination zoneids for the packet. We already 2499 * have those handy. 2500 */ 2501 if (ipst->ips_ip6_observe.he_interested) { 2502 zoneid_t szone, dzone; 2503 zoneid_t stackzoneid; 2504 2505 stackzoneid = netstackid_to_zoneid( 2506 ipst->ips_netstack->netstack_stackid); 2507 2508 if (stackzoneid == GLOBAL_ZONEID) { 2509 /* Shared-IP zone */ 2510 dzone = ixa->ixa_ire->ire_zoneid; 2511 szone = ixa->ixa_zoneid; 2512 } else { 2513 szone = dzone = stackzoneid; 2514 } 2515 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2516 ipst); 2517 } 2518 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2519 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2520 ip6h, int, 1); 2521 2522 /* FW_HOOKS: LOOPBACK_OUT */ 2523 if (hooks_out) { 2524 DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, 2525 ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); 2526 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 2527 ipst->ips_ipv6firewall_loopback_out, 2528 NULL, ill, ip6h, mp, mp, 0, ipst, error); 2529 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); 2530 } 2531 if (mp == NULL) 2532 return (NULL); 2533 2534 /* FW_HOOKS: LOOPBACK_IN */ 2535 if (hooks_in) { 2536 DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, 2537 ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); 2538 FW_HOOKS6(ipst->ips_ip6_loopback_in_event, 2539 ipst->ips_ipv6firewall_loopback_in, 2540 ill, NULL, ip6h, mp, mp, 0, ipst, error); 2541 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); 2542 } 2543 if (mp == NULL) 2544 return (NULL); 2545 2546 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2547 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2548 ip6h, int, 1); 2549 2550 /* Inbound IPsec polocies */ 2551 if (peer_connp != NULL) { 2552 /* Map ixa to ira including IPsec policies. */ 2553 ipsec_out_to_in(ixa, ill, &iras); 2554 mp = ipsec_check_inbound_policy(mp, peer_connp, NULL, 2555 ip6h, &iras); 2556 } 2557 } 2558 2559 if (mp == NULL) { 2560 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2561 ip_drop_input("ipIfStatsInDiscards", NULL, ill); 2562 } 2563 2564 return (mp); 2565 } 2566