1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. 
*/ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsubr.h> 31 #include <sys/dlpi.h> 32 #include <sys/strsun.h> 33 #include <sys/zone.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/debug.h> 38 #include <sys/atomic.h> 39 40 #include <sys/systm.h> 41 #include <sys/param.h> 42 #include <sys/kmem.h> 43 #include <sys/sdt.h> 44 #include <sys/socket.h> 45 #include <sys/mac.h> 46 #include <net/if.h> 47 #include <net/if_arp.h> 48 #include <net/route.h> 49 #include <sys/sockio.h> 50 #include <netinet/in.h> 51 #include <net/if_dl.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/nd.h> 57 #include <inet/arp.h> 58 #include <inet/snmpcom.h> 59 #include <inet/kstatcom.h> 60 61 #include <netinet/igmp_var.h> 62 #include <netinet/ip6.h> 63 #include <netinet/icmp6.h> 64 #include <netinet/sctp.h> 65 66 #include <inet/ip.h> 67 #include <inet/ip_impl.h> 68 #include <inet/ip6.h> 69 #include <inet/ip6_asp.h> 70 #include <inet/tcp.h> 71 #include <inet/ip_multi.h> 72 #include <inet/ip_if.h> 73 #include <inet/ip_ire.h> 74 #include <inet/ip_ftable.h> 75 #include <inet/ip_rts.h> 76 #include <inet/optcom.h> 77 #include <inet/ip_ndp.h> 78 #include <inet/ip_listutils.h> 79 #include <netinet/igmp.h> 80 #include <netinet/ip_mroute.h> 81 #include <inet/ipp_common.h> 82 83 #include <net/pfkeyv2.h> 84 #include <inet/sadb.h> 85 #include <inet/ipsec_impl.h> 86 #include <inet/ipdrop.h> 87 #include <inet/ip_netinfo.h> 88 89 #include <sys/pattr.h> 90 #include <inet/ipclassifier.h> 91 #include <inet/sctp_ip.h> 92 #include <inet/sctp/sctp_impl.h> 93 #include <inet/udp_impl.h> 94 #include <sys/sunddi.h> 95 96 #include <sys/tsol/label.h> 97 #include <sys/tsol/tnet.h> 98 99 #ifdef DEBUG 100 extern boolean_t skip_sctp_cksum; 101 #endif 102 103 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 104 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 105 static boolean_t 
ip_verify_lso(ill_t *, ip_xmit_attr_t *); 106 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 107 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 108 109 /* 110 * There are two types of output functions for IP used for different 111 * purposes: 112 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 113 * is no context in the form of a conn_t. However, there is a 114 * ip_xmit_attr_t that the callers use to influence interface selection 115 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 116 * 117 * - conn_ip_output() is used when sending packets with a conn_t and 118 * ip_set_destination has been called to cache information. In that case 119 * various socket options are recorded in the ip_xmit_attr_t and should 120 * be taken into account. 121 */ 122 123 /* 124 * The caller *must* have called conn_connect() or ip_attr_connect() 125 * before calling conn_ip_output(). The caller needs to redo that each time 126 * the destination IP address or port changes, as well as each time there is 127 * a change to any socket option that would modify how packets are routed out 128 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). 129 * 130 * The ULP caller has to serialize the use of a single ip_xmit_attr_t. 131 * We assert for that here. 
132 */ 133 int 134 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa) 135 { 136 iaflags_t ixaflags = ixa->ixa_flags; 137 ire_t *ire; 138 nce_t *nce; 139 dce_t *dce; 140 ill_t *ill; 141 ip_stack_t *ipst = ixa->ixa_ipst; 142 int error; 143 int64_t now; 144 145 /* We defer ipIfStatsHCOutRequests until an error or we have an ill */ 146 147 ASSERT(ixa->ixa_ire != NULL); 148 /* Note there is no ixa_nce when reject and blackhole routes */ 149 ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */ 150 151 #ifdef DEBUG 152 ASSERT(ixa->ixa_curthread == NULL); 153 ixa->ixa_curthread = curthread; 154 #endif 155 156 /* 157 * Even on labeled systems we can have a NULL ixa_tsl e.g., 158 * for IGMP/MLD traffic. 159 */ 160 161 ire = ixa->ixa_ire; 162 163 /* 164 * If the ULP says the (old) IRE resulted in reachability we 165 * record this before determine whether to use a new IRE. 166 * No locking for performance reasons. 167 */ 168 if (ixaflags & IXAF_REACH_CONF) 169 ire->ire_badcnt = 0; 170 171 /* 172 * Has routing changed since we cached the results of the lookup? 173 * 174 * This check captures all of: 175 * - the cached ire being deleted (by means of the special 176 * IRE_GENERATION_CONDEMNED) 177 * - A potentially better ire being added (ire_generation being 178 * increased) 179 * - A deletion of the nexthop ire that was used when we did the 180 * lookup. 181 * - An addition of a potentially better nexthop ire. 182 * The last two are handled by walking and increasing the generation 183 * number on all dependant IREs in ire_flush_cache(). 184 * 185 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE 186 * since we ensure that each time we set ixa_ire to such an IRE we 187 * make sure the ixa_ire_generation does not match (by using 188 * IRE_GENERATION_VERIFY). 
189 */ 190 if (ire->ire_generation != ixa->ixa_ire_generation) { 191 error = ip_verify_ire(mp, ixa); 192 if (error != 0) { 193 ip_drop_output("ipIfStatsOutDiscards - verify ire", 194 mp, NULL); 195 goto drop; 196 } 197 ire = ixa->ixa_ire; 198 ASSERT(ire != NULL); 199 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 200 #ifdef DEBUG 201 ASSERT(ixa->ixa_curthread == curthread); 202 ixa->ixa_curthread = NULL; 203 #endif 204 ire->ire_ob_pkt_count++; 205 /* ixa_dce might be condemned; use default one */ 206 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, 207 &ipst->ips_dce_default->dce_ident)); 208 } 209 /* 210 * If the ncec changed then ip_verify_ire already set 211 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 212 * so we can recheck the interface mtu. 213 */ 214 215 /* 216 * Note that ire->ire_generation could already have changed. 217 * We catch that next time we send a packet. 218 */ 219 } 220 221 /* 222 * No need to lock access to ixa_nce since the ip_xmit_attr usage 223 * is single threaded. 224 */ 225 ASSERT(ixa->ixa_nce != NULL); 226 nce = ixa->ixa_nce; 227 if (nce->nce_is_condemned) { 228 error = ip_verify_nce(mp, ixa); 229 /* 230 * In case ZEROCOPY capability become not available, we 231 * copy the message and free the original one. We might 232 * be copying more data than needed but it doesn't hurt 233 * since such change rarely happens. 
234 */ 235 switch (error) { 236 case 0: 237 break; 238 case ENOTSUP: { /* ZEROCOPY */ 239 mblk_t *nmp; 240 241 if ((nmp = copymsg(mp)) != NULL) { 242 freemsg(mp); 243 mp = nmp; 244 245 break; 246 } 247 /* FALLTHROUGH */ 248 } 249 default: 250 ip_drop_output("ipIfStatsOutDiscards - verify nce", 251 mp, NULL); 252 goto drop; 253 } 254 ire = ixa->ixa_ire; 255 ASSERT(ire != NULL); 256 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 257 #ifdef DEBUG 258 ASSERT(ixa->ixa_curthread == curthread); 259 ixa->ixa_curthread = NULL; 260 #endif 261 ire->ire_ob_pkt_count++; 262 /* ixa_dce might be condemned; use default one */ 263 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, 264 ixa, &ipst->ips_dce_default->dce_ident)); 265 } 266 ASSERT(ixa->ixa_nce != NULL); 267 nce = ixa->ixa_nce; 268 269 /* 270 * Note that some other event could already have made 271 * the new nce condemned. We catch that next time we 272 * try to send a packet. 273 */ 274 } 275 /* 276 * If there is no per-destination dce_t then we have a reference to 277 * the default dce_t (which merely contains the dce_ipid). 278 * The generation check captures both the introduction of a 279 * per-destination dce_t (e.g., due to ICMP packet too big) and 280 * any change to the per-destination dce (including it becoming 281 * condemned by use of the special DCE_GENERATION_CONDEMNED). 282 */ 283 dce = ixa->ixa_dce; 284 285 /* 286 * To avoid a periodic timer to increase the path MTU we 287 * look at dce_last_change_time each time we send a packet. 288 */ 289 now = ddi_get_lbolt64(); 290 if ((dce->dce_flags & DCEF_PMTU) && 291 (TICK_TO_SEC(now) - dce->dce_last_change_time > 292 ipst->ips_ip_pathmtu_interval)) { 293 /* 294 * Older than 20 minutes. Drop the path MTU information. 295 * Since the path MTU changes as a result of this, twiddle 296 * ixa_dce_generation to make us go through the dce 297 * verification code in conn_ip_output. 
298 */ 299 mutex_enter(&dce->dce_lock); 300 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 301 dce->dce_last_change_time = TICK_TO_SEC(now); 302 mutex_exit(&dce->dce_lock); 303 dce_increment_generation(dce); 304 } 305 306 if (dce->dce_generation != ixa->ixa_dce_generation) { 307 error = ip_verify_dce(mp, ixa); 308 if (error != 0) { 309 ip_drop_output("ipIfStatsOutDiscards - verify dce", 310 mp, NULL); 311 goto drop; 312 } 313 dce = ixa->ixa_dce; 314 315 /* 316 * Note that some other event could already have made the 317 * new dce's generation number change. 318 * We catch that next time we try to send a packet. 319 */ 320 } 321 322 ill = nce->nce_ill; 323 324 /* 325 * An initial ixa_fragsize was set in ip_set_destination 326 * and we update it if any routing changes above. 327 * A change to ill_mtu with ifconfig will increase all dce_generation 328 * so that we will detect that with the generation check. 329 */ 330 331 /* 332 * Caller needs to make sure IXAF_VERIFY_SRC is not set if 333 * conn_unspec_src. 334 */ 335 if ((ixaflags & IXAF_VERIFY_SOURCE) && 336 ixa->ixa_src_generation != ipst->ips_src_generation) { 337 /* Check if the IP source is still assigned to the host. */ 338 uint_t gen; 339 340 if (!ip_verify_src(mp, ixa, &gen)) { 341 /* Don't send a packet with a source that isn't ours */ 342 error = EADDRNOTAVAIL; 343 ip_drop_output("ipIfStatsOutDiscards - invalid src", 344 mp, NULL); 345 goto drop; 346 } 347 /* The source is still valid - update the generation number */ 348 ixa->ixa_src_generation = gen; 349 } 350 351 /* 352 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 353 * can only count the use prior to fragmentation. However the MIB 354 * counters on the ill will be incremented in post fragmentation. 
355 */ 356 ire->ire_ob_pkt_count++; 357 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 358 359 /* 360 * Based on ire_type and ire_flags call one of: 361 * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK 362 * ire_send_multirt_v* - if RTF_MULTIRT 363 * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE 364 * ire_send_multicast_v* - for IRE_MULTICAST 365 * ire_send_broadcast_v4 - for IRE_BROADCAST 366 * ire_send_wire_v* - for the rest. 367 */ 368 #ifdef DEBUG 369 ASSERT(ixa->ixa_curthread == curthread); 370 ixa->ixa_curthread = NULL; 371 #endif 372 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident)); 373 374 drop: 375 if (ixaflags & IXAF_IS_IPV4) { 376 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 377 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 378 } else { 379 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests); 380 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 381 } 382 freemsg(mp); 383 #ifdef DEBUG 384 ASSERT(ixa->ixa_curthread == curthread); 385 ixa->ixa_curthread = NULL; 386 #endif 387 return (error); 388 } 389 390 /* 391 * Handle both IPv4 and IPv6. Sets the generation number 392 * to allow the caller to know when to call us again. 393 * Returns true if the source address in the packet is a valid source. 394 * We handle callers which try to send with a zero address (since we only 395 * get here if UNSPEC_SRC is not set). 396 */ 397 boolean_t 398 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp) 399 { 400 ip_stack_t *ipst = ixa->ixa_ipst; 401 402 /* 403 * Need to grab the generation number before we check to 404 * avoid a race with a change to the set of local addresses. 405 * No lock needed since the thread which updates the set of local 406 * addresses use ipif/ill locks and exit those (hence a store memory 407 * barrier) before doing the atomic increase of ips_src_generation. 
408 */ 409 if (generationp != NULL) 410 *generationp = ipst->ips_src_generation; 411 412 if (ixa->ixa_flags & IXAF_IS_IPV4) { 413 ipha_t *ipha = (ipha_t *)mp->b_rptr; 414 415 if (ipha->ipha_src == INADDR_ANY) 416 return (B_FALSE); 417 418 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 419 ipst, B_FALSE) != IPVL_BAD); 420 } else { 421 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 422 uint_t scopeid; 423 424 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) 425 return (B_FALSE); 426 427 if (ixa->ixa_flags & IXAF_SCOPEID_SET) 428 scopeid = ixa->ixa_scopeid; 429 else 430 scopeid = 0; 431 432 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid, 433 ipst, B_FALSE, scopeid) != IPVL_BAD); 434 } 435 } 436 437 /* 438 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use. 439 */ 440 int 441 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa) 442 { 443 uint_t gen; 444 ire_t *ire; 445 nce_t *nce; 446 int error; 447 boolean_t multirt = B_FALSE; 448 449 /* 450 * Redo ip_select_route. 451 * Need to grab generation number as part of the lookup to 452 * avoid race. 453 */ 454 error = 0; 455 ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt); 456 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 457 if (error != 0) { 458 ire_refrele(ire); 459 return (error); 460 } 461 462 if (ixa->ixa_ire != NULL) 463 ire_refrele_notr(ixa->ixa_ire); 464 #ifdef DEBUG 465 ire_refhold_notr(ire); 466 ire_refrele(ire); 467 #endif 468 ixa->ixa_ire = ire; 469 ixa->ixa_ire_generation = gen; 470 if (multirt) { 471 if (ixa->ixa_flags & IXAF_IS_IPV4) 472 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 473 else 474 ixa->ixa_postfragfn = ip_postfrag_multirt_v6; 475 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 476 } else { 477 ixa->ixa_postfragfn = ire->ire_postfragfn; 478 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 479 } 480 481 /* 482 * Don't look for an nce for reject or blackhole. 
483 * They have ire_generation set to IRE_GENERATION_VERIFY which 484 * makes conn_ip_output avoid references to ixa_nce. 485 */ 486 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 487 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY); 488 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 489 return (0); 490 } 491 492 /* The NCE could now be different */ 493 nce = ire_to_nce_pkt(ire, mp); 494 if (nce == NULL) { 495 /* 496 * Allocation failure. Make sure we redo ire/nce selection 497 * next time we send. 498 */ 499 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 500 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 501 return (ENOBUFS); 502 } 503 if (nce == ixa->ixa_nce) { 504 /* No change */ 505 nce_refrele(nce); 506 return (0); 507 } 508 509 /* 510 * Since the path MTU might change as a result of this 511 * route change, we twiddle ixa_dce_generation to 512 * make conn_ip_output go through the ip_verify_dce code. 513 */ 514 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 515 516 if (ixa->ixa_nce != NULL) 517 nce_refrele(ixa->ixa_nce); 518 ixa->ixa_nce = nce; 519 return (0); 520 } 521 522 /* 523 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use. 524 */ 525 static int 526 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa) 527 { 528 ire_t *ire = ixa->ixa_ire; 529 nce_t *nce; 530 int error = 0; 531 ipha_t *ipha = NULL; 532 ip6_t *ip6h = NULL; 533 534 if (ire->ire_ipversion == IPV4_VERSION) 535 ipha = (ipha_t *)mp->b_rptr; 536 else 537 ip6h = (ip6_t *)mp->b_rptr; 538 539 nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE); 540 if (nce == NULL) { 541 /* Try to find a better ire */ 542 return (ip_verify_ire(mp, ixa)); 543 } 544 545 /* 546 * The hardware offloading capabilities, for example LSO, of the 547 * interface might have changed, so do sanity verification here. 
548 */ 549 if (ixa->ixa_flags & IXAF_VERIFY_LSO) { 550 if (!ip_verify_lso(nce->nce_ill, ixa)) { 551 ASSERT(ixa->ixa_notify != NULL); 552 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 553 IXAN_LSO, 0); 554 error = ENOTSUP; 555 } 556 } 557 558 /* 559 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with 560 * any ZEROCOPY changes. In case ZEROCOPY capability is not available 561 * any more, return error so that conn_ip_output() can take care of 562 * the ZEROCOPY message properly. It's safe to continue send the 563 * message when ZEROCOPY newly become available. 564 */ 565 if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) { 566 if (!ip_verify_zcopy(nce->nce_ill, ixa)) { 567 ASSERT(ixa->ixa_notify != NULL); 568 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 569 IXAN_ZCOPY, 0); 570 if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0) 571 error = ENOTSUP; 572 } 573 } 574 575 /* 576 * Since the path MTU might change as a result of this 577 * change, we twiddle ixa_dce_generation to 578 * make conn_ip_output go through the ip_verify_dce code. 579 */ 580 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 581 582 nce_refrele(ixa->ixa_nce); 583 ixa->ixa_nce = nce; 584 return (error); 585 } 586 587 /* 588 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use. 589 */ 590 static int 591 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa) 592 { 593 dce_t *dce; 594 uint_t gen; 595 uint_t pmtu; 596 597 dce = dce_lookup_pkt(mp, ixa, &gen); 598 ASSERT(dce != NULL); 599 600 dce_refrele_notr(ixa->ixa_dce); 601 #ifdef DEBUG 602 dce_refhold_notr(dce); 603 dce_refrele(dce); 604 #endif 605 ixa->ixa_dce = dce; 606 ixa->ixa_dce_generation = gen; 607 608 /* Extract the (path) mtu from the dce, ncec_ill etc */ 609 pmtu = ip_get_pmtu(ixa); 610 611 /* 612 * Tell ULP about PMTU changes - increase or decrease - by returning 613 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update 614 * both ixa_pmtu and ixa_fragsize appropriately. 
615 * 616 * If ULP doesn't set that flag then we need to update ixa_fragsize 617 * since routing could have changed the ill after after ixa_fragsize 618 * was set previously in the conn_ip_output path or in 619 * ip_set_destination. 620 * 621 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu. 622 * 623 * In the case of a path MTU increase we send the packet after the 624 * notify to the ULP. 625 */ 626 if (ixa->ixa_flags & IXAF_VERIFY_PMTU) { 627 if (ixa->ixa_pmtu != pmtu) { 628 uint_t oldmtu = ixa->ixa_pmtu; 629 630 DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu, 631 uint32_t, ixa->ixa_pmtu); 632 ASSERT(ixa->ixa_notify != NULL); 633 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 634 IXAN_PMTU, pmtu); 635 if (pmtu < oldmtu) 636 return (EMSGSIZE); 637 } 638 } else { 639 ixa->ixa_fragsize = pmtu; 640 } 641 return (0); 642 } 643 644 /* 645 * Verify LSO usability. Keep the return value simple to indicate whether 646 * the LSO capability has changed. Handle both IPv4 and IPv6. 647 */ 648 static boolean_t 649 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) 650 { 651 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 652 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab; 653 654 if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 655 /* 656 * Not unsable any more. 657 */ 658 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 659 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 660 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 661 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 662 !ILL_LSO_TCP_IPV4_USABLE(ill) : 663 !ILL_LSO_TCP_IPV6_USABLE(ill))) { 664 ixa->ixa_flags &= ~IXAF_LSO_CAPAB; 665 666 return (B_FALSE); 667 } 668 669 /* 670 * Capability has changed, refresh the copy in ixa. 
671 */ 672 if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) { 673 *lsoc = *new_lsoc; 674 675 return (B_FALSE); 676 } 677 } else { /* Was not usable */ 678 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 679 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 680 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 681 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 682 ILL_LSO_TCP_IPV4_USABLE(ill) : 683 ILL_LSO_TCP_IPV6_USABLE(ill))) { 684 *lsoc = *new_lsoc; 685 ixa->ixa_flags |= IXAF_LSO_CAPAB; 686 687 return (B_FALSE); 688 } 689 } 690 691 return (B_TRUE); 692 } 693 694 /* 695 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether 696 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6. 697 */ 698 static boolean_t 699 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa) 700 { 701 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) { 702 /* 703 * Not unsable any more. 704 */ 705 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 706 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 707 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 708 !ILL_ZCOPY_USABLE(ill)) { 709 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; 710 711 return (B_FALSE); 712 } 713 } else { /* Was not usable */ 714 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 715 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 716 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 717 ILL_ZCOPY_USABLE(ill)) { 718 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; 719 720 return (B_FALSE); 721 } 722 } 723 724 return (B_TRUE); 725 } 726 727 728 /* 729 * When there is no conn_t context, this will send a packet. 730 * The caller must *not* have called conn_connect() or ip_attr_connect() 731 * before calling ip_output_simple(). 732 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH. 733 * Honors IXAF_SET_SOURCE. 734 * 735 * We acquire the ire and after calling ire_sendfn we release 736 * the hold on the ire. Ditto for the nce and dce. 
737 * 738 * This assumes that the caller has set the following in ip_xmit_attr_t: 739 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. 740 * If ixa_ifindex is non-zero it means send out that ill. (If it is 741 * an upper IPMP ill we load balance across the group; if a lower we send 742 * on that lower ill without load balancing.) 743 * IXAF_IS_IPV4 must be set correctly. 744 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. 745 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. 746 * If neither of those two are set we do an IPsec policy lookup. 747 * 748 * We handle setting things like 749 * ixa_pktlen 750 * ixa_ip_hdr_length 751 * ixa->ixa_protocol 752 * 753 * The caller may set ixa_xmit_hint, which is used for ECMP selection and 754 * transmit ring selecting in GLD. 755 * 756 * The caller must do an ixa_cleanup() to release any IPsec references 757 * after we return. 758 */ 759 int 760 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) 761 { 762 ts_label_t *effective_tsl = NULL; 763 int err; 764 765 ASSERT(ixa->ixa_ipst != NULL); 766 767 if (is_system_labeled()) { 768 ip_stack_t *ipst = ixa->ixa_ipst; 769 770 if (ixa->ixa_flags & IXAF_IS_IPV4) { 771 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, 772 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 773 &effective_tsl); 774 } else { 775 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, 776 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 777 &effective_tsl); 778 } 779 if (err != 0) { 780 ip2dbg(("tsol_check: label check failed (%d)\n", err)); 781 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 782 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 783 ip_drop_output("tsol_check_label", mp, NULL); 784 freemsg(mp); 785 return (err); 786 } 787 if (effective_tsl != NULL) { 788 /* Update the label */ 789 ip_xmit_attr_replace_tsl(ixa, effective_tsl); 790 } 791 } 792 793 if (ixa->ixa_flags & IXAF_IS_IPV4) 794 return (ip_output_simple_v4(mp, ixa)); 795 else 796 return 
(ip_output_simple_v6(mp, ixa)); 797 } 798 799 int 800 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) 801 { 802 ipha_t *ipha; 803 ipaddr_t firsthop; /* In IP header */ 804 ipaddr_t dst; /* End of source route, or ipha_dst if none */ 805 ire_t *ire; 806 ipaddr_t setsrc; /* RTF_SETSRC */ 807 int error; 808 ill_t *ill = NULL; 809 dce_t *dce = NULL; 810 nce_t *nce; 811 iaflags_t ixaflags = ixa->ixa_flags; 812 ip_stack_t *ipst = ixa->ixa_ipst; 813 boolean_t repeat = B_FALSE; 814 boolean_t multirt = B_FALSE; 815 int64_t now; 816 817 ipha = (ipha_t *)mp->b_rptr; 818 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 819 820 /* 821 * Even on labeled systems we can have a NULL ixa_tsl e.g., 822 * for IGMP/MLD traffic. 823 */ 824 825 /* Caller already set flags */ 826 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 827 828 ASSERT(ixa->ixa_nce == NULL); 829 830 ixa->ixa_pktlen = ntohs(ipha->ipha_length); 831 ASSERT(ixa->ixa_pktlen == msgdsize(mp)); 832 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 833 ixa->ixa_protocol = ipha->ipha_protocol; 834 835 /* 836 * Assumes that source routed packets have already been massaged by 837 * the ULP (ip_massage_options) and as a result ipha_dst is the next 838 * hop in the source route. The final destination is used for IPsec 839 * policy and DCE lookup. 
840 */ 841 firsthop = ipha->ipha_dst; 842 dst = ip_get_dst(ipha); 843 844 repeat_ire: 845 error = 0; 846 setsrc = INADDR_ANY; 847 ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error, 848 &multirt); 849 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 850 if (error != 0) { 851 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 852 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 853 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); 854 freemsg(mp); 855 goto done; 856 } 857 858 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { 859 /* ire_ill might be NULL hence need to skip some code */ 860 if (ixaflags & IXAF_SET_SOURCE) 861 ipha->ipha_src = htonl(INADDR_LOOPBACK); 862 ixa->ixa_fragsize = IP_MAXPACKET; 863 ill = NULL; 864 nce = NULL; 865 ire->ire_ob_pkt_count++; 866 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 867 /* No dce yet; use default one */ 868 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, 869 &ipst->ips_dce_default->dce_ident); 870 goto done; 871 } 872 873 /* Note that ipha_dst is only used for IRE_MULTICAST */ 874 nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 875 if (nce == NULL) { 876 /* Allocation failure? */ 877 ip_drop_output("ire_to_nce", mp, ill); 878 freemsg(mp); 879 error = ENOBUFS; 880 goto done; 881 } 882 if (nce->nce_is_condemned) { 883 nce_t *nce1; 884 885 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); 886 nce_refrele(nce); 887 if (nce1 == NULL) { 888 if (!repeat) { 889 /* Try finding a better IRE */ 890 repeat = B_TRUE; 891 ire_refrele(ire); 892 goto repeat_ire; 893 } 894 /* Tried twice - drop packet */ 895 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 896 ip_drop_output("No nce", mp, ill); 897 freemsg(mp); 898 error = ENOBUFS; 899 goto done; 900 } 901 nce = nce1; 902 } 903 904 /* 905 * For multicast with multirt we have a flag passed back from 906 * ire_lookup_multi_ill_v4 since we don't have an IRE for each 907 * possible multicast address. 
908 * We also need a flag for multicast since we can't check 909 * whether RTF_MULTIRT is set in ixa_ire for multicast. 910 */ 911 if (multirt) { 912 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 913 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 914 } else { 915 ixa->ixa_postfragfn = ire->ire_postfragfn; 916 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 917 } 918 ASSERT(ixa->ixa_nce == NULL); 919 ixa->ixa_nce = nce; 920 921 /* 922 * Check for a dce_t with a path mtu. 923 */ 924 dce = dce_lookup_v4(dst, ipst, NULL); 925 ASSERT(dce != NULL); 926 927 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { 928 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 929 } else if (dce->dce_flags & DCEF_PMTU) { 930 /* 931 * To avoid a periodic timer to increase the path MTU we 932 * look at dce_last_change_time each time we send a packet. 933 */ 934 now = ddi_get_lbolt64(); 935 if (TICK_TO_SEC(now) - dce->dce_last_change_time > 936 ipst->ips_ip_pathmtu_interval) { 937 /* 938 * Older than 20 minutes. Drop the path MTU information. 939 */ 940 mutex_enter(&dce->dce_lock); 941 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 942 dce->dce_last_change_time = TICK_TO_SEC(now); 943 mutex_exit(&dce->dce_lock); 944 dce_increment_generation(dce); 945 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 946 } else { 947 uint_t fragsize; 948 949 fragsize = ip_get_base_mtu(nce->nce_ill, ire); 950 if (fragsize > dce->dce_pmtu) 951 fragsize = dce->dce_pmtu; 952 ixa->ixa_fragsize = fragsize; 953 } 954 } else { 955 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 956 } 957 958 /* 959 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp 960 * interface for source address selection. 
961 */ 962 ill = ire_nexthop_ill(ire); 963 964 if (ixaflags & IXAF_SET_SOURCE) { 965 ipaddr_t src; 966 967 /* 968 * We use the final destination to get 969 * correct selection for source routed packets 970 */ 971 972 /* If unreachable we have no ill but need some source */ 973 if (ill == NULL) { 974 src = htonl(INADDR_LOOPBACK); 975 error = 0; 976 } else { 977 error = ip_select_source_v4(ill, setsrc, dst, 978 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, 979 &src, NULL, NULL); 980 } 981 if (error != 0) { 982 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 983 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 984 ip_drop_output("ipIfStatsOutDiscards - no source", 985 mp, ill); 986 freemsg(mp); 987 goto done; 988 } 989 ipha->ipha_src = src; 990 } else if (ixaflags & IXAF_VERIFY_SOURCE) { 991 /* Check if the IP source is assigned to the host. */ 992 if (!ip_verify_src(mp, ixa, NULL)) { 993 /* Don't send a packet with a source that isn't ours */ 994 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 995 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 996 ip_drop_output("ipIfStatsOutDiscards - invalid source", 997 mp, ill); 998 freemsg(mp); 999 error = EADDRNOTAVAIL; 1000 goto done; 1001 } 1002 } 1003 1004 1005 /* 1006 * Check against global IPsec policy to set the AH/ESP attributes. 1007 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 1008 */ 1009 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1010 ASSERT(ixa->ixa_ipsec_policy == NULL); 1011 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); 1012 if (mp == NULL) { 1013 /* MIB and ip_drop_packet already done */ 1014 return (EHOSTUNREACH); /* IPsec policy failure */ 1015 } 1016 } 1017 1018 if (ill != NULL) { 1019 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 1020 } else { 1021 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1022 } 1023 1024 /* 1025 * We update the statistics on the most specific IRE i.e., the first 1026 * one we found. 
1027 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 1028 * can only count the use prior to fragmentation. However the MIB 1029 * counters on the ill will be incremented in post fragmentation. 1030 */ 1031 ire->ire_ob_pkt_count++; 1032 1033 /* 1034 * Based on ire_type and ire_flags call one of: 1035 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK 1036 * ire_send_multirt_v4 - if RTF_MULTIRT 1037 * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 1038 * ire_send_multicast_v4 - for IRE_MULTICAST 1039 * ire_send_broadcast_v4 - for IRE_BROADCAST 1040 * ire_send_wire_v4 - for the rest. 1041 */ 1042 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); 1043 done: 1044 ire_refrele(ire); 1045 if (dce != NULL) 1046 dce_refrele(dce); 1047 if (ill != NULL) 1048 ill_refrele(ill); 1049 if (ixa->ixa_nce != NULL) 1050 nce_refrele(ixa->ixa_nce); 1051 ixa->ixa_nce = NULL; 1052 return (error); 1053 } 1054 1055 /* 1056 * ire_sendfn() functions. 1057 * These functions use the following xmit_attr: 1058 * - ixa_fragsize - read to determine whether or not to fragment 1059 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec 1060 * - ixa_ipsec_* are used inside IPsec 1061 * - IXAF_SET_SOURCE - replace IP source in broadcast case. 1062 * - IXAF_LOOPBACK_COPY - for multicast and broadcast 1063 */ 1064 1065 1066 /* 1067 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK 1068 * 1069 * The checks for restrict_interzone_loopback are done in ire_route_recursive. 1070 */ 1071 /* ARGSUSED4 */ 1072 int 1073 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1074 ip_xmit_attr_t *ixa, uint32_t *identp) 1075 { 1076 ipha_t *ipha = (ipha_t *)iph_arg; 1077 ip_stack_t *ipst = ixa->ixa_ipst; 1078 ill_t *ill = ire->ire_ill; 1079 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 1080 uint_t pktlen = ixa->ixa_pktlen; 1081 1082 /* 1083 * No fragmentation, no nce, no application of IPsec, 1084 * and no ipha_ident assignment. 
1085 * 1086 * Note different order between IP provider and FW_HOOKS than in 1087 * send_wire case. 1088 */ 1089 1090 /* 1091 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1092 * send probe, but not the receive probe. 1093 */ 1094 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1095 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1096 int, 1); 1097 1098 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1099 int error; 1100 1101 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1102 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1103 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1104 ipst->ips_ipv4firewall_loopback_out, 1105 NULL, ill, ipha, mp, mp, 0, ipst, error); 1106 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1107 if (mp == NULL) 1108 return (error); 1109 1110 /* 1111 * Even if the destination was changed by the filter we use the 1112 * forwarding decision that was made based on the address 1113 * in ip_output/ip_set_destination. 1114 */ 1115 /* Length could be different */ 1116 ipha = (ipha_t *)mp->b_rptr; 1117 pktlen = ntohs(ipha->ipha_length); 1118 } 1119 1120 /* 1121 * If a callback is enabled then we need to know the 1122 * source and destination zoneids for the packet. We already 1123 * have those handy. 
1124 */ 1125 if (ipst->ips_ip4_observe.he_interested) { 1126 zoneid_t szone, dzone; 1127 zoneid_t stackzoneid; 1128 1129 stackzoneid = netstackid_to_zoneid( 1130 ipst->ips_netstack->netstack_stackid); 1131 1132 if (stackzoneid == GLOBAL_ZONEID) { 1133 /* Shared-IP zone */ 1134 dzone = ire->ire_zoneid; 1135 szone = ixa->ixa_zoneid; 1136 } else { 1137 szone = dzone = stackzoneid; 1138 } 1139 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1140 } 1141 1142 /* Handle lo0 stats */ 1143 ipst->ips_loopback_packets++; 1144 1145 /* Map ixa to ira including IPsec policies */ 1146 ipsec_out_to_in(ixa, ill, &iras); 1147 iras.ira_pktlen = pktlen; 1148 1149 if (!IS_SIMPLE_IPH(ipha)) { 1150 ip_output_local_options(ipha, ipst); 1151 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1152 } 1153 1154 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1155 int error; 1156 1157 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1158 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1159 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1160 ipst->ips_ipv4firewall_loopback_in, 1161 ill, NULL, ipha, mp, mp, 0, ipst, error); 1162 1163 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1164 if (mp == NULL) { 1165 ira_cleanup(&iras, B_FALSE); 1166 return (error); 1167 } 1168 /* 1169 * Even if the destination was changed by the filter we use the 1170 * forwarding decision that was made based on the address 1171 * in ip_output/ip_set_destination. 
1172 */ 1173 /* Length could be different */ 1174 ipha = (ipha_t *)mp->b_rptr; 1175 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1176 } 1177 1178 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1179 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1180 int, 1); 1181 1182 ire->ire_ib_pkt_count++; 1183 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1184 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1185 1186 /* Destined to ire_zoneid - use that for fanout */ 1187 iras.ira_zoneid = ire->ire_zoneid; 1188 1189 if (is_system_labeled()) { 1190 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1191 1192 /* 1193 * This updates ira_cred, ira_tsl and ira_free_flags based 1194 * on the label. We don't expect this to ever fail for 1195 * loopback packets, so we silently drop the packet should it 1196 * fail. 1197 */ 1198 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1199 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1200 ip_drop_input("tsol_get_pkt_label", mp, ill); 1201 freemsg(mp); 1202 return (0); 1203 } 1204 ASSERT(iras.ira_tsl != NULL); 1205 1206 /* tsol_get_pkt_label sometimes does pullupmsg */ 1207 ipha = (ipha_t *)mp->b_rptr; 1208 } 1209 1210 ip_fanout_v4(mp, ipha, &iras); 1211 1212 /* We moved any IPsec refs from ixa to iras */ 1213 ira_cleanup(&iras, B_FALSE); 1214 return (0); 1215 } 1216 1217 /* 1218 * ire_sendfn for IRE_BROADCAST 1219 * If the broadcast address is present on multiple ills and ixa_ifindex 1220 * isn't set, then we generate 1221 * a separate datagram (potentially with different source address) for 1222 * those ills. In any case, only one copy is looped back to ip_input_v4. 
1223 */ 1224 int 1225 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1226 ip_xmit_attr_t *ixa, uint32_t *identp) 1227 { 1228 ipha_t *ipha = (ipha_t *)iph_arg; 1229 ip_stack_t *ipst = ixa->ixa_ipst; 1230 irb_t *irb = ire->ire_bucket; 1231 ire_t *ire1; 1232 mblk_t *mp1; 1233 ipha_t *ipha1; 1234 iaflags_t ixaflags = ixa->ixa_flags; 1235 nce_t *nce1, *nce_orig; 1236 1237 /* 1238 * Unless ire_send_multirt_v4 already set a ttl, force the 1239 * ttl to a smallish value. 1240 */ 1241 if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { 1242 /* 1243 * To avoid broadcast storms, we usually set the TTL to 1 for 1244 * broadcasts. This can 1245 * be overridden stack-wide through the ip_broadcast_ttl 1246 * ndd tunable, or on a per-connection basis through the 1247 * IP_BROADCAST_TTL socket option. 1248 * 1249 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 1250 * will force ttl to one after we've set this. 1251 */ 1252 if (ixaflags & IXAF_BROADCAST_TTL_SET) 1253 ipha->ipha_ttl = ixa->ixa_broadcast_ttl; 1254 else 1255 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 1256 } 1257 /* 1258 * Make sure we get a loopback copy (after IPsec and frag) 1259 * Skip hardware checksum so that loopback copy is checksumed. 1260 */ 1261 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1262 1263 /* Do we need to potentially generate multiple copies? */ 1264 if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) 1265 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1266 1267 /* 1268 * Loop over all IRE_BROADCAST in the bucket (might only be one). 1269 * Note that everything in the bucket has the same destination address. 1270 */ 1271 irb_refhold(irb); 1272 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1273 /* We do the main IRE after the end of the loop */ 1274 if (ire1 == ire) 1275 continue; 1276 1277 /* 1278 * Only IREs for the same IP address should be in the same 1279 * bucket. 1280 * But could have IRE_HOSTs in the case of CGTP. 
1281 * If we find any multirt routes we bail out of the loop 1282 * and just do the single packet at the end; ip_postfrag_multirt 1283 * will duplicate the packet. 1284 */ 1285 ASSERT(ire1->ire_addr == ire->ire_addr); 1286 if (!(ire1->ire_type & IRE_BROADCAST)) 1287 continue; 1288 1289 if (IRE_IS_CONDEMNED(ire1)) 1290 continue; 1291 1292 if (ixa->ixa_zoneid != ALL_ZONES && 1293 ire->ire_zoneid != ire1->ire_zoneid) 1294 continue; 1295 1296 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); 1297 1298 if (ire1->ire_flags & RTF_MULTIRT) 1299 break; 1300 1301 /* 1302 * For IPMP we only send for the ipmp_ill. arp_nce_init() will 1303 * ensure that this goes out on the cast_ill. 1304 */ 1305 if (IS_UNDER_IPMP(ire1->ire_ill)) 1306 continue; 1307 1308 mp1 = copymsg(mp); 1309 if (mp1 == NULL) { 1310 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1311 ipIfStatsOutDiscards); 1312 ip_drop_output("ipIfStatsOutDiscards", 1313 mp, ire1->ire_ill); 1314 continue; 1315 } 1316 1317 ipha1 = (ipha_t *)mp1->b_rptr; 1318 if (ixa->ixa_flags & IXAF_SET_SOURCE) { 1319 /* 1320 * Need to pick a different source address for each 1321 * interface. If we have a global IPsec policy and 1322 * no per-socket policy then we punt to 1323 * ip_output_simple_v4 using a separate ip_xmit_attr_t. 1324 */ 1325 if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { 1326 ip_output_simple_broadcast(ixa, mp1); 1327 continue; 1328 } 1329 /* Pick a new source address for each interface */ 1330 if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, 1331 ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, 1332 &ipha1->ipha_src, NULL, NULL) != 0) { 1333 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1334 ipIfStatsOutDiscards); 1335 ip_drop_output("ipIfStatsOutDiscards - select " 1336 "broadcast source", mp1, ire1->ire_ill); 1337 freemsg(mp1); 1338 continue; 1339 } 1340 /* 1341 * Check against global IPsec policy to set the AH/ESP 1342 * attributes. IPsec will set IXAF_IPSEC_* and 1343 * ixa_ipsec_* as appropriate. 
1344 */ 1345 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1346 ASSERT(ixa->ixa_ipsec_policy == NULL); 1347 mp1 = ip_output_attach_policy(mp1, ipha, NULL, 1348 NULL, ixa); 1349 if (mp1 == NULL) { 1350 /* 1351 * MIB and ip_drop_packet already 1352 * done 1353 */ 1354 continue; 1355 } 1356 } 1357 } 1358 /* Make sure we have an NCE on this ill */ 1359 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, 1360 ire1->ire_type); 1361 if (nce1 == NULL) { 1362 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1363 ipIfStatsOutDiscards); 1364 ip_drop_output("ipIfStatsOutDiscards - broadcast nce", 1365 mp1, ire1->ire_ill); 1366 freemsg(mp1); 1367 continue; 1368 } 1369 nce_orig = ixa->ixa_nce; 1370 ixa->ixa_nce = nce1; 1371 1372 ire_refhold(ire1); 1373 /* 1374 * Ignore any errors here. We just collect the errno for 1375 * the main ire below 1376 */ 1377 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); 1378 ire_refrele(ire1); 1379 1380 ixa->ixa_nce = nce_orig; 1381 nce_refrele(nce1); 1382 1383 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; 1384 } 1385 irb_refrele(irb); 1386 /* Finally, the main one */ 1387 1388 /* 1389 * For IPMP we only send broadcasts on the ipmp_ill. 1390 */ 1391 if (IS_UNDER_IPMP(ire->ire_ill)) { 1392 freemsg(mp); 1393 return (0); 1394 } 1395 1396 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1397 } 1398 1399 /* 1400 * Send a packet using a different source address and different 1401 * IPsec policy. 
1402 */ 1403 static void 1404 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) 1405 { 1406 ip_xmit_attr_t ixas; 1407 1408 bzero(&ixas, sizeof (ixas)); 1409 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 1410 ixas.ixa_zoneid = ixa->ixa_zoneid; 1411 ixas.ixa_ifindex = 0; 1412 ixas.ixa_ipst = ixa->ixa_ipst; 1413 ixas.ixa_cred = ixa->ixa_cred; 1414 ixas.ixa_cpid = ixa->ixa_cpid; 1415 ixas.ixa_tsl = ixa->ixa_tsl; 1416 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1417 1418 (void) ip_output_simple(mp, &ixas); 1419 ixa_cleanup(&ixas); 1420 } 1421 1422 1423 static void 1424 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) 1425 { 1426 ip_stack_t *ipst = ixa->ixa_ipst; 1427 1428 /* Limit the TTL on multirt packets */ 1429 if (ire->ire_type & IRE_MULTICAST) { 1430 if (ipha->ipha_ttl > 1) { 1431 ip2dbg(("ire_send_multirt_v4: forcing multicast " 1432 "multirt TTL to 1 (was %d), dst 0x%08x\n", 1433 ipha->ipha_ttl, ntohl(ire->ire_addr))); 1434 ipha->ipha_ttl = 1; 1435 } 1436 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1437 } else if ((ipst->ips_ip_multirt_ttl > 0) && 1438 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 1439 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 1440 /* 1441 * Need to ensure we don't increase the ttl should we go through 1442 * ire_send_broadcast or multicast. 1443 */ 1444 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1445 } 1446 } 1447 1448 /* 1449 * ire_sendfn for IRE_MULTICAST 1450 */ 1451 int 1452 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1453 ip_xmit_attr_t *ixa, uint32_t *identp) 1454 { 1455 ipha_t *ipha = (ipha_t *)iph_arg; 1456 ip_stack_t *ipst = ixa->ixa_ipst; 1457 ill_t *ill = ire->ire_ill; 1458 iaflags_t ixaflags = ixa->ixa_flags; 1459 1460 /* 1461 * The IRE_MULTICAST is the same whether or not multirt is in use. 1462 * Hence we need special-case code. 
1463 */ 1464 if (ixaflags & IXAF_MULTIRT_MULTICAST) 1465 multirt_check_v4(ire, ipha, ixa); 1466 1467 /* 1468 * Check if anything in ip_input_v4 wants a copy of the transmitted 1469 * packet (after IPsec and fragmentation) 1470 * 1471 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set 1472 * RSVP and the rsvp daemon is an example of a 1473 * protocol and user level process that 1474 * handles it's own routing. Hence, it uses the 1475 * SO_DONTROUTE option to accomplish this. 1476 * 2. If the sender has set IP_MULTICAST_LOOP, then we just 1477 * check whether there are any receivers for the group on the ill 1478 * (ignoring the zoneid). 1479 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are 1480 * any members in other shared-IP zones. 1481 * If such members exist, then we indicate that the sending zone 1482 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP 1483 * behavior. 1484 * 1485 * When we loopback we skip hardware checksum to make sure loopback 1486 * copy is checksumed. 1487 * 1488 * Note that ire_ill is the upper in the case of IPMP. 1489 */ 1490 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); 1491 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && 1492 !(ixaflags & IXAF_DONTROUTE)) { 1493 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1494 } else if (ixaflags & IXAF_MULTICAST_LOOP) { 1495 /* 1496 * If this zone or any other zone has members then loopback 1497 * a copy. 1498 */ 1499 if (ill_hasmembers_v4(ill, ipha->ipha_dst)) 1500 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1501 } else if (ipst->ips_netstack->netstack_numzones > 1) { 1502 /* 1503 * This zone should not have a copy. But there are some other 1504 * zones which might have members. 
1505 */ 1506 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1507 ixa->ixa_zoneid)) { 1508 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; 1509 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; 1510 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1511 } 1512 } 1513 1514 /* 1515 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, 1516 * force the ttl to the IP_MULTICAST_TTL value 1517 */ 1518 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { 1519 ipha->ipha_ttl = ixa->ixa_multicast_ttl; 1520 } 1521 1522 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1523 } 1524 1525 /* 1526 * ire_sendfn for IREs with RTF_MULTIRT 1527 */ 1528 int 1529 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1530 ip_xmit_attr_t *ixa, uint32_t *identp) 1531 { 1532 ipha_t *ipha = (ipha_t *)iph_arg; 1533 1534 multirt_check_v4(ire, ipha, ixa); 1535 1536 if (ire->ire_type & IRE_MULTICAST) 1537 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); 1538 else if (ire->ire_type & IRE_BROADCAST) 1539 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); 1540 else 1541 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1542 } 1543 1544 /* 1545 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE 1546 */ 1547 int 1548 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1549 ip_xmit_attr_t *ixa, uint32_t *identp) 1550 { 1551 ip_stack_t *ipst = ixa->ixa_ipst; 1552 ipha_t *ipha = (ipha_t *)iph_arg; 1553 ill_t *ill; 1554 ip_recv_attr_t iras; 1555 boolean_t dummy; 1556 1557 /* We assign an IP ident for nice errors */ 1558 ipha->ipha_ident = atomic_add_32_nv(identp, 1); 1559 1560 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 1561 1562 if (ire->ire_type & IRE_NOROUTE) { 1563 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ 1564 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, 1565 RTA_DST, ipst); 1566 } 1567 1568 if (ire->ire_flags & RTF_BLACKHOLE) { 1569 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, 
NULL); 1570 freemsg(mp); 1571 /* No error even for local senders - silent blackhole */ 1572 return (0); 1573 } 1574 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); 1575 1576 /* 1577 * We need an ill_t for the ip_recv_attr_t even though this packet 1578 * was never received and icmp_unreachable doesn't currently use 1579 * ira_ill. 1580 */ 1581 ill = ill_lookup_on_name("lo0", B_FALSE, 1582 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); 1583 if (ill == NULL) { 1584 freemsg(mp); 1585 return (EHOSTUNREACH); 1586 } 1587 1588 bzero(&iras, sizeof (iras)); 1589 /* Map ixa to ira including IPsec policies */ 1590 ipsec_out_to_in(ixa, ill, &iras); 1591 1592 if (ip_source_routed(ipha, ipst)) { 1593 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); 1594 } else { 1595 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 1596 } 1597 /* We moved any IPsec refs from ixa to iras */ 1598 ira_cleanup(&iras, B_FALSE); 1599 ill_refrele(ill); 1600 return (EHOSTUNREACH); 1601 } 1602 1603 /* 1604 * Calculate a checksum ignoring any hardware capabilities 1605 * 1606 * Returns B_FALSE if the packet was too short for the checksum. Caller 1607 * should free and do stats. 
 */
static boolean_t
ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
	ipaddr_t	dst = ipha->ipha_dst;
	ipaddr_t	src = ipha->ipha_src;

	/* Just in case it contained garbage */
	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;

	/*
	 * Calculate ULP checksum
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
		/* No ULP checksum for other protocols */
		goto ip_hdr_cksum;
	}

	/* The ULP checksum field is in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * We accumulate the pseudo header checksum in cksum.
	 * This is pretty hairy code, so watch close. One
	 * thing to keep in mind is that UDP and TCP have
	 * stored their respective datagram lengths in their
	 * checksum fields. This lines things up real nice.
	 */
	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);

	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
	/*
	 * For UDP/IPv4 a zero means that the packet wasn't checksummed.
	 * Change to 0xffff
	 */
	if (protocol == IPPROTO_UDP && cksum == 0)
		*cksump = ~cksum;
	else
		*cksump = cksum;

	IP_STAT(ipst, ip_out_sw_cksum);
	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);

ip_hdr_cksum:
	/* Calculate IPv4 header checksum */
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (B_TRUE);
}

/*
 * Calculate the ULP checksum - try to use hardware.
 * In the case of MULTIRT, broadcast or multicast the
 * IXAF_NO_HW_CKSUM is set in which case we use software.
 *
 * If the hardware supports IP header checksum offload; then clear the
 * contents of IP header checksum field as expected by NIC.
 * Do this only if we offloaded either full or partial sum.
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 */
static boolean_t
ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
    ip_xmit_attr_t *ixa, ill_t *ill)
{
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint16_t	hck_flags;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;

	/* Fall back to software when offload is unavailable or disallowed */
	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
	    !dohwcksum) {
		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
	}

	/*
	 * Calculate ULP checksum. Note that we don't use cksump and cksum
	 * if the ill has FULL support.
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		/* SCTP's CRC32c is always computed in software */
		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
	ip_hdr_cksum:
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		return (B_TRUE);
	}

	/* The ULP checksum field is in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * Underlying interface supports hardware checksum offload for
	 * the payload; leave the payload checksum for the hardware to
	 * calculate. N.B: We only need to set up checksum info on the
	 * first mblk.
	 */
	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;

	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
	if (hck_flags & HCKSUM_INET_FULL_V4) {
		/*
		 * Hardware calculates pseudo-header, header and the
		 * payload checksums, so clear the checksum field in
		 * the protocol header.
		 */
		*cksump = 0;
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	if ((hck_flags) & HCKSUM_INET_PARTIAL) {
		ipaddr_t	dst = ipha->ipha_dst;
		ipaddr_t	src = ipha->ipha_src;
		/*
		 * Partial checksum offload has been enabled. Fill
		 * the checksum field in the protocol header with the
		 * pseudo-header checksum value.
		 *
		 * We accumulate the pseudo header checksum in cksum.
		 * This is pretty hairy code, so watch close. One
		 * thing to keep in mind is that UDP and TCP have
		 * stored their respective datagram lengths in their
		 * checksum fields. This lines things up real nice.
		 */
		cksum += (dst >> 16) + (dst & 0xFFFF) +
		    (src >> 16) + (src & 0xFFFF);
		cksum += *(cksump);
		cksum = (cksum & 0xFFFF) + (cksum >> 16);
		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);

		/*
		 * Offsets are relative to beginning of IP header.
		 */
		DB_CKSUMSTART(mp) = ip_hdr_length;
		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
		DB_CKSUMEND(mp) = pktlen;
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	/* Hardware capabilities include neither full nor partial IPv4 */
	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
}

/*
 * ire_sendfn for offlink and onlink destinations.
 * Also called from the multicast, broadcast, multirt send functions.
 *
 * Assumes that the caller has a hold on the ire.
 *
 * This function doesn't care if the IRE just became condemned since that
 * can happen at any time.
 */
/* ARGSUSED */
int
ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipha_t		*ipha = (ipha_t *)iph_arg;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ill_t		*ill;

	ASSERT(ixa->ixa_nce != NULL);
	ill = ixa->ixa_nce->nce_ill;

	if (ixaflags & IXAF_DONTROUTE)
		ipha->ipha_ttl = 1;

	/*
	 * Assign an ident value for this packet. There could be other
	 * threads targeting the same destination, so we have to arrange
	 * for a atomic increment. Note that we use a 32-bit atomic add
	 * because it has better performance than its 16-bit sibling.
	 *
	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
	 * be the number of TCP segments that the driver/hardware will
	 * additionally construct.
	 *
	 * If running in cluster mode and if the source address
	 * belongs to a replicated service then vector through
	 * cl_inet_ipident vector to allocate ip identifier
	 * NOTE: This is a contract private interface with the
	 * clustering group.
	 */
	if (cl_inet_ipident != NULL) {
		ipaddr_t src = ipha->ipha_src;
		ipaddr_t dst = ipha->ipha_dst;
		netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;

		ASSERT(cl_inet_isclusterwide != NULL);
		if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
		    AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
			/*
			 * Note: not correct with LSO since we can't allocate
			 * ixa_extra_ident+1 consecutive values.
			 */
			ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
			    IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
			    (uint8_t *)(uintptr_t)dst, NULL);
		} else {
			ipha->ipha_ident = atomic_add_32_nv(identp,
			    ixa->ixa_extra_ident + 1);
		}
	} else {
		ipha->ipha_ident = atomic_add_32_nv(identp,
		    ixa->ixa_extra_ident + 1);
	}
#ifndef _BIG_ENDIAN
	ipha->ipha_ident = htons(ipha->ipha_ident);
#endif

	/*
	 * This might set b_band, thus the IPsec and fragmentation
	 * code in IP ensures that b_band is updated in the first mblk.
	 */
	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
		/* ip_process translates an IS_UNDER_IPMP */
		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
		if (mp == NULL) {
			/* ip_drop_packet and MIB done */
			return (0);	/* Might just be delayed */
		}
	}

	/*
	 * Verify any IPv4 options.
	 *
	 * The presence of IP options also forces the network stack to
	 * calculate the checksum in software. This is because:
	 *
	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
	 * the size of "start offset" width to 6-bit. This effectively
	 * sets the largest value of the offset to 64-bytes, starting
	 * from the MAC header. When the cumulative MAC and IP headers
	 * exceed such limit, the offset will wrap around. This causes
	 * the checksum to be calculated at the wrong place.
	 *
	 * IPv4 source routing: none of the full-checksum capable NICs
	 * is capable of correctly handling the IPv4 source-routing
	 * option for purposes of calculating the pseudo-header; the
	 * actual destination is different from the destination in the
	 * header which is that of the next-hop. (This case may not be
	 * true for NICs which can parse IPv6 extension headers, but
	 * we choose to simplify the implementation by not offloading
	 * checksum when they are present.)
	 */
	if (!IS_SIMPLE_IPH(ipha)) {
		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
		/* An IS_UNDER_IPMP ill is ok here */
		if (ip_output_options(mp, ipha, ixa, ill)) {
			/* Packet has been consumed and ICMP error sent */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			return (EINVAL);
		}
	}

	/*
	 * To handle IPsec/iptun's labeling needs we need to tag packets
	 * while we still have ixa_tsl
	 */
	if (is_system_labeled() && ixa->ixa_tsl != NULL &&
	    (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
	    ill->ill_mactype == DL_IPV6)) {
		cred_t	*newcr;

		newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
		    KM_NOSLEEP);
		if (newcr == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - newcr",
			    mp, ill);
			freemsg(mp);
			return (ENOBUFS);
		}
		mblk_setcred(mp, newcr, NOPID);
		crfree(newcr);	/* mblk_setcred did its own crhold */
	}

	/* Fragmentation and/or IPsec required? */
	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
	    (ixaflags & IXAF_IPSEC_SECURE)) {
		uint32_t pktlen;

		pktlen = ixa->ixa_pktlen;
		if (ixaflags & IXAF_IPSEC_SECURE)
			pktlen += ipsec_out_extra_length(ixa);

		/*
		 * NOTE(review): mp is returned to the caller unfreed on
		 * this EMSGSIZE path - confirm the caller frees it.
		 */
		if (pktlen > IP_MAXPACKET)
			return (EMSGSIZE);

		if (ixaflags & IXAF_SET_ULP_CKSUM) {
			/*
			 * Compute ULP checksum and IP header checksum
			 * using software
			 */
			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
				freemsg(mp);
				return (EINVAL);
			}
		} else {
			/* Calculate IPv4 header checksum */
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}

		/*
		 * If this packet would generate a icmp_frag_needed
		 * message, we need to handle it before we do the IPsec
		 * processing. Otherwise, we need to strip the IPsec
		 * headers before we send up the message to the ULPs
		 * which becomes messy and difficult.
		 *
		 * We check using IXAF_DONTFRAG. The DF bit in the header
		 * is not inspected - it will be copied to any generated
		 * fragments.
		 */
		if ((pktlen > ixa->ixa_fragsize) &&
		    (ixaflags & IXAF_DONTFRAG)) {
			/* Generate ICMP and return error */
			ip_recv_attr_t	iras;

			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
			    uint_t, ixa->ixa_pmtu);

			bzero(&iras, sizeof (iras));
			/* Map ixa to ira including IPsec policies */
			ipsec_out_to_in(ixa, ill, &iras);

			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
			/* We moved any IPsec refs from ixa to iras */
			ira_cleanup(&iras, B_FALSE);
			return (EMSGSIZE);
		}
		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
		    uint_t, ixa->ixa_pmtu);

		if (ixaflags & IXAF_IPSEC_SECURE) {
			/*
			 * Pass in sufficient information so that
			 * IPsec can determine whether to fragment, and
			 * which function to call after fragmentation.
			 */
			return (ipsec_out_process(mp, ixa));
		}
		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
	}
	if (ixaflags & IXAF_SET_ULP_CKSUM) {
		/* Compute ULP checksum and IP header checksum */
		/* An IS_UNDER_IPMP ill is ok here */
		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
			freemsg(mp);
			return (EINVAL);
		}
	} else {
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	}
	/* No fragmentation needed; hand straight to the postfrag function */
	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
}

/*
 * Send mp into ip_input
 * Common for IPv4 and IPv6
 */
void
ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
    uint_t pkt_len, zoneid_t nolzid)
{
	rtc_t		rtc;
	ill_t		*ill = nce->nce_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	ncec_t		*ncec;

	/*
	 * Build receive attributes by hand (no bzero); every field read by
	 * the input path below is explicitly initialized here.
	 */
	ncec = nce->nce_common;
	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
	if (ncec->ncec_flags & NCE_F_BCAST)
		iras.ira_flags |= IRAF_L2DST_BROADCAST;
	else if (ncec->ncec_flags & NCE_F_MCAST)
		iras.ira_flags |= IRAF_L2DST_MULTICAST;

	iras.ira_free_flags = 0;
	iras.ira_cred = NULL;
	iras.ira_cpid = NOPID;
	iras.ira_tsl = NULL;
	iras.ira_zoneid = ALL_ZONES;
	iras.ira_pktlen = pkt_len;
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);

	if (ixaflags & IXAF_IS_IPV4)
		iras.ira_flags |= IRAF_IS_IPV4;

	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;
	iras.ira_mhip = NULL;

	/* Carry over the flags shared between ixa_flags and ira_flags */
	iras.ira_flags |= ixaflags & IAF_MASK;
	iras.ira_no_loop_zoneid = nolzid;

	/* Broadcast and multicast doesn't care about the squeue */
	iras.ira_sqp = NULL;

	rtc.rtc_ire = NULL;
	if (ixaflags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)mp->b_rptr;

		rtc.rtc_ipaddr = INADDR_ANY;

		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
		if (rtc.rtc_ire != NULL) {
			ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
			ire_refrele(rtc.rtc_ire);
		}
	} else {
		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

		rtc.rtc_ip6addr = ipv6_all_zeros;

		(*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
		if (rtc.rtc_ire != NULL) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
			ire_refrele(rtc.rtc_ire);
		}
	}
	/* Any references to clean up? No hold on ira */
	if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
		ira_cleanup(&iras, B_FALSE);
}

/*
 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
 * looks at the IXAF_LOOPBACK_COPY flag.
 * Common for IPv4 and IPv6.
 *
 * If the loopback copy fails (due to no memory) but we send the packet out
 * on the wire we return no failure. Only in the case we suppress the wire
 * sending do we take the loopback failure into account.
 *
 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
 * Those operations are performed on this packet in ip_xmit() and it would
 * be odd to do it twice for the same packet.
2126 */ 2127 int 2128 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2129 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2130 uintptr_t *ixacookie) 2131 { 2132 ill_t *ill = nce->nce_ill; 2133 int error = 0; 2134 2135 /* 2136 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2137 * had looped it back 2138 */ 2139 if (ixaflags & IXAF_LOOPBACK_COPY) { 2140 mblk_t *mp1; 2141 2142 mp1 = copymsg(mp); 2143 if (mp1 == NULL) { 2144 /* Failed to deliver the loopback copy. */ 2145 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2146 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2147 error = ENOBUFS; 2148 } else { 2149 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2150 nolzid); 2151 } 2152 } 2153 2154 /* 2155 * If TTL = 0 then only do the loopback to this host i.e. we are 2156 * done. We are also done if this was the 2157 * loopback interface since it is sufficient 2158 * to loopback one copy of a multicast packet. 2159 */ 2160 if (ixaflags & IXAF_IS_IPV4) { 2161 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2162 2163 if (ipha->ipha_ttl == 0) { 2164 ip_drop_output("multicast ipha_ttl not sent to wire", 2165 mp, ill); 2166 freemsg(mp); 2167 return (error); 2168 } 2169 } else { 2170 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2171 2172 if (ip6h->ip6_hops == 0) { 2173 ip_drop_output("multicast ipha_ttl not sent to wire", 2174 mp, ill); 2175 freemsg(mp); 2176 return (error); 2177 } 2178 } 2179 if (nce->nce_ill->ill_wq == NULL) { 2180 /* Loopback interface */ 2181 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2182 freemsg(mp); 2183 return (error); 2184 } 2185 2186 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2187 ixacookie)); 2188 } 2189 2190 /* 2191 * Post fragmentation function for RTF_MULTIRT routes. 2192 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2193 * checks IXAF_LOOPBACK_COPY. 
2194 * 2195 * If no packet is sent due to failures then we return an errno, but if at 2196 * least one succeeded we return zero. 2197 */ 2198 int 2199 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2200 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2201 uintptr_t *ixacookie) 2202 { 2203 irb_t *irb; 2204 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2205 ire_t *ire; 2206 ire_t *ire1; 2207 mblk_t *mp1; 2208 nce_t *nce1; 2209 ill_t *ill = nce->nce_ill; 2210 ill_t *ill1; 2211 ip_stack_t *ipst = ill->ill_ipst; 2212 int error = 0; 2213 int num_sent = 0; 2214 int err; 2215 uint_t ire_type; 2216 ipaddr_t nexthop; 2217 2218 ASSERT(ixaflags & IXAF_IS_IPV4); 2219 2220 /* Check for IXAF_LOOPBACK_COPY */ 2221 if (ixaflags & IXAF_LOOPBACK_COPY) { 2222 mblk_t *mp1; 2223 2224 mp1 = copymsg(mp); 2225 if (mp1 == NULL) { 2226 /* Failed to deliver the loopback copy. */ 2227 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2228 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2229 error = ENOBUFS; 2230 } else { 2231 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2232 nolzid); 2233 } 2234 } 2235 2236 /* 2237 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send 2238 * a copy to each one. 2239 * Use the nce (nexthop) and ipha_dst to find the ire. 2240 * 2241 * MULTIRT is not designed to work with shared-IP zones thus we don't 2242 * need to pass a zoneid or a label to the IRE lookup. 
2243 */ 2244 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { 2245 /* Broadcast and multicast case */ 2246 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, 2247 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 2248 } else { 2249 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); 2250 2251 /* Unicast case */ 2252 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, 2253 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); 2254 } 2255 2256 if (ire == NULL || 2257 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2258 !(ire->ire_flags & RTF_MULTIRT)) { 2259 /* Drop */ 2260 ip_drop_output("ip_postfrag_multirt didn't find route", 2261 mp, nce->nce_ill); 2262 if (ire != NULL) 2263 ire_refrele(ire); 2264 return (ENETUNREACH); 2265 } 2266 2267 irb = ire->ire_bucket; 2268 irb_refhold(irb); 2269 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 2270 /* 2271 * For broadcast we can have a mixture of IRE_BROADCAST and 2272 * IRE_HOST due to the manually added IRE_HOSTs that are used 2273 * to trigger the creation of the special CGTP broadcast routes. 2274 * Thus we have to skip if ire_type doesn't match the original. 2275 */ 2276 if (IRE_IS_CONDEMNED(ire1) || 2277 !(ire1->ire_flags & RTF_MULTIRT) || 2278 ire1->ire_type != ire->ire_type) 2279 continue; 2280 2281 /* Do the ire argument one after the loop */ 2282 if (ire1 == ire) 2283 continue; 2284 2285 ill1 = ire_nexthop_ill(ire1); 2286 if (ill1 == NULL) { 2287 /* 2288 * This ire might not have been picked by 2289 * ire_route_recursive, in which case ire_dep might 2290 * not have been setup yet. 2291 * We kick ire_route_recursive to try to resolve 2292 * starting at ire1. 
2293 */ 2294 ire_t *ire2; 2295 2296 ire2 = ire_route_recursive_impl_v4(ire1, 2297 ire1->ire_addr, ire1->ire_type, ire1->ire_ill, 2298 ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY, 2299 B_TRUE, 0, ipst, NULL, NULL, NULL); 2300 if (ire2 != NULL) 2301 ire_refrele(ire2); 2302 ill1 = ire_nexthop_ill(ire1); 2303 } 2304 2305 if (ill1 == NULL) { 2306 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2307 ip_drop_output("ipIfStatsOutDiscards - no ill", 2308 mp, ill); 2309 error = ENETUNREACH; 2310 continue; 2311 } 2312 2313 /* Pick the addr and type to use for arp_nce_init */ 2314 if (nce->nce_common->ncec_flags & NCE_F_BCAST) { 2315 ire_type = IRE_BROADCAST; 2316 nexthop = ire1->ire_gateway_addr; 2317 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { 2318 ire_type = IRE_MULTICAST; 2319 nexthop = ipha->ipha_dst; 2320 } else { 2321 ire_type = ire1->ire_type; /* Doesn't matter */ 2322 nexthop = ire1->ire_gateway_addr; 2323 } 2324 2325 /* If IPMP meta or under, then we just drop */ 2326 if (ill1->ill_grp != NULL) { 2327 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2328 ip_drop_output("ipIfStatsOutDiscards - IPMP", 2329 mp, ill1); 2330 ill_refrele(ill1); 2331 error = ENETUNREACH; 2332 continue; 2333 } 2334 2335 nce1 = arp_nce_init(ill1, nexthop, ire_type); 2336 if (nce1 == NULL) { 2337 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2338 ip_drop_output("ipIfStatsOutDiscards - no nce", 2339 mp, ill1); 2340 ill_refrele(ill1); 2341 error = ENETUNREACH; 2342 continue; 2343 } 2344 mp1 = copymsg(mp); 2345 if (mp1 == NULL) { 2346 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2347 ip_drop_output("ipIfStatsOutDiscards", mp, ill1); 2348 nce_refrele(nce1); 2349 ill_refrele(ill1); 2350 error = ENOBUFS; 2351 continue; 2352 } 2353 /* Preserve HW checksum for this copy */ 2354 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 2355 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 2356 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 2357 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 2358 DB_LSOMSS(mp1) = 
		    DB_LSOMSS(mp);

		/* Per-route transmit accounting for this alternate route */
		ire1->ire_ob_pkt_count++;
		err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
		    0, ixacookie);
		if (err == 0)
			num_sent++;
		else
			error = err;
		nce_refrele(nce1);
		ill_refrele(ill1);
	}
	irb_refrele(irb);
	ire_refrele(ire);
	/* Finally, the main one */
	err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
	    ixacookie);
	if (err == 0)
		num_sent++;
	else
		error = err;
	/* Zero if at least one copy made it out, else the last errno seen */
	if (num_sent > 0)
		return (0);
	else
		return (error);
}

/*
 * Verify local connectivity. This check is called by ULP fusion code.
 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
 * the interface is brought down and back up. So we simply fail the local
 * process. The caller, TCP Fusion, should unfuse the connection.
 *
 * Returns B_TRUE only if the cached ire is IRE_LOCAL or IRE_LOOPBACK and
 * its generation still matches the one recorded in the transmit attributes.
 */
boolean_t
ip_output_verify_local(ip_xmit_attr_t *ixa)
{
	ire_t	*ire = ixa->ixa_ire;

	if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
		return (B_FALSE);

	return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
}

/*
 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
 *
 * The caller must call ip_output_verify_local() first. This function handles
 * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
 *
 * Returns the (possibly replaced) mblk, or NULL if the packet was consumed
 * by a firewall hook or rejected by inbound IPsec policy.
 */
mblk_t *
ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
    boolean_t hooks_in, conn_t *peer_connp)
{
	ill_t		*ill = ixa->ixa_ire->ire_ill;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ip_recv_attr_t	iras;
	int		error;	/* set by the FW_HOOKS macros below */

	ASSERT(mp != NULL);

	if (ixaflags & IXAF_IS_IPV4) {
		ipha = (ipha_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip4_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT */
		if (hooks_out) {
			DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_out_event,
			    ipst->ips_ipv4firewall_loopback_out,
			    NULL, ill, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
		}
		/* The hook may have consumed or replaced mp */
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN */
		if (hooks_in) {
			DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_in_event,
			    ipst->ips_ipv4firewall_loopback_in,
			    ill, NULL, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/*
			 * Map ixa to ira including IPsec policies.
			 * NOTE(review): no explicit ira_cleanup() follows
			 * here, unlike other ipsec_out_to_in() callers in
			 * this file - presumably ipsec_check_inbound_policy()
			 * consumes any references moved into iras; verify.
			 */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
			    NULL, &iras);
		}
	} else {
		ip6h = (ip6_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip6_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT */
		if (hooks_out) {
			DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
			    ipst->ips_ipv6firewall_loopback_out,
			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
		}
		/* The hook may have consumed or replaced mp */
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN */
		if (hooks_in) {
			DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
			    ipst->ips_ipv6firewall_loopback_in,
			    ill, NULL, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/* Map ixa to ira including IPsec policies. */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
			    ip6h, &iras);
		}
	}

	if (mp == NULL) {
		/* Inbound IPsec policy rejected the packet */
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards", NULL, ill);
	}

	return (mp);
}