1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/strsubr.h> 30 #include <sys/dlpi.h> 31 #include <sys/strsun.h> 32 #include <sys/zone.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/cmn_err.h> 36 #include <sys/debug.h> 37 #include <sys/atomic.h> 38 39 #include <sys/systm.h> 40 #include <sys/param.h> 41 #include <sys/kmem.h> 42 #include <sys/sdt.h> 43 #include <sys/socket.h> 44 #include <sys/mac.h> 45 #include <net/if.h> 46 #include <net/if_arp.h> 47 #include <net/route.h> 48 #include <sys/sockio.h> 49 #include <netinet/in.h> 50 #include <net/if_dl.h> 51 52 #include <inet/common.h> 53 #include <inet/mi.h> 54 #include <inet/mib2.h> 55 #include <inet/nd.h> 56 #include <inet/arp.h> 57 #include <inet/snmpcom.h> 58 #include <inet/kstatcom.h> 59 60 #include <netinet/igmp_var.h> 61 #include <netinet/ip6.h> 62 #include <netinet/icmp6.h> 63 #include <netinet/sctp.h> 64 65 #include <inet/ip.h> 66 #include <inet/ip_impl.h> 67 #include <inet/ip6.h> 68 #include <inet/ip6_asp.h> 69 #include <inet/tcp.h> 70 #include <inet/ip_multi.h> 71 #include <inet/ip_if.h> 72 #include <inet/ip_ire.h> 73 #include <inet/ip_ftable.h> 74 #include <inet/ip_rts.h> 75 #include <inet/optcom.h> 76 #include <inet/ip_ndp.h> 77 #include <inet/ip_listutils.h> 78 #include <netinet/igmp.h> 79 #include <netinet/ip_mroute.h> 80 #include <inet/ipp_common.h> 81 82 #include <net/pfkeyv2.h> 83 #include <inet/sadb.h> 84 #include <inet/ipsec_impl.h> 85 #include <inet/ipdrop.h> 86 #include <inet/ip_netinfo.h> 87 88 #include <sys/pattr.h> 89 #include <inet/ipclassifier.h> 90 #include <inet/sctp_ip.h> 91 #include <inet/sctp/sctp_impl.h> 92 #include <inet/udp_impl.h> 93 #include <sys/sunddi.h> 94 95 #include <sys/tsol/label.h> 96 #include <sys/tsol/tnet.h> 97 98 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 99 100 #ifdef DEBUG 101 extern boolean_t skip_sctp_cksum; 102 #endif 103 104 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 105 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 106 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); 107 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 108 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 109 110 /* 111 * There are two types of output functions for IP used for different 112 * purposes: 113 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 114 * is no context in the form of a conn_t. However, there is a 115 * ip_xmit_attr_t that the callers use to influence interface selection 116 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 117 * 118 * - conn_ip_output() is used when sending packets with a conn_t and 119 * ip_set_destination has been called to cache information. In that case 120 * various socket options are recorded in the ip_xmit_attr_t and should 121 * be taken into account. 122 */ 123 124 /* 125 * The caller *must* have called conn_connect() or ip_attr_connect() 126 * before calling conn_ip_output(). The caller needs to redo that each time 127 * the destination IP address or port changes, as well as each time there is 128 * a change to any socket option that would modify how packets are routed out 129 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). 130 * 131 * The ULP caller has to serialize the use of a single ip_xmit_attr_t. 132 * We assert for that here. 133 */ 134 int 135 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa) 136 { 137 iaflags_t ixaflags = ixa->ixa_flags; 138 ire_t *ire; 139 nce_t *nce; 140 dce_t *dce; 141 ill_t *ill; 142 ip_stack_t *ipst = ixa->ixa_ipst; 143 int error; 144 145 /* We defer ipIfStatsHCOutRequests until an error or we have an ill */ 146 147 ASSERT(ixa->ixa_ire != NULL); 148 /* Note there is no ixa_nce when reject and blackhole routes */ 149 ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */ 150 151 #ifdef DEBUG 152 ASSERT(ixa->ixa_curthread == NULL); 153 ixa->ixa_curthread = curthread; 154 #endif 155 156 /* 157 * Even on labeled systems we can have a NULL ixa_tsl e.g., 158 * for IGMP/MLD traffic. 159 */ 160 161 ire = ixa->ixa_ire; 162 163 /* 164 * If the ULP says the (old) IRE resulted in reachability we 165 * record this before determine whether to use a new IRE. 166 * No locking for performance reasons. 167 */ 168 if (ixaflags & IXAF_REACH_CONF) 169 ire->ire_badcnt = 0; 170 171 /* 172 * Has routing changed since we cached the results of the lookup? 173 * 174 * This check captures all of: 175 * - the cached ire being deleted (by means of the special 176 * IRE_GENERATION_CONDEMNED) 177 * - A potentially better ire being added (ire_generation being 178 * increased) 179 * - A deletion of the nexthop ire that was used when we did the 180 * lookup. 181 * - An addition of a potentially better nexthop ire. 182 * The last two are handled by walking and increasing the generation 183 * number on all dependant IREs in ire_flush_cache(). 184 * 185 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE 186 * since we ensure that each time we set ixa_ire to such an IRE we 187 * make sure the ixa_ire_generation does not match (by using 188 * IRE_GENERATION_VERIFY). 189 */ 190 if (ire->ire_generation != ixa->ixa_ire_generation) { 191 error = ip_verify_ire(mp, ixa); 192 if (error != 0) { 193 ip_drop_output("ipIfStatsOutDiscards - verify ire", 194 mp, NULL); 195 goto drop; 196 } 197 ire = ixa->ixa_ire; 198 ASSERT(ire != NULL); 199 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 200 #ifdef DEBUG 201 ASSERT(ixa->ixa_curthread == curthread); 202 ixa->ixa_curthread = NULL; 203 #endif 204 ire->ire_ob_pkt_count++; 205 /* ixa_dce might be condemned; use default one */ 206 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, 207 &ipst->ips_dce_default->dce_ident)); 208 } 209 /* 210 * If the ncec changed then ip_verify_ire already set 211 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 212 * so we can recheck the interface mtu. 213 */ 214 215 /* 216 * Note that ire->ire_generation could already have changed. 217 * We catch that next time we send a packet. 218 */ 219 } 220 221 /* 222 * No need to lock access to ixa_nce since the ip_xmit_attr usage 223 * is single threaded. 224 */ 225 ASSERT(ixa->ixa_nce != NULL); 226 nce = ixa->ixa_nce; 227 if (nce->nce_is_condemned) { 228 error = ip_verify_nce(mp, ixa); 229 /* 230 * In case ZEROCOPY capability become not available, we 231 * copy the message and free the original one. We might 232 * be copying more data than needed but it doesn't hurt 233 * since such change rarely happens. 234 */ 235 switch (error) { 236 case 0: 237 break; 238 case ENOTSUP: { /* ZEROCOPY */ 239 mblk_t *nmp; 240 241 if ((nmp = copymsg(mp)) != NULL) { 242 freemsg(mp); 243 mp = nmp; 244 245 break; 246 } 247 /* FALLTHROUGH */ 248 } 249 default: 250 ip_drop_output("ipIfStatsOutDiscards - verify nce", 251 mp, NULL); 252 goto drop; 253 } 254 ire = ixa->ixa_ire; 255 ASSERT(ire != NULL); 256 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 257 #ifdef DEBUG 258 ASSERT(ixa->ixa_curthread == curthread); 259 ixa->ixa_curthread = NULL; 260 #endif 261 ire->ire_ob_pkt_count++; 262 /* ixa_dce might be condemned; use default one */ 263 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, 264 ixa, &ipst->ips_dce_default->dce_ident)); 265 } 266 ASSERT(ixa->ixa_nce != NULL); 267 nce = ixa->ixa_nce; 268 269 /* 270 * Note that some other event could already have made 271 * the new nce condemned. We catch that next time we 272 * try to send a packet. 273 */ 274 } 275 /* 276 * If there is no per-destination dce_t then we have a reference to 277 * the default dce_t (which merely contains the dce_ipid). 278 * The generation check captures both the introduction of a 279 * per-destination dce_t (e.g., due to ICMP packet too big) and 280 * any change to the per-destination dce (including it becoming 281 * condemned by use of the special DCE_GENERATION_CONDEMNED). 282 */ 283 dce = ixa->ixa_dce; 284 285 /* 286 * To avoid a periodic timer to increase the path MTU we 287 * look at dce_last_change_time each time we send a packet. 288 */ 289 if (dce->dce_flags & DCEF_PMTU) { 290 int64_t now = LBOLT_FASTPATH64; 291 292 if ((TICK_TO_SEC(now) - dce->dce_last_change_time > 293 ipst->ips_ip_pathmtu_interval)) { 294 /* 295 * Older than 20 minutes. Drop the path MTU information. 296 * Since the path MTU changes as a result of this, 297 * twiddle ixa_dce_generation to make us go through the 298 * dce verification code in conn_ip_output. 299 */ 300 mutex_enter(&dce->dce_lock); 301 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 302 dce->dce_last_change_time = TICK_TO_SEC(now); 303 mutex_exit(&dce->dce_lock); 304 dce_increment_generation(dce); 305 } 306 } 307 308 if (dce->dce_generation != ixa->ixa_dce_generation) { 309 error = ip_verify_dce(mp, ixa); 310 if (error != 0) { 311 ip_drop_output("ipIfStatsOutDiscards - verify dce", 312 mp, NULL); 313 goto drop; 314 } 315 dce = ixa->ixa_dce; 316 317 /* 318 * Note that some other event could already have made the 319 * new dce's generation number change. 320 * We catch that next time we try to send a packet. 321 */ 322 } 323 324 ill = nce->nce_ill; 325 326 /* 327 * An initial ixa_fragsize was set in ip_set_destination 328 * and we update it if any routing changes above. 329 * A change to ill_mtu with ifconfig will increase all dce_generation 330 * so that we will detect that with the generation check. Ditto for 331 * ill_mc_mtu. 332 */ 333 334 /* 335 * Caller needs to make sure IXAF_VERIFY_SRC is not set if 336 * conn_unspec_src. 337 */ 338 if ((ixaflags & IXAF_VERIFY_SOURCE) && 339 ixa->ixa_src_generation != ipst->ips_src_generation) { 340 /* Check if the IP source is still assigned to the host. */ 341 uint_t gen; 342 343 if (!ip_verify_src(mp, ixa, &gen)) { 344 /* Don't send a packet with a source that isn't ours */ 345 error = EADDRNOTAVAIL; 346 ip_drop_output("ipIfStatsOutDiscards - invalid src", 347 mp, NULL); 348 goto drop; 349 } 350 /* The source is still valid - update the generation number */ 351 ixa->ixa_src_generation = gen; 352 } 353 354 /* 355 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 356 * can only count the use prior to fragmentation. However the MIB 357 * counters on the ill will be incremented in post fragmentation. 358 */ 359 ire->ire_ob_pkt_count++; 360 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 361 362 /* 363 * Based on ire_type and ire_flags call one of: 364 * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK 365 * ire_send_multirt_v* - if RTF_MULTIRT 366 * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE 367 * ire_send_multicast_v* - for IRE_MULTICAST 368 * ire_send_broadcast_v4 - for IRE_BROADCAST 369 * ire_send_wire_v* - for the rest. 370 */ 371 #ifdef DEBUG 372 ASSERT(ixa->ixa_curthread == curthread); 373 ixa->ixa_curthread = NULL; 374 #endif 375 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident)); 376 377 drop: 378 if (ixaflags & IXAF_IS_IPV4) { 379 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 380 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 381 } else { 382 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests); 383 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 384 } 385 freemsg(mp); 386 #ifdef DEBUG 387 ASSERT(ixa->ixa_curthread == curthread); 388 ixa->ixa_curthread = NULL; 389 #endif 390 return (error); 391 } 392 393 /* 394 * Handle both IPv4 and IPv6. Sets the generation number 395 * to allow the caller to know when to call us again. 396 * Returns true if the source address in the packet is a valid source. 397 * We handle callers which try to send with a zero address (since we only 398 * get here if UNSPEC_SRC is not set). 399 */ 400 boolean_t 401 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp) 402 { 403 ip_stack_t *ipst = ixa->ixa_ipst; 404 405 /* 406 * Need to grab the generation number before we check to 407 * avoid a race with a change to the set of local addresses. 408 * No lock needed since the thread which updates the set of local 409 * addresses use ipif/ill locks and exit those (hence a store memory 410 * barrier) before doing the atomic increase of ips_src_generation. 411 */ 412 if (generationp != NULL) 413 *generationp = ipst->ips_src_generation; 414 415 if (ixa->ixa_flags & IXAF_IS_IPV4) { 416 ipha_t *ipha = (ipha_t *)mp->b_rptr; 417 418 if (ipha->ipha_src == INADDR_ANY) 419 return (B_FALSE); 420 421 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 422 ipst, B_FALSE) != IPVL_BAD); 423 } else { 424 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 425 uint_t scopeid; 426 427 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) 428 return (B_FALSE); 429 430 if (ixa->ixa_flags & IXAF_SCOPEID_SET) 431 scopeid = ixa->ixa_scopeid; 432 else 433 scopeid = 0; 434 435 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid, 436 ipst, B_FALSE, scopeid) != IPVL_BAD); 437 } 438 } 439 440 /* 441 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use. 442 */ 443 int 444 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa) 445 { 446 uint_t gen; 447 ire_t *ire; 448 nce_t *nce; 449 int error; 450 boolean_t multirt = B_FALSE; 451 452 /* 453 * Redo ip_select_route. 454 * Need to grab generation number as part of the lookup to 455 * avoid race. 456 */ 457 error = 0; 458 ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt); 459 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 460 if (error != 0) { 461 ire_refrele(ire); 462 return (error); 463 } 464 465 if (ixa->ixa_ire != NULL) 466 ire_refrele_notr(ixa->ixa_ire); 467 #ifdef DEBUG 468 ire_refhold_notr(ire); 469 ire_refrele(ire); 470 #endif 471 ixa->ixa_ire = ire; 472 ixa->ixa_ire_generation = gen; 473 if (multirt) { 474 if (ixa->ixa_flags & IXAF_IS_IPV4) 475 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 476 else 477 ixa->ixa_postfragfn = ip_postfrag_multirt_v6; 478 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 479 } else { 480 ixa->ixa_postfragfn = ire->ire_postfragfn; 481 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 482 } 483 484 /* 485 * Don't look for an nce for reject or blackhole. 486 * They have ire_generation set to IRE_GENERATION_VERIFY which 487 * makes conn_ip_output avoid references to ixa_nce. 488 */ 489 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 490 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY); 491 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 492 return (0); 493 } 494 495 /* The NCE could now be different */ 496 nce = ire_to_nce_pkt(ire, mp); 497 if (nce == NULL) { 498 /* 499 * Allocation failure. Make sure we redo ire/nce selection 500 * next time we send. 501 */ 502 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 503 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 504 return (ENOBUFS); 505 } 506 if (nce == ixa->ixa_nce) { 507 /* No change */ 508 nce_refrele(nce); 509 return (0); 510 } 511 512 /* 513 * Since the path MTU might change as a result of this 514 * route change, we twiddle ixa_dce_generation to 515 * make conn_ip_output go through the ip_verify_dce code. 516 */ 517 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 518 519 if (ixa->ixa_nce != NULL) 520 nce_refrele(ixa->ixa_nce); 521 ixa->ixa_nce = nce; 522 return (0); 523 } 524 525 /* 526 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use. 527 */ 528 static int 529 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa) 530 { 531 ire_t *ire = ixa->ixa_ire; 532 nce_t *nce; 533 int error = 0; 534 ipha_t *ipha = NULL; 535 ip6_t *ip6h = NULL; 536 537 if (ire->ire_ipversion == IPV4_VERSION) 538 ipha = (ipha_t *)mp->b_rptr; 539 else 540 ip6h = (ip6_t *)mp->b_rptr; 541 542 nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE); 543 if (nce == NULL) { 544 /* Try to find a better ire */ 545 return (ip_verify_ire(mp, ixa)); 546 } 547 548 /* 549 * The hardware offloading capabilities, for example LSO, of the 550 * interface might have changed, so do sanity verification here. 551 */ 552 if (ixa->ixa_flags & IXAF_VERIFY_LSO) { 553 if (!ip_verify_lso(nce->nce_ill, ixa)) { 554 ASSERT(ixa->ixa_notify != NULL); 555 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 556 IXAN_LSO, 0); 557 error = ENOTSUP; 558 } 559 } 560 561 /* 562 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with 563 * any ZEROCOPY changes. In case ZEROCOPY capability is not available 564 * any more, return error so that conn_ip_output() can take care of 565 * the ZEROCOPY message properly. It's safe to continue send the 566 * message when ZEROCOPY newly become available. 567 */ 568 if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) { 569 if (!ip_verify_zcopy(nce->nce_ill, ixa)) { 570 ASSERT(ixa->ixa_notify != NULL); 571 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 572 IXAN_ZCOPY, 0); 573 if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0) 574 error = ENOTSUP; 575 } 576 } 577 578 /* 579 * Since the path MTU might change as a result of this 580 * change, we twiddle ixa_dce_generation to 581 * make conn_ip_output go through the ip_verify_dce code. 582 */ 583 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 584 585 nce_refrele(ixa->ixa_nce); 586 ixa->ixa_nce = nce; 587 return (error); 588 } 589 590 /* 591 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use. 592 */ 593 static int 594 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa) 595 { 596 dce_t *dce; 597 uint_t gen; 598 uint_t pmtu; 599 600 dce = dce_lookup_pkt(mp, ixa, &gen); 601 ASSERT(dce != NULL); 602 603 dce_refrele_notr(ixa->ixa_dce); 604 #ifdef DEBUG 605 dce_refhold_notr(dce); 606 dce_refrele(dce); 607 #endif 608 ixa->ixa_dce = dce; 609 ixa->ixa_dce_generation = gen; 610 611 /* Extract the (path) mtu from the dce, ncec_ill etc */ 612 pmtu = ip_get_pmtu(ixa); 613 614 /* 615 * Tell ULP about PMTU changes - increase or decrease - by returning 616 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update 617 * both ixa_pmtu and ixa_fragsize appropriately. 618 * 619 * If ULP doesn't set that flag then we need to update ixa_fragsize 620 * since routing could have changed the ill after after ixa_fragsize 621 * was set previously in the conn_ip_output path or in 622 * ip_set_destination. 623 * 624 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu. 625 * 626 * In the case of a path MTU increase we send the packet after the 627 * notify to the ULP. 628 */ 629 if (ixa->ixa_flags & IXAF_VERIFY_PMTU) { 630 if (ixa->ixa_pmtu != pmtu) { 631 uint_t oldmtu = ixa->ixa_pmtu; 632 633 DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu, 634 uint32_t, ixa->ixa_pmtu); 635 ASSERT(ixa->ixa_notify != NULL); 636 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, 637 IXAN_PMTU, pmtu); 638 if (pmtu < oldmtu) 639 return (EMSGSIZE); 640 } 641 } else { 642 ixa->ixa_fragsize = pmtu; 643 } 644 return (0); 645 } 646 647 /* 648 * Verify LSO usability. Keep the return value simple to indicate whether 649 * the LSO capability has changed. Handle both IPv4 and IPv6. 650 */ 651 static boolean_t 652 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) 653 { 654 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 655 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab; 656 657 if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 658 /* 659 * Not unsable any more. 660 */ 661 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 662 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 663 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 664 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 665 !ILL_LSO_TCP_IPV4_USABLE(ill) : 666 !ILL_LSO_TCP_IPV6_USABLE(ill))) { 667 ixa->ixa_flags &= ~IXAF_LSO_CAPAB; 668 669 return (B_FALSE); 670 } 671 672 /* 673 * Capability has changed, refresh the copy in ixa. 674 */ 675 if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) { 676 *lsoc = *new_lsoc; 677 678 return (B_FALSE); 679 } 680 } else { /* Was not usable */ 681 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 682 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 683 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 684 ((ixa->ixa_flags & IXAF_IS_IPV4) ? 685 ILL_LSO_TCP_IPV4_USABLE(ill) : 686 ILL_LSO_TCP_IPV6_USABLE(ill))) { 687 *lsoc = *new_lsoc; 688 ixa->ixa_flags |= IXAF_LSO_CAPAB; 689 690 return (B_FALSE); 691 } 692 } 693 694 return (B_TRUE); 695 } 696 697 /* 698 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether 699 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6. 700 */ 701 static boolean_t 702 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa) 703 { 704 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) { 705 /* 706 * Not unsable any more. 707 */ 708 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || 709 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || 710 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || 711 !ILL_ZCOPY_USABLE(ill)) { 712 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; 713 714 return (B_FALSE); 715 } 716 } else { /* Was not usable */ 717 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && 718 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 719 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && 720 ILL_ZCOPY_USABLE(ill)) { 721 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; 722 723 return (B_FALSE); 724 } 725 } 726 727 return (B_TRUE); 728 } 729 730 731 /* 732 * When there is no conn_t context, this will send a packet. 733 * The caller must *not* have called conn_connect() or ip_attr_connect() 734 * before calling ip_output_simple(). 735 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH. 736 * Honors IXAF_SET_SOURCE. 737 * 738 * We acquire the ire and after calling ire_sendfn we release 739 * the hold on the ire. Ditto for the nce and dce. 740 * 741 * This assumes that the caller has set the following in ip_xmit_attr_t: 742 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. 743 * If ixa_ifindex is non-zero it means send out that ill. (If it is 744 * an upper IPMP ill we load balance across the group; if a lower we send 745 * on that lower ill without load balancing.) 746 * IXAF_IS_IPV4 must be set correctly. 747 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. 748 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. 749 * If neither of those two are set we do an IPsec policy lookup. 750 * 751 * We handle setting things like 752 * ixa_pktlen 753 * ixa_ip_hdr_length 754 * ixa->ixa_protocol 755 * 756 * The caller may set ixa_xmit_hint, which is used for ECMP selection and 757 * transmit ring selecting in GLD. 758 * 759 * The caller must do an ixa_cleanup() to release any IPsec references 760 * after we return. 761 */ 762 int 763 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) 764 { 765 ts_label_t *effective_tsl = NULL; 766 int err; 767 768 ASSERT(ixa->ixa_ipst != NULL); 769 770 if (is_system_labeled()) { 771 ip_stack_t *ipst = ixa->ixa_ipst; 772 773 if (ixa->ixa_flags & IXAF_IS_IPV4) { 774 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, 775 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 776 &effective_tsl); 777 } else { 778 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, 779 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, 780 &effective_tsl); 781 } 782 if (err != 0) { 783 ip2dbg(("tsol_check: label check failed (%d)\n", err)); 784 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 785 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 786 ip_drop_output("tsol_check_label", mp, NULL); 787 freemsg(mp); 788 return (err); 789 } 790 if (effective_tsl != NULL) { 791 /* Update the label */ 792 ip_xmit_attr_replace_tsl(ixa, effective_tsl); 793 } 794 } 795 796 if (ixa->ixa_flags & IXAF_IS_IPV4) 797 return (ip_output_simple_v4(mp, ixa)); 798 else 799 return (ip_output_simple_v6(mp, ixa)); 800 } 801 802 int 803 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) 804 { 805 ipha_t *ipha; 806 ipaddr_t firsthop; /* In IP header */ 807 ipaddr_t dst; /* End of source route, or ipha_dst if none */ 808 ire_t *ire; 809 ipaddr_t setsrc; /* RTF_SETSRC */ 810 int error; 811 ill_t *ill = NULL; 812 dce_t *dce = NULL; 813 nce_t *nce; 814 iaflags_t ixaflags = ixa->ixa_flags; 815 ip_stack_t *ipst = ixa->ixa_ipst; 816 boolean_t repeat = B_FALSE; 817 boolean_t multirt = B_FALSE; 818 int64_t now; 819 820 ipha = (ipha_t *)mp->b_rptr; 821 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 822 823 /* 824 * Even on labeled systems we can have a NULL ixa_tsl e.g., 825 * for IGMP/MLD traffic. 826 */ 827 828 /* Caller already set flags */ 829 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 830 831 ASSERT(ixa->ixa_nce == NULL); 832 833 ixa->ixa_pktlen = ntohs(ipha->ipha_length); 834 ASSERT(ixa->ixa_pktlen == msgdsize(mp)); 835 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); 836 ixa->ixa_protocol = ipha->ipha_protocol; 837 838 /* 839 * Assumes that source routed packets have already been massaged by 840 * the ULP (ip_massage_options) and as a result ipha_dst is the next 841 * hop in the source route. The final destination is used for IPsec 842 * policy and DCE lookup. 843 */ 844 firsthop = ipha->ipha_dst; 845 dst = ip_get_dst(ipha); 846 847 repeat_ire: 848 error = 0; 849 setsrc = INADDR_ANY; 850 ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL, 851 &setsrc, &error, &multirt); 852 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ 853 if (error != 0) { 854 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 855 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 856 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); 857 freemsg(mp); 858 goto done; 859 } 860 861 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { 862 /* ire_ill might be NULL hence need to skip some code */ 863 if (ixaflags & IXAF_SET_SOURCE) 864 ipha->ipha_src = htonl(INADDR_LOOPBACK); 865 ixa->ixa_fragsize = IP_MAXPACKET; 866 ill = NULL; 867 nce = NULL; 868 ire->ire_ob_pkt_count++; 869 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 870 /* No dce yet; use default one */ 871 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, 872 &ipst->ips_dce_default->dce_ident); 873 goto done; 874 } 875 876 /* Note that ipha_dst is only used for IRE_MULTICAST */ 877 nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 878 if (nce == NULL) { 879 /* Allocation failure? */ 880 ip_drop_output("ire_to_nce", mp, ill); 881 freemsg(mp); 882 error = ENOBUFS; 883 goto done; 884 } 885 if (nce->nce_is_condemned) { 886 nce_t *nce1; 887 888 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); 889 nce_refrele(nce); 890 if (nce1 == NULL) { 891 if (!repeat) { 892 /* Try finding a better IRE */ 893 repeat = B_TRUE; 894 ire_refrele(ire); 895 goto repeat_ire; 896 } 897 /* Tried twice - drop packet */ 898 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 899 ip_drop_output("No nce", mp, ill); 900 freemsg(mp); 901 error = ENOBUFS; 902 goto done; 903 } 904 nce = nce1; 905 } 906 907 /* 908 * For multicast with multirt we have a flag passed back from 909 * ire_lookup_multi_ill_v4 since we don't have an IRE for each 910 * possible multicast address. 911 * We also need a flag for multicast since we can't check 912 * whether RTF_MULTIRT is set in ixa_ire for multicast. 913 */ 914 if (multirt) { 915 ixa->ixa_postfragfn = ip_postfrag_multirt_v4; 916 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; 917 } else { 918 ixa->ixa_postfragfn = ire->ire_postfragfn; 919 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; 920 } 921 ASSERT(ixa->ixa_nce == NULL); 922 ixa->ixa_nce = nce; 923 924 /* 925 * Check for a dce_t with a path mtu. 926 */ 927 dce = dce_lookup_v4(dst, ipst, NULL); 928 ASSERT(dce != NULL); 929 930 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { 931 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 932 } else if (dce->dce_flags & DCEF_PMTU) { 933 /* 934 * To avoid a periodic timer to increase the path MTU we 935 * look at dce_last_change_time each time we send a packet. 936 */ 937 now = ddi_get_lbolt64(); 938 if (TICK_TO_SEC(now) - dce->dce_last_change_time > 939 ipst->ips_ip_pathmtu_interval) { 940 /* 941 * Older than 20 minutes. Drop the path MTU information. 942 */ 943 mutex_enter(&dce->dce_lock); 944 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); 945 dce->dce_last_change_time = TICK_TO_SEC(now); 946 mutex_exit(&dce->dce_lock); 947 dce_increment_generation(dce); 948 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 949 } else { 950 uint_t fragsize; 951 952 fragsize = ip_get_base_mtu(nce->nce_ill, ire); 953 if (fragsize > dce->dce_pmtu) 954 fragsize = dce->dce_pmtu; 955 ixa->ixa_fragsize = fragsize; 956 } 957 } else { 958 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); 959 } 960 961 /* 962 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp 963 * interface for source address selection. 964 */ 965 ill = ire_nexthop_ill(ire); 966 967 if (ixaflags & IXAF_SET_SOURCE) { 968 ipaddr_t src; 969 970 /* 971 * We use the final destination to get 972 * correct selection for source routed packets 973 */ 974 975 /* If unreachable we have no ill but need some source */ 976 if (ill == NULL) { 977 src = htonl(INADDR_LOOPBACK); 978 error = 0; 979 } else { 980 error = ip_select_source_v4(ill, setsrc, dst, 981 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, 982 &src, NULL, NULL); 983 } 984 if (error != 0) { 985 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 986 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 987 ip_drop_output("ipIfStatsOutDiscards - no source", 988 mp, ill); 989 freemsg(mp); 990 goto done; 991 } 992 ipha->ipha_src = src; 993 } else if (ixaflags & IXAF_VERIFY_SOURCE) { 994 /* Check if the IP source is assigned to the host. */ 995 if (!ip_verify_src(mp, ixa, NULL)) { 996 /* Don't send a packet with a source that isn't ours */ 997 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 998 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 999 ip_drop_output("ipIfStatsOutDiscards - invalid source", 1000 mp, ill); 1001 freemsg(mp); 1002 error = EADDRNOTAVAIL; 1003 goto done; 1004 } 1005 } 1006 1007 1008 /* 1009 * Check against global IPsec policy to set the AH/ESP attributes. 1010 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 1011 */ 1012 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1013 ASSERT(ixa->ixa_ipsec_policy == NULL); 1014 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); 1015 if (mp == NULL) { 1016 /* MIB and ip_drop_packet already done */ 1017 return (EHOSTUNREACH); /* IPsec policy failure */ 1018 } 1019 } 1020 1021 if (ill != NULL) { 1022 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 1023 } else { 1024 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); 1025 } 1026 1027 /* 1028 * We update the statistics on the most specific IRE i.e., the first 1029 * one we found. 1030 * We don't have an IRE when we fragment, hence ire_ob_pkt_count 1031 * can only count the use prior to fragmentation. However the MIB 1032 * counters on the ill will be incremented in post fragmentation. 1033 */ 1034 ire->ire_ob_pkt_count++; 1035 1036 /* 1037 * Based on ire_type and ire_flags call one of: 1038 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK 1039 * ire_send_multirt_v4 - if RTF_MULTIRT 1040 * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 1041 * ire_send_multicast_v4 - for IRE_MULTICAST 1042 * ire_send_broadcast_v4 - for IRE_BROADCAST 1043 * ire_send_wire_v4 - for the rest. 1044 */ 1045 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); 1046 done: 1047 ire_refrele(ire); 1048 if (dce != NULL) 1049 dce_refrele(dce); 1050 if (ill != NULL) 1051 ill_refrele(ill); 1052 if (ixa->ixa_nce != NULL) 1053 nce_refrele(ixa->ixa_nce); 1054 ixa->ixa_nce = NULL; 1055 return (error); 1056 } 1057 1058 /* 1059 * ire_sendfn() functions. 1060 * These functions use the following xmit_attr: 1061 * - ixa_fragsize - read to determine whether or not to fragment 1062 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec 1063 * - ixa_ipsec_* are used inside IPsec 1064 * - IXAF_SET_SOURCE - replace IP source in broadcast case. 1065 * - IXAF_LOOPBACK_COPY - for multicast and broadcast 1066 */ 1067 1068 1069 /* 1070 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK 1071 * 1072 * The checks for restrict_interzone_loopback are done in ire_route_recursive. 1073 */ 1074 /* ARGSUSED4 */ 1075 int 1076 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1077 ip_xmit_attr_t *ixa, uint32_t *identp) 1078 { 1079 ipha_t *ipha = (ipha_t *)iph_arg; 1080 ip_stack_t *ipst = ixa->ixa_ipst; 1081 ill_t *ill = ire->ire_ill; 1082 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 1083 uint_t pktlen = ixa->ixa_pktlen; 1084 1085 /* 1086 * No fragmentation, no nce, no application of IPsec, 1087 * and no ipha_ident assignment. 1088 * 1089 * Note different order between IP provider and FW_HOOKS than in 1090 * send_wire case. 1091 */ 1092 1093 /* 1094 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1095 * send probe, but not the receive probe. 1096 */ 1097 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1098 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1099 int, 1); 1100 1101 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1102 int error; 1103 1104 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1105 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1106 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1107 ipst->ips_ipv4firewall_loopback_out, 1108 NULL, ill, ipha, mp, mp, 0, ipst, error); 1109 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1110 if (mp == NULL) 1111 return (error); 1112 1113 /* 1114 * Even if the destination was changed by the filter we use the 1115 * forwarding decision that was made based on the address 1116 * in ip_output/ip_set_destination. 1117 */ 1118 /* Length could be different */ 1119 ipha = (ipha_t *)mp->b_rptr; 1120 pktlen = ntohs(ipha->ipha_length); 1121 } 1122 1123 /* 1124 * If a callback is enabled then we need to know the 1125 * source and destination zoneids for the packet. We already 1126 * have those handy. 1127 */ 1128 if (ipst->ips_ip4_observe.he_interested) { 1129 zoneid_t szone, dzone; 1130 zoneid_t stackzoneid; 1131 1132 stackzoneid = netstackid_to_zoneid( 1133 ipst->ips_netstack->netstack_stackid); 1134 1135 if (stackzoneid == GLOBAL_ZONEID) { 1136 /* Shared-IP zone */ 1137 dzone = ire->ire_zoneid; 1138 szone = ixa->ixa_zoneid; 1139 } else { 1140 szone = dzone = stackzoneid; 1141 } 1142 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1143 } 1144 1145 /* Handle lo0 stats */ 1146 ipst->ips_loopback_packets++; 1147 1148 /* Map ixa to ira including IPsec policies */ 1149 ipsec_out_to_in(ixa, ill, &iras); 1150 iras.ira_pktlen = pktlen; 1151 1152 if (!IS_SIMPLE_IPH(ipha)) { 1153 ip_output_local_options(ipha, ipst); 1154 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1155 } 1156 1157 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1158 int error; 1159 1160 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1161 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1162 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1163 ipst->ips_ipv4firewall_loopback_in, 1164 ill, NULL, ipha, mp, mp, 0, ipst, error); 1165 1166 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1167 if (mp == NULL) { 1168 ira_cleanup(&iras, B_FALSE); 1169 return (error); 1170 } 1171 /* 1172 * Even if the destination was changed by the filter we use the 1173 * forwarding decision that was made based on the address 1174 * in ip_output/ip_set_destination. 1175 */ 1176 /* Length could be different */ 1177 ipha = (ipha_t *)mp->b_rptr; 1178 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1179 } 1180 1181 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1182 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1183 int, 1); 1184 1185 ire->ire_ib_pkt_count++; 1186 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1187 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1188 1189 /* Destined to ire_zoneid - use that for fanout */ 1190 iras.ira_zoneid = ire->ire_zoneid; 1191 1192 if (is_system_labeled()) { 1193 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1194 1195 /* 1196 * This updates ira_cred, ira_tsl and ira_free_flags based 1197 * on the label. We don't expect this to ever fail for 1198 * loopback packets, so we silently drop the packet should it 1199 * fail. 1200 */ 1201 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1202 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1203 ip_drop_input("tsol_get_pkt_label", mp, ill); 1204 freemsg(mp); 1205 return (0); 1206 } 1207 ASSERT(iras.ira_tsl != NULL); 1208 1209 /* tsol_get_pkt_label sometimes does pullupmsg */ 1210 ipha = (ipha_t *)mp->b_rptr; 1211 } 1212 1213 ip_fanout_v4(mp, ipha, &iras); 1214 1215 /* We moved any IPsec refs from ixa to iras */ 1216 ira_cleanup(&iras, B_FALSE); 1217 return (0); 1218 } 1219 1220 /* 1221 * ire_sendfn for IRE_BROADCAST 1222 * If the broadcast address is present on multiple ills and ixa_ifindex 1223 * isn't set, then we generate 1224 * a separate datagram (potentially with different source address) for 1225 * those ills. In any case, only one copy is looped back to ip_input_v4. 1226 */ 1227 int 1228 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1229 ip_xmit_attr_t *ixa, uint32_t *identp) 1230 { 1231 ipha_t *ipha = (ipha_t *)iph_arg; 1232 ip_stack_t *ipst = ixa->ixa_ipst; 1233 irb_t *irb = ire->ire_bucket; 1234 ire_t *ire1; 1235 mblk_t *mp1; 1236 ipha_t *ipha1; 1237 iaflags_t ixaflags = ixa->ixa_flags; 1238 nce_t *nce1, *nce_orig; 1239 1240 /* 1241 * Unless ire_send_multirt_v4 already set a ttl, force the 1242 * ttl to a smallish value. 1243 */ 1244 if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { 1245 /* 1246 * To avoid broadcast storms, we usually set the TTL to 1 for 1247 * broadcasts. This can 1248 * be overridden stack-wide through the ip_broadcast_ttl 1249 * ndd tunable, or on a per-connection basis through the 1250 * IP_BROADCAST_TTL socket option. 1251 * 1252 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 1253 * will force ttl to one after we've set this. 1254 */ 1255 if (ixaflags & IXAF_BROADCAST_TTL_SET) 1256 ipha->ipha_ttl = ixa->ixa_broadcast_ttl; 1257 else 1258 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 1259 } 1260 /* 1261 * Make sure we get a loopback copy (after IPsec and frag) 1262 * Skip hardware checksum so that loopback copy is checksumed. 1263 */ 1264 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1265 1266 /* Do we need to potentially generate multiple copies? */ 1267 if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) 1268 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1269 1270 /* 1271 * Loop over all IRE_BROADCAST in the bucket (might only be one). 1272 * Note that everything in the bucket has the same destination address. 1273 */ 1274 irb_refhold(irb); 1275 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1276 /* We do the main IRE after the end of the loop */ 1277 if (ire1 == ire) 1278 continue; 1279 1280 /* 1281 * Only IREs for the same IP address should be in the same 1282 * bucket. 1283 * But could have IRE_HOSTs in the case of CGTP. 1284 * If we find any multirt routes we bail out of the loop 1285 * and just do the single packet at the end; ip_postfrag_multirt 1286 * will duplicate the packet. 1287 */ 1288 ASSERT(ire1->ire_addr == ire->ire_addr); 1289 if (!(ire1->ire_type & IRE_BROADCAST)) 1290 continue; 1291 1292 if (IRE_IS_CONDEMNED(ire1)) 1293 continue; 1294 1295 if (ixa->ixa_zoneid != ALL_ZONES && 1296 ire->ire_zoneid != ire1->ire_zoneid) 1297 continue; 1298 1299 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); 1300 1301 if (ire1->ire_flags & RTF_MULTIRT) 1302 break; 1303 1304 /* 1305 * For IPMP we only send for the ipmp_ill. arp_nce_init() will 1306 * ensure that this goes out on the cast_ill. 1307 */ 1308 if (IS_UNDER_IPMP(ire1->ire_ill)) 1309 continue; 1310 1311 mp1 = copymsg(mp); 1312 if (mp1 == NULL) { 1313 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1314 ipIfStatsOutDiscards); 1315 ip_drop_output("ipIfStatsOutDiscards", 1316 mp, ire1->ire_ill); 1317 continue; 1318 } 1319 1320 ipha1 = (ipha_t *)mp1->b_rptr; 1321 if (ixa->ixa_flags & IXAF_SET_SOURCE) { 1322 /* 1323 * Need to pick a different source address for each 1324 * interface. If we have a global IPsec policy and 1325 * no per-socket policy then we punt to 1326 * ip_output_simple_v4 using a separate ip_xmit_attr_t. 1327 */ 1328 if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { 1329 ip_output_simple_broadcast(ixa, mp1); 1330 continue; 1331 } 1332 /* Pick a new source address for each interface */ 1333 if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, 1334 ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, 1335 &ipha1->ipha_src, NULL, NULL) != 0) { 1336 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1337 ipIfStatsOutDiscards); 1338 ip_drop_output("ipIfStatsOutDiscards - select " 1339 "broadcast source", mp1, ire1->ire_ill); 1340 freemsg(mp1); 1341 continue; 1342 } 1343 /* 1344 * Check against global IPsec policy to set the AH/ESP 1345 * attributes. IPsec will set IXAF_IPSEC_* and 1346 * ixa_ipsec_* as appropriate. 1347 */ 1348 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { 1349 ASSERT(ixa->ixa_ipsec_policy == NULL); 1350 mp1 = ip_output_attach_policy(mp1, ipha, NULL, 1351 NULL, ixa); 1352 if (mp1 == NULL) { 1353 /* 1354 * MIB and ip_drop_packet already 1355 * done 1356 */ 1357 continue; 1358 } 1359 } 1360 } 1361 /* Make sure we have an NCE on this ill */ 1362 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, 1363 ire1->ire_type); 1364 if (nce1 == NULL) { 1365 BUMP_MIB(ire1->ire_ill->ill_ip_mib, 1366 ipIfStatsOutDiscards); 1367 ip_drop_output("ipIfStatsOutDiscards - broadcast nce", 1368 mp1, ire1->ire_ill); 1369 freemsg(mp1); 1370 continue; 1371 } 1372 nce_orig = ixa->ixa_nce; 1373 ixa->ixa_nce = nce1; 1374 1375 ire_refhold(ire1); 1376 /* 1377 * Ignore any errors here. We just collect the errno for 1378 * the main ire below 1379 */ 1380 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); 1381 ire_refrele(ire1); 1382 1383 ixa->ixa_nce = nce_orig; 1384 nce_refrele(nce1); 1385 1386 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; 1387 } 1388 irb_refrele(irb); 1389 /* Finally, the main one */ 1390 1391 /* 1392 * For IPMP we only send broadcasts on the ipmp_ill. 1393 */ 1394 if (IS_UNDER_IPMP(ire->ire_ill)) { 1395 freemsg(mp); 1396 return (0); 1397 } 1398 1399 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1400 } 1401 1402 /* 1403 * Send a packet using a different source address and different 1404 * IPsec policy. 1405 */ 1406 static void 1407 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) 1408 { 1409 ip_xmit_attr_t ixas; 1410 1411 bzero(&ixas, sizeof (ixas)); 1412 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 1413 ixas.ixa_zoneid = ixa->ixa_zoneid; 1414 ixas.ixa_ifindex = 0; 1415 ixas.ixa_ipst = ixa->ixa_ipst; 1416 ixas.ixa_cred = ixa->ixa_cred; 1417 ixas.ixa_cpid = ixa->ixa_cpid; 1418 ixas.ixa_tsl = ixa->ixa_tsl; 1419 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1420 1421 (void) ip_output_simple(mp, &ixas); 1422 ixa_cleanup(&ixas); 1423 } 1424 1425 1426 static void 1427 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) 1428 { 1429 ip_stack_t *ipst = ixa->ixa_ipst; 1430 1431 /* Limit the TTL on multirt packets */ 1432 if (ire->ire_type & IRE_MULTICAST) { 1433 if (ipha->ipha_ttl > 1) { 1434 ip2dbg(("ire_send_multirt_v4: forcing multicast " 1435 "multirt TTL to 1 (was %d), dst 0x%08x\n", 1436 ipha->ipha_ttl, ntohl(ire->ire_addr))); 1437 ipha->ipha_ttl = 1; 1438 } 1439 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1440 } else if ((ipst->ips_ip_multirt_ttl > 0) && 1441 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { 1442 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; 1443 /* 1444 * Need to ensure we don't increase the ttl should we go through 1445 * ire_send_broadcast or multicast. 1446 */ 1447 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 1448 } 1449 } 1450 1451 /* 1452 * ire_sendfn for IRE_MULTICAST 1453 */ 1454 int 1455 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1456 ip_xmit_attr_t *ixa, uint32_t *identp) 1457 { 1458 ipha_t *ipha = (ipha_t *)iph_arg; 1459 ip_stack_t *ipst = ixa->ixa_ipst; 1460 ill_t *ill = ire->ire_ill; 1461 iaflags_t ixaflags = ixa->ixa_flags; 1462 1463 /* 1464 * The IRE_MULTICAST is the same whether or not multirt is in use. 1465 * Hence we need special-case code. 1466 */ 1467 if (ixaflags & IXAF_MULTIRT_MULTICAST) 1468 multirt_check_v4(ire, ipha, ixa); 1469 1470 /* 1471 * Check if anything in ip_input_v4 wants a copy of the transmitted 1472 * packet (after IPsec and fragmentation) 1473 * 1474 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set 1475 * RSVP and the rsvp daemon is an example of a 1476 * protocol and user level process that 1477 * handles it's own routing. Hence, it uses the 1478 * SO_DONTROUTE option to accomplish this. 1479 * 2. If the sender has set IP_MULTICAST_LOOP, then we just 1480 * check whether there are any receivers for the group on the ill 1481 * (ignoring the zoneid). 1482 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are 1483 * any members in other shared-IP zones. 1484 * If such members exist, then we indicate that the sending zone 1485 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP 1486 * behavior. 1487 * 1488 * When we loopback we skip hardware checksum to make sure loopback 1489 * copy is checksumed. 1490 * 1491 * Note that ire_ill is the upper in the case of IPMP. 1492 */ 1493 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); 1494 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && 1495 !(ixaflags & IXAF_DONTROUTE)) { 1496 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1497 } else if (ixaflags & IXAF_MULTICAST_LOOP) { 1498 /* 1499 * If this zone or any other zone has members then loopback 1500 * a copy. 1501 */ 1502 if (ill_hasmembers_v4(ill, ipha->ipha_dst)) 1503 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1504 } else if (ipst->ips_netstack->netstack_numzones > 1) { 1505 /* 1506 * This zone should not have a copy. But there are some other 1507 * zones which might have members. 1508 */ 1509 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1510 ixa->ixa_zoneid)) { 1511 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; 1512 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; 1513 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; 1514 } 1515 } 1516 1517 /* 1518 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, 1519 * force the ttl to the IP_MULTICAST_TTL value 1520 */ 1521 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { 1522 ipha->ipha_ttl = ixa->ixa_multicast_ttl; 1523 } 1524 1525 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1526 } 1527 1528 /* 1529 * ire_sendfn for IREs with RTF_MULTIRT 1530 */ 1531 int 1532 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1533 ip_xmit_attr_t *ixa, uint32_t *identp) 1534 { 1535 ipha_t *ipha = (ipha_t *)iph_arg; 1536 1537 multirt_check_v4(ire, ipha, ixa); 1538 1539 if (ire->ire_type & IRE_MULTICAST) 1540 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); 1541 else if (ire->ire_type & IRE_BROADCAST) 1542 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); 1543 else 1544 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); 1545 } 1546 1547 /* 1548 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE 1549 */ 1550 int 1551 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1552 ip_xmit_attr_t *ixa, uint32_t *identp) 1553 { 1554 ip_stack_t *ipst = ixa->ixa_ipst; 1555 ipha_t *ipha = (ipha_t *)iph_arg; 1556 ill_t *ill; 1557 ip_recv_attr_t iras; 1558 boolean_t dummy; 1559 1560 /* We assign an IP ident for nice errors */ 1561 ipha->ipha_ident = atomic_inc_32_nv(identp); 1562 1563 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); 1564 1565 if (ire->ire_type & IRE_NOROUTE) { 1566 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ 1567 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, 1568 RTA_DST, ipst); 1569 } 1570 1571 if (ire->ire_flags & RTF_BLACKHOLE) { 1572 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); 1573 freemsg(mp); 1574 /* No error even for local senders - silent blackhole */ 1575 return (0); 1576 } 1577 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); 1578 1579 /* 1580 * We need an ill_t for the ip_recv_attr_t even though this packet 1581 * was never received and icmp_unreachable doesn't currently use 1582 * ira_ill. 1583 */ 1584 ill = ill_lookup_on_name("lo0", B_FALSE, 1585 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); 1586 if (ill == NULL) { 1587 freemsg(mp); 1588 return (EHOSTUNREACH); 1589 } 1590 1591 bzero(&iras, sizeof (iras)); 1592 /* Map ixa to ira including IPsec policies */ 1593 ipsec_out_to_in(ixa, ill, &iras); 1594 1595 if (ip_source_routed(ipha, ipst)) { 1596 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); 1597 } else { 1598 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 1599 } 1600 /* We moved any IPsec refs from ixa to iras */ 1601 ira_cleanup(&iras, B_FALSE); 1602 ill_refrele(ill); 1603 return (EHOSTUNREACH); 1604 } 1605 1606 /* 1607 * Calculate a checksum ignoring any hardware capabilities 1608 * 1609 * Returns B_FALSE if the packet was too short for the checksum. Caller 1610 * should free and do stats. 1611 */ 1612 static boolean_t 1613 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa) 1614 { 1615 ip_stack_t *ipst = ixa->ixa_ipst; 1616 uint_t pktlen = ixa->ixa_pktlen; 1617 uint16_t *cksump; 1618 uint32_t cksum; 1619 uint8_t protocol = ixa->ixa_protocol; 1620 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1621 ipaddr_t dst = ipha->ipha_dst; 1622 ipaddr_t src = ipha->ipha_src; 1623 1624 /* Just in case it contained garbage */ 1625 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1626 1627 /* 1628 * Calculate ULP checksum 1629 */ 1630 if (protocol == IPPROTO_TCP) { 1631 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1632 cksum = IP_TCP_CSUM_COMP; 1633 } else if (protocol == IPPROTO_UDP) { 1634 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1635 cksum = IP_UDP_CSUM_COMP; 1636 } else if (protocol == IPPROTO_SCTP) { 1637 sctp_hdr_t *sctph; 1638 1639 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1640 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1641 /* 1642 * Zero out the checksum field to ensure proper 1643 * checksum calculation. 1644 */ 1645 sctph->sh_chksum = 0; 1646 #ifdef DEBUG 1647 if (!skip_sctp_cksum) 1648 #endif 1649 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1650 goto ip_hdr_cksum; 1651 } else { 1652 goto ip_hdr_cksum; 1653 } 1654 1655 /* ULP puts the checksum field is in the first mblk */ 1656 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1657 1658 /* 1659 * We accumulate the pseudo header checksum in cksum. 1660 * This is pretty hairy code, so watch close. One 1661 * thing to keep in mind is that UDP and TCP have 1662 * stored their respective datagram lengths in their 1663 * checksum fields. This lines things up real nice. 1664 */ 1665 cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 1666 1667 cksum = IP_CSUM(mp, ip_hdr_length, cksum); 1668 /* 1669 * For UDP/IPv4 a zero means that the packets wasn't checksummed. 1670 * Change to 0xffff 1671 */ 1672 if (protocol == IPPROTO_UDP && cksum == 0) 1673 *cksump = ~cksum; 1674 else 1675 *cksump = cksum; 1676 1677 IP_STAT(ipst, ip_out_sw_cksum); 1678 IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen); 1679 1680 ip_hdr_cksum: 1681 /* Calculate IPv4 header checksum */ 1682 ipha->ipha_hdr_checksum = 0; 1683 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1684 return (B_TRUE); 1685 } 1686 1687 /* 1688 * Calculate the ULP checksum - try to use hardware. 1689 * In the case of MULTIRT, broadcast or multicast the 1690 * IXAF_NO_HW_CKSUM is set in which case we use software. 1691 * 1692 * If the hardware supports IP header checksum offload; then clear the 1693 * contents of IP header checksum field as expected by NIC. 1694 * Do this only if we offloaded either full or partial sum. 1695 * 1696 * Returns B_FALSE if the packet was too short for the checksum. Caller 1697 * should free and do stats. 1698 */ 1699 static boolean_t 1700 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, 1701 ip_xmit_attr_t *ixa, ill_t *ill) 1702 { 1703 uint_t pktlen = ixa->ixa_pktlen; 1704 uint16_t *cksump; 1705 uint16_t hck_flags; 1706 uint32_t cksum; 1707 uint8_t protocol = ixa->ixa_protocol; 1708 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; 1709 1710 if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || 1711 !dohwcksum) { 1712 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1713 } 1714 1715 /* 1716 * Calculate ULP checksum. Note that we don't use cksump and cksum 1717 * if the ill has FULL support. 1718 */ 1719 if (protocol == IPPROTO_TCP) { 1720 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); 1721 cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ 1722 } else if (protocol == IPPROTO_UDP) { 1723 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); 1724 cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ 1725 } else if (protocol == IPPROTO_SCTP) { 1726 sctp_hdr_t *sctph; 1727 1728 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); 1729 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); 1730 /* 1731 * Zero out the checksum field to ensure proper 1732 * checksum calculation. 1733 */ 1734 sctph->sh_chksum = 0; 1735 #ifdef DEBUG 1736 if (!skip_sctp_cksum) 1737 #endif 1738 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); 1739 goto ip_hdr_cksum; 1740 } else { 1741 ip_hdr_cksum: 1742 /* Calculate IPv4 header checksum */ 1743 ipha->ipha_hdr_checksum = 0; 1744 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1745 return (B_TRUE); 1746 } 1747 1748 /* ULP puts the checksum field is in the first mblk */ 1749 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); 1750 1751 /* 1752 * Underlying interface supports hardware checksum offload for 1753 * the payload; leave the payload checksum for the hardware to 1754 * calculate. N.B: We only need to set up checksum info on the 1755 * first mblk. 1756 */ 1757 hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; 1758 1759 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; 1760 if (hck_flags & HCKSUM_INET_FULL_V4) { 1761 /* 1762 * Hardware calculates pseudo-header, header and the 1763 * payload checksums, so clear the checksum field in 1764 * the protocol header. 1765 */ 1766 *cksump = 0; 1767 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; 1768 1769 ipha->ipha_hdr_checksum = 0; 1770 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1771 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1772 } else { 1773 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1774 } 1775 return (B_TRUE); 1776 } 1777 if ((hck_flags) & HCKSUM_INET_PARTIAL) { 1778 ipaddr_t dst = ipha->ipha_dst; 1779 ipaddr_t src = ipha->ipha_src; 1780 /* 1781 * Partial checksum offload has been enabled. Fill 1782 * the checksum field in the protocol header with the 1783 * pseudo-header checksum value. 1784 * 1785 * We accumulate the pseudo header checksum in cksum. 1786 * This is pretty hairy code, so watch close. One 1787 * thing to keep in mind is that UDP and TCP have 1788 * stored their respective datagram lengths in their 1789 * checksum fields. This lines things up real nice. 1790 */ 1791 cksum += (dst >> 16) + (dst & 0xFFFF) + 1792 (src >> 16) + (src & 0xFFFF); 1793 cksum += *(cksump); 1794 cksum = (cksum & 0xFFFF) + (cksum >> 16); 1795 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); 1796 1797 /* 1798 * Offsets are relative to beginning of IP header. 1799 */ 1800 DB_CKSUMSTART(mp) = ip_hdr_length; 1801 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha; 1802 DB_CKSUMEND(mp) = pktlen; 1803 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; 1804 1805 ipha->ipha_hdr_checksum = 0; 1806 if (hck_flags & HCKSUM_IPHDRCKSUM) { 1807 DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; 1808 } else { 1809 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1810 } 1811 return (B_TRUE); 1812 } 1813 /* Hardware capabilities include neither full nor partial IPv4 */ 1814 return (ip_output_sw_cksum_v4(mp, ipha, ixa)); 1815 } 1816 1817 /* 1818 * ire_sendfn for offlink and onlink destinations. 1819 * Also called from the multicast, broadcast, multirt send functions. 1820 * 1821 * Assumes that the caller has a hold on the ire. 1822 * 1823 * This function doesn't care if the IRE just became condemned since that 1824 * can happen at any time. 1825 */ 1826 /* ARGSUSED */ 1827 int 1828 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1829 ip_xmit_attr_t *ixa, uint32_t *identp) 1830 { 1831 ip_stack_t *ipst = ixa->ixa_ipst; 1832 ipha_t *ipha = (ipha_t *)iph_arg; 1833 iaflags_t ixaflags = ixa->ixa_flags; 1834 ill_t *ill; 1835 1836 ASSERT(ixa->ixa_nce != NULL); 1837 ill = ixa->ixa_nce->nce_ill; 1838 1839 /* 1840 * This package comes from ipf, we have already been here once and 1841 * all work is already done. Go to send directly. Especially 1842 * - don't change ip header (ipha_ident, ipha_ttl), it's already set, 1843 * and in case of no cksum offload, the cksum would become invalid 1844 * - don't touch cksums, they are already prepared 1845 * - don't check for fragmentation, ixa_fragsize for LSO is lost 1846 * on the way and the check would fail in case of LSO 1847 */ 1848 if (ixaflags & IXAF_NO_PFHOOK) 1849 goto sendit; 1850 1851 if (ixaflags & IXAF_DONTROUTE) 1852 ipha->ipha_ttl = 1; 1853 1854 /* 1855 * Assign an ident value for this packet. There could be other 1856 * threads targeting the same destination, so we have to arrange 1857 * for a atomic increment. Note that we use a 32-bit atomic add 1858 * because it has better performance than its 16-bit sibling. 1859 * 1860 * Normally ixa_extra_ident is 0, but in the case of LSO it will 1861 * be the number of TCP segments that the driver/hardware will 1862 * extraly construct. 1863 * 1864 * If running in cluster mode and if the source address 1865 * belongs to a replicated service then vector through 1866 * cl_inet_ipident vector to allocate ip identifier 1867 * NOTE: This is a contract private interface with the 1868 * clustering group. 1869 */ 1870 if (cl_inet_ipident != NULL) { 1871 ipaddr_t src = ipha->ipha_src; 1872 ipaddr_t dst = ipha->ipha_dst; 1873 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; 1874 1875 ASSERT(cl_inet_isclusterwide != NULL); 1876 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, 1877 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { 1878 /* 1879 * Note: not correct with LSO since we can't allocate 1880 * ixa_extra_ident+1 consecutive values. 1881 */ 1882 ipha->ipha_ident = (*cl_inet_ipident)(stack_id, 1883 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, 1884 (uint8_t *)(uintptr_t)dst, NULL); 1885 } else { 1886 ipha->ipha_ident = atomic_add_32_nv(identp, 1887 ixa->ixa_extra_ident + 1); 1888 } 1889 } else { 1890 ipha->ipha_ident = atomic_add_32_nv(identp, 1891 ixa->ixa_extra_ident + 1); 1892 } 1893 #ifndef _BIG_ENDIAN 1894 ipha->ipha_ident = htons(ipha->ipha_ident); 1895 #endif 1896 1897 /* 1898 * This might set b_band, thus the IPsec and fragmentation 1899 * code in IP ensures that b_band is updated in the first mblk. 1900 */ 1901 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 1902 /* ip_process translates an IS_UNDER_IPMP */ 1903 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); 1904 if (mp == NULL) { 1905 /* ip_drop_packet and MIB done */ 1906 return (0); /* Might just be delayed */ 1907 } 1908 } 1909 1910 /* 1911 * Verify any IPv4 options. 1912 * 1913 * The presense of IP options also forces the network stack to 1914 * calculate the checksum in software. This is because: 1915 * 1916 * Wrap around: certain partial-checksum NICs (eri, ce) limit 1917 * the size of "start offset" width to 6-bit. This effectively 1918 * sets the largest value of the offset to 64-bytes, starting 1919 * from the MAC header. When the cumulative MAC and IP headers 1920 * exceed such limit, the offset will wrap around. This causes 1921 * the checksum to be calculated at the wrong place. 1922 * 1923 * IPv4 source routing: none of the full-checksum capable NICs 1924 * is capable of correctly handling the IPv4 source-routing 1925 * option for purposes of calculating the pseudo-header; the 1926 * actual destination is different from the destination in the 1927 * header which is that of the next-hop. (This case may not be 1928 * true for NICs which can parse IPv6 extension headers, but 1929 * we choose to simplify the implementation by not offloading 1930 * checksum when they are present.) 1931 */ 1932 if (!IS_SIMPLE_IPH(ipha)) { 1933 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM; 1934 /* An IS_UNDER_IPMP ill is ok here */ 1935 if (ip_output_options(mp, ipha, ixa, ill)) { 1936 /* Packet has been consumed and ICMP error sent */ 1937 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1938 return (EINVAL); 1939 } 1940 } 1941 1942 /* 1943 * To handle IPsec/iptun's labeling needs we need to tag packets 1944 * while we still have ixa_tsl 1945 */ 1946 if (is_system_labeled() && ixa->ixa_tsl != NULL && 1947 (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || 1948 ill->ill_mactype == DL_IPV6)) { 1949 cred_t *newcr; 1950 1951 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, 1952 KM_NOSLEEP); 1953 if (newcr == NULL) { 1954 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1955 ip_drop_output("ipIfStatsOutDiscards - newcr", 1956 mp, ill); 1957 freemsg(mp); 1958 return (ENOBUFS); 1959 } 1960 mblk_setcred(mp, newcr, NOPID); 1961 crfree(newcr); /* mblk_setcred did its own crhold */ 1962 } 1963 1964 if (ixa->ixa_pktlen > ixa->ixa_fragsize || 1965 (ixaflags & IXAF_IPSEC_SECURE)) { 1966 uint32_t pktlen; 1967 1968 pktlen = ixa->ixa_pktlen; 1969 if (ixaflags & IXAF_IPSEC_SECURE) 1970 pktlen += ipsec_out_extra_length(ixa); 1971 1972 if (pktlen > IP_MAXPACKET) 1973 return (EMSGSIZE); 1974 1975 if (ixaflags & IXAF_SET_ULP_CKSUM) { 1976 /* 1977 * Compute ULP checksum and IP header checksum 1978 * using software 1979 */ 1980 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) { 1981 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 1982 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 1983 freemsg(mp); 1984 return (EINVAL); 1985 } 1986 } else { 1987 /* Calculate IPv4 header checksum */ 1988 ipha->ipha_hdr_checksum = 0; 1989 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1990 } 1991 1992 /* 1993 * If this packet would generate a icmp_frag_needed 1994 * message, we need to handle it before we do the IPsec 1995 * processing. Otherwise, we need to strip the IPsec 1996 * headers before we send up the message to the ULPs 1997 * which becomes messy and difficult. 1998 * 1999 * We check using IXAF_DONTFRAG. The DF bit in the header 2000 * is not inspected - it will be copied to any generated 2001 * fragments. 2002 */ 2003 if ((pktlen > ixa->ixa_fragsize) && 2004 (ixaflags & IXAF_DONTFRAG)) { 2005 /* Generate ICMP and return error */ 2006 ip_recv_attr_t iras; 2007 2008 DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen, 2009 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2010 uint_t, ixa->ixa_pmtu); 2011 2012 bzero(&iras, sizeof (iras)); 2013 /* Map ixa to ira including IPsec policies */ 2014 ipsec_out_to_in(ixa, ill, &iras); 2015 2016 ip_drop_output("ICMP_FRAG_NEEDED", mp, ill); 2017 icmp_frag_needed(mp, ixa->ixa_fragsize, &iras); 2018 /* We moved any IPsec refs from ixa to iras */ 2019 ira_cleanup(&iras, B_FALSE); 2020 return (EMSGSIZE); 2021 } 2022 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen, 2023 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, 2024 uint_t, ixa->ixa_pmtu); 2025 2026 if (ixaflags & IXAF_IPSEC_SECURE) { 2027 /* 2028 * Pass in sufficient information so that 2029 * IPsec can determine whether to fragment, and 2030 * which function to call after fragmentation. 2031 */ 2032 return (ipsec_out_process(mp, ixa)); 2033 } 2034 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags, 2035 ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, 2036 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, 2037 ixa->ixa_postfragfn, &ixa->ixa_cookie)); 2038 } 2039 2040 if (ixaflags & IXAF_SET_ULP_CKSUM) { 2041 /* Compute ULP checksum and IP header checksum */ 2042 /* An IS_UNDER_IPMP ill is ok here */ 2043 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) { 2044 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2045 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2046 freemsg(mp); 2047 return (EINVAL); 2048 } 2049 } else { 2050 /* Calculate IPv4 header checksum */ 2051 ipha->ipha_hdr_checksum = 0; 2052 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 2053 } 2054 2055 sendit: 2056 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, 2057 ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, 2058 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); 2059 } 2060 2061 /* 2062 * Send mp into ip_input 2063 * Common for IPv4 and IPv6 2064 */ 2065 void 2066 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2067 uint_t pkt_len, zoneid_t nolzid) 2068 { 2069 rtc_t rtc; 2070 ill_t *ill = nce->nce_ill; 2071 ip_recv_attr_t iras; /* NOTE: No bzero for performance */ 2072 ncec_t *ncec; 2073 2074 ncec = nce->nce_common; 2075 iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM | 2076 IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; 2077 if (ncec->ncec_flags & NCE_F_BCAST) 2078 iras.ira_flags |= IRAF_L2DST_BROADCAST; 2079 else if (ncec->ncec_flags & NCE_F_MCAST) 2080 iras.ira_flags |= IRAF_L2DST_MULTICAST; 2081 2082 iras.ira_free_flags = 0; 2083 iras.ira_cred = NULL; 2084 iras.ira_cpid = NOPID; 2085 iras.ira_tsl = NULL; 2086 iras.ira_zoneid = ALL_ZONES; 2087 iras.ira_pktlen = pkt_len; 2088 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen); 2089 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 2090 2091 if (ixaflags & IXAF_IS_IPV4) 2092 iras.ira_flags |= IRAF_IS_IPV4; 2093 2094 iras.ira_ill = iras.ira_rill = ill; 2095 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2096 iras.ira_rifindex = iras.ira_ruifindex; 2097 iras.ira_mhip = NULL; 2098 2099 iras.ira_flags |= ixaflags & IAF_MASK; 2100 iras.ira_no_loop_zoneid = nolzid; 2101 2102 /* Broadcast and multicast doesn't care about the squeue */ 2103 iras.ira_sqp = NULL; 2104 2105 rtc.rtc_ire = NULL; 2106 if (ixaflags & IXAF_IS_IPV4) { 2107 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2108 2109 rtc.rtc_ipaddr = INADDR_ANY; 2110 2111 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 2112 if (rtc.rtc_ire != NULL) { 2113 ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 2114 ire_refrele(rtc.rtc_ire); 2115 } 2116 } else { 2117 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2118 2119 rtc.rtc_ip6addr = ipv6_all_zeros; 2120 2121 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); 2122 if (rtc.rtc_ire != NULL) { 2123 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); 2124 ire_refrele(rtc.rtc_ire); 2125 } 2126 } 2127 /* Any references to clean up? No hold on ira */ 2128 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 2129 ira_cleanup(&iras, B_FALSE); 2130 } 2131 2132 /* 2133 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which 2134 * looks at the IXAF_LOOPBACK_COPY flag. 2135 * Common for IPv4 and IPv6. 2136 * 2137 * If the loopback copy fails (due to no memory) but we send the packet out 2138 * on the wire we return no failure. Only in the case we supress the wire 2139 * sending do we take the loopback failure into account. 2140 * 2141 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. 2142 * Those operations are performed on this packet in ip_xmit() and it would 2143 * be odd to do it twice for the same packet. 2144 */ 2145 int 2146 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2147 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2148 uintptr_t *ixacookie) 2149 { 2150 ill_t *ill = nce->nce_ill; 2151 int error = 0; 2152 2153 /* 2154 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2155 * had looped it back 2156 */ 2157 if (ixaflags & IXAF_LOOPBACK_COPY) { 2158 mblk_t *mp1; 2159 2160 mp1 = copymsg(mp); 2161 if (mp1 == NULL) { 2162 /* Failed to deliver the loopback copy. */ 2163 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2164 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2165 error = ENOBUFS; 2166 } else { 2167 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2168 nolzid); 2169 } 2170 } 2171 2172 /* 2173 * If TTL = 0 then only do the loopback to this host i.e. we are 2174 * done. We are also done if this was the 2175 * loopback interface since it is sufficient 2176 * to loopback one copy of a multicast packet. 2177 */ 2178 if (ixaflags & IXAF_IS_IPV4) { 2179 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2180 2181 if (ipha->ipha_ttl == 0) { 2182 ip_drop_output("multicast ipha_ttl not sent to wire", 2183 mp, ill); 2184 freemsg(mp); 2185 return (error); 2186 } 2187 } else { 2188 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2189 2190 if (ip6h->ip6_hops == 0) { 2191 ip_drop_output("multicast ipha_ttl not sent to wire", 2192 mp, ill); 2193 freemsg(mp); 2194 return (error); 2195 } 2196 } 2197 if (nce->nce_ill->ill_wq == NULL) { 2198 /* Loopback interface */ 2199 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2200 freemsg(mp); 2201 return (error); 2202 } 2203 2204 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2205 ixacookie)); 2206 } 2207 2208 /* 2209 * Post fragmentation function for RTF_MULTIRT routes. 2210 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2211 * checks IXAF_LOOPBACK_COPY. 2212 * 2213 * If no packet is sent due to failures then we return an errno, but if at 2214 * least one succeeded we return zero. 2215 */ 2216 int 2217 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2218 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2219 uintptr_t *ixacookie) 2220 { 2221 irb_t *irb; 2222 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2223 ire_t *ire; 2224 ire_t *ire1; 2225 mblk_t *mp1; 2226 nce_t *nce1; 2227 ill_t *ill = nce->nce_ill; 2228 ill_t *ill1; 2229 ip_stack_t *ipst = ill->ill_ipst; 2230 int error = 0; 2231 int num_sent = 0; 2232 int err; 2233 uint_t ire_type; 2234 ipaddr_t nexthop; 2235 2236 ASSERT(ixaflags & IXAF_IS_IPV4); 2237 2238 /* Check for IXAF_LOOPBACK_COPY */ 2239 if (ixaflags & IXAF_LOOPBACK_COPY) { 2240 mblk_t *mp1; 2241 2242 mp1 = copymsg(mp); 2243 if (mp1 == NULL) { 2244 /* Failed to deliver the loopback copy. */ 2245 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2246 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2247 error = ENOBUFS; 2248 } else { 2249 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2250 nolzid); 2251 } 2252 } 2253 2254 /* 2255 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send 2256 * a copy to each one. 2257 * Use the nce (nexthop) and ipha_dst to find the ire. 2258 * 2259 * MULTIRT is not designed to work with shared-IP zones thus we don't 2260 * need to pass a zoneid or a label to the IRE lookup. 2261 */ 2262 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { 2263 /* Broadcast and multicast case */ 2264 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, 2265 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 2266 } else { 2267 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); 2268 2269 /* Unicast case */ 2270 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, 2271 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); 2272 } 2273 2274 if (ire == NULL || 2275 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2276 !(ire->ire_flags & RTF_MULTIRT)) { 2277 /* Drop */ 2278 ip_drop_output("ip_postfrag_multirt didn't find route", 2279 mp, nce->nce_ill); 2280 if (ire != NULL) 2281 ire_refrele(ire); 2282 return (ENETUNREACH); 2283 } 2284 2285 irb = ire->ire_bucket; 2286 irb_refhold(irb); 2287 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 2288 /* 2289 * For broadcast we can have a mixture of IRE_BROADCAST and 2290 * IRE_HOST due to the manually added IRE_HOSTs that are used 2291 * to trigger the creation of the special CGTP broadcast routes. 2292 * Thus we have to skip if ire_type doesn't match the original. 2293 */ 2294 if (IRE_IS_CONDEMNED(ire1) || 2295 !(ire1->ire_flags & RTF_MULTIRT) || 2296 ire1->ire_type != ire->ire_type) 2297 continue; 2298 2299 /* Do the ire argument one after the loop */ 2300 if (ire1 == ire) 2301 continue; 2302 2303 ill1 = ire_nexthop_ill(ire1); 2304 if (ill1 == NULL) { 2305 /* 2306 * This ire might not have been picked by 2307 * ire_route_recursive, in which case ire_dep might 2308 * not have been setup yet. 2309 * We kick ire_route_recursive to try to resolve 2310 * starting at ire1. 2311 */ 2312 ire_t *ire2; 2313 uint_t match_flags = MATCH_IRE_DSTONLY; 2314 2315 if (ire1->ire_ill != NULL) 2316 match_flags |= MATCH_IRE_ILL; 2317 ire2 = ire_route_recursive_impl_v4(ire1, 2318 ire1->ire_addr, ire1->ire_type, ire1->ire_ill, 2319 ire1->ire_zoneid, NULL, match_flags, 2320 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2321 if (ire2 != NULL) 2322 ire_refrele(ire2); 2323 ill1 = ire_nexthop_ill(ire1); 2324 } 2325 2326 if (ill1 == NULL) { 2327 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2328 ip_drop_output("ipIfStatsOutDiscards - no ill", 2329 mp, ill); 2330 error = ENETUNREACH; 2331 continue; 2332 } 2333 2334 /* Pick the addr and type to use for arp_nce_init */ 2335 if (nce->nce_common->ncec_flags & NCE_F_BCAST) { 2336 ire_type = IRE_BROADCAST; 2337 nexthop = ire1->ire_gateway_addr; 2338 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { 2339 ire_type = IRE_MULTICAST; 2340 nexthop = ipha->ipha_dst; 2341 } else { 2342 ire_type = ire1->ire_type; /* Doesn't matter */ 2343 nexthop = ire1->ire_gateway_addr; 2344 } 2345 2346 /* If IPMP meta or under, then we just drop */ 2347 if (ill1->ill_grp != NULL) { 2348 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2349 ip_drop_output("ipIfStatsOutDiscards - IPMP", 2350 mp, ill1); 2351 ill_refrele(ill1); 2352 error = ENETUNREACH; 2353 continue; 2354 } 2355 2356 nce1 = arp_nce_init(ill1, nexthop, ire_type); 2357 if (nce1 == NULL) { 2358 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2359 ip_drop_output("ipIfStatsOutDiscards - no nce", 2360 mp, ill1); 2361 ill_refrele(ill1); 2362 error = ENETUNREACH; 2363 continue; 2364 } 2365 mp1 = copymsg(mp); 2366 if (mp1 == NULL) { 2367 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2368 ip_drop_output("ipIfStatsOutDiscards", mp, ill1); 2369 nce_refrele(nce1); 2370 ill_refrele(ill1); 2371 error = ENOBUFS; 2372 continue; 2373 } 2374 /* Preserve HW checksum for this copy */ 2375 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 2376 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 2377 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 2378 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 2379 DB_LSOMSS(mp1) = DB_LSOMSS(mp); 2380 2381 ire1->ire_ob_pkt_count++; 2382 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, 2383 0, ixacookie); 2384 if (err == 0) 2385 num_sent++; 2386 else 2387 error = err; 2388 nce_refrele(nce1); 2389 ill_refrele(ill1); 2390 } 2391 irb_refrele(irb); 2392 ire_refrele(ire); 2393 /* Finally, the main one */ 2394 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2395 ixacookie); 2396 if (err == 0) 2397 num_sent++; 2398 else 2399 error = err; 2400 if (num_sent > 0) 2401 return (0); 2402 else 2403 return (error); 2404 } 2405 2406 /* 2407 * Verify local connectivity. This check is called by ULP fusion code. 2408 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if 2409 * the interface is brought down and back up. So we simply fail the local 2410 * process. The caller, TCP Fusion, should unfuse the connection. 2411 */ 2412 boolean_t 2413 ip_output_verify_local(ip_xmit_attr_t *ixa) 2414 { 2415 ire_t *ire = ixa->ixa_ire; 2416 2417 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) 2418 return (B_FALSE); 2419 2420 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); 2421 } 2422 2423 /* 2424 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. 2425 * 2426 * The caller must call ip_output_verify_local() first. This function handles 2427 * IPobs, FW_HOOKS, and/or IPsec cases sequentially. 2428 */ 2429 mblk_t * 2430 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out, 2431 boolean_t hooks_in, conn_t *peer_connp) 2432 { 2433 ill_t *ill = ixa->ixa_ire->ire_ill; 2434 ipha_t *ipha = NULL; 2435 ip6_t *ip6h = NULL; 2436 ip_stack_t *ipst = ixa->ixa_ipst; 2437 iaflags_t ixaflags = ixa->ixa_flags; 2438 ip_recv_attr_t iras; 2439 int error; 2440 2441 ASSERT(mp != NULL); 2442 2443 if (ixaflags & IXAF_IS_IPV4) { 2444 ipha = (ipha_t *)mp->b_rptr; 2445 2446 /* 2447 * If a callback is enabled then we need to know the 2448 * source and destination zoneids for the packet. We already 2449 * have those handy. 2450 */ 2451 if (ipst->ips_ip4_observe.he_interested) { 2452 zoneid_t szone, dzone; 2453 zoneid_t stackzoneid; 2454 2455 stackzoneid = netstackid_to_zoneid( 2456 ipst->ips_netstack->netstack_stackid); 2457 2458 if (stackzoneid == GLOBAL_ZONEID) { 2459 /* Shared-IP zone */ 2460 dzone = ixa->ixa_ire->ire_zoneid; 2461 szone = ixa->ixa_zoneid; 2462 } else { 2463 szone = dzone = stackzoneid; 2464 } 2465 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2466 ipst); 2467 } 2468 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2469 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2470 NULL, int, 1); 2471 2472 /* FW_HOOKS: LOOPBACK_OUT */ 2473 if (hooks_out) { 2474 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 2475 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 2476 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 2477 ipst->ips_ipv4firewall_loopback_out, 2478 NULL, ill, ipha, mp, mp, 0, ipst, error); 2479 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 2480 } 2481 if (mp == NULL) 2482 return (NULL); 2483 2484 /* FW_HOOKS: LOOPBACK_IN */ 2485 if (hooks_in) { 2486 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 2487 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 2488 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 2489 ipst->ips_ipv4firewall_loopback_in, 2490 ill, NULL, ipha, mp, mp, 0, ipst, error); 2491 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 2492 } 2493 if (mp == NULL) 2494 return (NULL); 2495 2496 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2497 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, 2498 NULL, int, 1); 2499 2500 /* Inbound IPsec polocies */ 2501 if (peer_connp != NULL) { 2502 /* Map ixa to ira including IPsec policies. */ 2503 ipsec_out_to_in(ixa, ill, &iras); 2504 mp = ipsec_check_inbound_policy(mp, peer_connp, ipha, 2505 NULL, &iras); 2506 } 2507 } else { 2508 ip6h = (ip6_t *)mp->b_rptr; 2509 2510 /* 2511 * If a callback is enabled then we need to know the 2512 * source and destination zoneids for the packet. We already 2513 * have those handy. 2514 */ 2515 if (ipst->ips_ip6_observe.he_interested) { 2516 zoneid_t szone, dzone; 2517 zoneid_t stackzoneid; 2518 2519 stackzoneid = netstackid_to_zoneid( 2520 ipst->ips_netstack->netstack_stackid); 2521 2522 if (stackzoneid == GLOBAL_ZONEID) { 2523 /* Shared-IP zone */ 2524 dzone = ixa->ixa_ire->ire_zoneid; 2525 szone = ixa->ixa_zoneid; 2526 } else { 2527 szone = dzone = stackzoneid; 2528 } 2529 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, 2530 ipst); 2531 } 2532 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2533 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2534 ip6h, int, 1); 2535 2536 /* FW_HOOKS: LOOPBACK_OUT */ 2537 if (hooks_out) { 2538 DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, 2539 ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); 2540 FW_HOOKS6(ipst->ips_ip6_loopback_out_event, 2541 ipst->ips_ipv6firewall_loopback_out, 2542 NULL, ill, ip6h, mp, mp, 0, ipst, error); 2543 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); 2544 } 2545 if (mp == NULL) 2546 return (NULL); 2547 2548 /* FW_HOOKS: LOOPBACK_IN */ 2549 if (hooks_in) { 2550 DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, 2551 ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); 2552 FW_HOOKS6(ipst->ips_ip6_loopback_in_event, 2553 ipst->ips_ipv6firewall_loopback_in, 2554 ill, NULL, ip6h, mp, mp, 0, ipst, error); 2555 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); 2556 } 2557 if (mp == NULL) 2558 return (NULL); 2559 2560 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 2561 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, 2562 ip6h, int, 1); 2563 2564 /* Inbound IPsec polocies */ 2565 if (peer_connp != NULL) { 2566 /* Map ixa to ira including IPsec policies. */ 2567 ipsec_out_to_in(ixa, ill, &iras); 2568 mp = ipsec_check_inbound_policy(mp, peer_connp, NULL, 2569 ip6h, &iras); 2570 } 2571 } 2572 2573 if (mp == NULL) { 2574 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2575 ip_drop_input("ipIfStatsInDiscards", NULL, ill); 2576 } 2577 2578 return (mp); 2579 } 2580