/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

/*
 * MAC Services Module - misc utilities
 */

#include <sys/types.h>
#include <sys/mac.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <sys/pattr.h>
#include <sys/pci_tools.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <sys/vtrace.h>
#include <sys/dlpi.h>
#include <sys/sunndi.h>
#include <inet/ipsec_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/tcp.h>
#include <inet/sctp_ip.h>

/*
 * The next two functions are used for dropping packets or chains of
 * packets, respectively. We could use one function for both but
 * separating the use cases allows us to specify intent and prevent
 * dropping more data than intended.
 *
 * The purpose of these functions is to aid the debugging effort,
 * especially in production. Rather than use freemsg()/freemsgchain(),
 * it's preferable to use these functions when dropping a packet in
 * the MAC layer. These functions should only be used during
 * unexpected conditions. That is, any time a packet is dropped
 * outside of the regular, successful datapath. Consolidating all
 * drops on these functions allows the user to trace one location and
 * determine why the packet was dropped based on the msg. It also
 * allows the user to inspect the packet before it is freed. Finally,
 * it allows the user to avoid tracing freemsg()/freemsgchain() thus
 * keeping the hot path running as efficiently as possible.
 *
 * NOTE: At this time not all MAC drops are aggregated on these
 * functions; but that is the plan. This comment should be erased once
 * completed.
 */

/*PRINTFLIKE2*/
void
mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
{
        va_list adx;
        char msg[128];
        char *msgp = msg;

        ASSERT3P(mp->b_next, ==, NULL);

        va_start(adx, fmt);
        (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
        va_end(adx);

        DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
        freemsg(mp);
}
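/*
 * For example, a hypothetical drop site (the message below is
 * illustrative, not taken from this file) might look like:
 *
 *	mac_drop_pkt(mp, "no softring for fanout %u", idx);
 *
 * and, since DTRACE_PROBE2(mac__drop, ...) surfaces as an SDT probe,
 * such drops can then be observed with something like:
 *
 *	dtrace -n 'sdt:::mac-drop { trace(stringof(arg1)); }'
 */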
/*PRINTFLIKE2*/
void
mac_drop_chain(mblk_t *chain, const char *fmt, ...)
{
        va_list adx;
        char msg[128];
        char *msgp = msg;

        va_start(adx, fmt);
        (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
        va_end(adx);

        /*
         * We could use freemsgchain() for the actual freeing but
         * since we are already walking the chain to fire the dtrace
         * probe we might as well free the msg here too.
         */
        for (mblk_t *mp = chain, *next; mp != NULL; ) {
                next = mp->b_next;
                DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
                mp->b_next = NULL;
                freemsg(mp);
                mp = next;
        }
}

/*
 * Copy an mblk, preserving its hardware checksum flags.
 */
static mblk_t *
mac_copymsg_cksum(mblk_t *mp)
{
        mblk_t *mp1;

        mp1 = copymsg(mp);
        if (mp1 == NULL)
                return (NULL);

        mac_hcksum_clone(mp, mp1);

        return (mp1);
}

/*
 * Copy an mblk chain, preserving the hardware checksum flags of the
 * individual mblks.
 */
mblk_t *
mac_copymsgchain_cksum(mblk_t *mp)
{
        mblk_t *nmp = NULL;
        mblk_t **nmpp = &nmp;

        for (; mp != NULL; mp = mp->b_next) {
                if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
                        freemsgchain(nmp);
                        return (NULL);
                }

                nmpp = &((*nmpp)->b_next);
        }

        return (nmp);
}

/*
 * Perform software checksum on a single message, if needed. The emulation
 * performed is determined by an intersection of the mblk's flags and the emul
 * flags requested. The emul flags are documented in mac.h.
 */
static mblk_t *
mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
        mac_ether_offload_info_t meoi = { 0 };
        const char *err = "";

        /*
         * The only current caller is mac_hw_emul(), which handles any chaining
         * of mblks prior to now.
         */
        VERIFY3P(mp->b_next, ==, NULL);

        uint32_t flags = DB_CKSUMFLAGS(mp);

        /* Why call this if checksum emulation isn't needed? */
        ASSERT3U(flags & (HCK_FLAGS), !=, 0);
        /* But also, requesting both ULP cksum types is improper */
        if ((flags & HCK_FULLCKSUM) != 0 && (flags & HCK_PARTIALCKSUM) != 0) {
                err = "full and partial ULP cksum requested";
                goto bail;
        }

        const boolean_t do_v4_cksum = (emul & MAC_IPCKSUM_EMUL) != 0 &&
            (flags & HCK_IPV4_HDRCKSUM) != 0;
        const boolean_t do_ulp_cksum = (emul & MAC_HWCKSUM_EMUL) != 0 &&
            (flags & (HCK_FULLCKSUM | HCK_PARTIALCKSUM)) != 0;
        const boolean_t ulp_prefer_partial = (flags & HCK_PARTIALCKSUM) != 0;
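        /*
         * To illustrate the intersection (example values, not from a
         * real caller): a packet flagged HCK_IPV4_HDRCKSUM |
         * HCK_PARTIALCKSUM processed with emul == MAC_HWCKSUM_EMUL has
         * only its ULP checksum computed here; the IPv4 header checksum
         * is emulated only when MAC_IPCKSUM_EMUL is also requested.
         */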
        mac_ether_offload_info(mp, &meoi);
        if ((meoi.meoi_flags & MEOI_L2INFO_SET) == 0 ||
            (meoi.meoi_l3proto != ETHERTYPE_IP &&
            meoi.meoi_l3proto != ETHERTYPE_IPV6)) {
                /* Non-IP traffic (like ARP) is left alone */
                return (mp);
        }

        /*
         * Ensure that requested checksum type(s) are supported by the
         * protocols encoded in the packet headers.
         */
        if (do_v4_cksum) {
                if (meoi.meoi_l3proto != ETHERTYPE_IP) {
                        err = "IPv4 csum requested on non-IPv4 packet";
                        goto bail;
                }
        }
        if (do_ulp_cksum) {
                if ((meoi.meoi_flags & MEOI_L4INFO_SET) == 0) {
                        err = "missing ULP header";
                        goto bail;
                }
                switch (meoi.meoi_l4proto) {
                case IPPROTO_TCP:
                case IPPROTO_UDP:
                case IPPROTO_ICMP:
                case IPPROTO_ICMPV6:
                case IPPROTO_SCTP:
                        break;
                default:
                        err = "unexpected ULP";
                        goto bail;
                }
        }

        /*
         * If the first mblk of this packet contains only the Ethernet header,
         * skip past it for now. Packets with their data contained in only a
         * single mblk can then use the fastpaths tuned to that possibility.
         */
        mblk_t *skipped_hdr = NULL;
        if (MBLKL(mp) == meoi.meoi_l2hlen) {
                meoi.meoi_len -= meoi.meoi_l2hlen;
                meoi.meoi_l2hlen = 0;
                skipped_hdr = mp;
                mp = mp->b_cont;

                ASSERT(mp != NULL);
        }

        /*
         * Ensure that all of the headers we need to access are:
         * 1. Collected in the first mblk
         * 2. Held in a data-block which is safe for us to modify
         *    (It must have a refcount of 1)
         */
        const size_t hdr_len_reqd = (meoi.meoi_l2hlen + meoi.meoi_l3hlen) +
            (do_ulp_cksum ? meoi.meoi_l4hlen : 0);
        if (MBLKL(mp) < hdr_len_reqd || DB_REF(mp) > 1) {
                mblk_t *hdrmp = msgpullup(mp, hdr_len_reqd);

                if (hdrmp == NULL) {
                        err = "could not pullup msg headers";
                        goto bail;
                }

                mac_hcksum_clone(mp, hdrmp);
                if (skipped_hdr != NULL) {
                        ASSERT3P(skipped_hdr->b_cont, ==, mp);
                        skipped_hdr->b_cont = hdrmp;
                }
                freemsg(mp);
                mp = hdrmp;
        }

        /* Calculate IPv4 header checksum, if requested */
        if (do_v4_cksum) {
                /*
                 * While unlikely, it's possible to write code that might end
                 * up calling mac_sw_cksum() twice on the same mblk (performing
                 * both LSO and checksum emulation in a single mblk chain loop
                 * -- the LSO emulation inserts a new chain into the existing
                 * chain and then the loop iterates back over the new segments
                 * and emulates the checksum a second time). Normally this
                 * wouldn't be a problem, because the HCK_*_OK flags are
                 * supposed to indicate that we don't need to perform the
                 * work. But HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have
                 * the same value; so we cannot use these flags to determine if
                 * the IP header checksum has already been calculated or not.
                 * For this reason, we zero out the checksum first. In the
                 * future, we should fix the HCK_* flags.
                 */
                ipha_t *ipha = (ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
                ipha->ipha_hdr_checksum = 0;
                ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
                flags &= ~HCK_IPV4_HDRCKSUM;
                flags |= HCK_IPV4_HDRCKSUM_OK;
        }

        /*
         * SCTP is different from all the other protocols in that it uses
         * CRC32 for its checksum, rather than ones' complement.
         */
        if (do_ulp_cksum && meoi.meoi_l4proto == IPPROTO_SCTP) {
                if (ulp_prefer_partial) {
                        err = "SCTP does not support partial checksum";
                        goto bail;
                }

                const uint_t ulp_off = meoi.meoi_l2hlen + meoi.meoi_l3hlen;
                sctp_hdr_t *sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_off);

                sctph->sh_chksum = 0;
                sctph->sh_chksum = sctp_cksum(mp, ulp_off);

                flags &= ~HCK_FULLCKSUM;
                flags |= HCK_FULLCKSUM_OK;
                goto success;
        }
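        /*
         * A worked example of the pseudo-header math below (values are
         * illustrative): for a TCP/IPv4 packet from 10.0.0.1 to 10.0.0.2
         * with a 20-byte TCP header and 100 bytes of payload, the
         * pseudo-header sum seeds cksum with the source and destination
         * address words, htons(120) for the TCP length, and
         * IP_TCP_CSUM_COMP for the protocol; IP_CSUM() then folds in the
         * ones' complement sum over the TCP header and payload.
         */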
        /* Calculate full ULP checksum, if requested */
        if (do_ulp_cksum && !ulp_prefer_partial) {
                /*
                 * Calculate address and length portions of pseudo-header csum
                 */
                uint32_t cksum = 0;
                if (meoi.meoi_l3proto == ETHERTYPE_IP) {
                        const ipha_t *ipha =
                            (const ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
                        const uint16_t *ipp =
                            (const uint16_t *)(&ipha->ipha_src);

                        cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3];

                        /*
                         * While it is tempting to calculate the payload length
                         * solely from `meoi`, as is done below for IPv6,
                         * doing so is a trap. Packets shorter than 60 bytes
                         * will get padded out to that length in order to meet
                         * the minimums for Ethernet. Instead, we pull the
                         * length from the IP header.
                         */
                        const uint16_t payload_len =
                            ntohs(ipha->ipha_length) - meoi.meoi_l3hlen;
                        cksum += htons(payload_len);
                } else if (meoi.meoi_l3proto == ETHERTYPE_IPV6) {
                        const ip6_t *ip6h =
                            (const ip6_t *)(mp->b_rptr + meoi.meoi_l2hlen);
                        const uint16_t *ipp =
                            (const uint16_t *)(&ip6h->ip6_src);

                        cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3] +
                            ipp[4] + ipp[5] + ipp[6] + ipp[7];
                        cksum += ipp[8] + ipp[9] + ipp[10] + ipp[11] +
                            ipp[12] + ipp[13] + ipp[14] + ipp[15];

                        const uint16_t payload_len = meoi.meoi_len -
                            ((uint16_t)meoi.meoi_l2hlen + meoi.meoi_l3hlen);
                        cksum += htons(payload_len);
                } else {
                        /*
                         * Since we already checked for recognized L3 protocols
                         * earlier, this should not be reachable.
                         */
                        panic("L3 protocol unexpectedly changed");
                }

                /* protocol portion of pseudo-header */
                uint_t cksum_off;
                switch (meoi.meoi_l4proto) {
                case IPPROTO_TCP:
                        cksum += IP_TCP_CSUM_COMP;
                        cksum_off = TCP_CHECKSUM_OFFSET;
                        break;
                case IPPROTO_UDP:
                        cksum += IP_UDP_CSUM_COMP;
                        cksum_off = UDP_CHECKSUM_OFFSET;
                        break;
                case IPPROTO_ICMP:
                        /* ICMP cksum does not include pseudo-header contents */
                        cksum = 0;
                        cksum_off = ICMP_CHECKSUM_OFFSET;
                        break;
                case IPPROTO_ICMPV6:
                        cksum += IP_ICMPV6_CSUM_COMP;
                        cksum_off = ICMPV6_CHECKSUM_OFFSET;
                        break;
                default:
                        err = "unrecognized L4 protocol";
                        goto bail;
                }

                /*
                 * With IP_CSUM() taking into account the pseudo-header
                 * checksum, make sure the ULP checksum field is zeroed before
                 * computing the rest.
                 */
                const uint_t l4_off = meoi.meoi_l3hlen + meoi.meoi_l2hlen;
                uint16_t *up = (uint16_t *)(mp->b_rptr + l4_off + cksum_off);
                *up = 0;
                cksum = IP_CSUM(mp, l4_off, cksum);

                if (meoi.meoi_l4proto == IPPROTO_UDP && cksum == 0) {
                        /*
                         * A zero checksum is not allowed on UDPv6, and on
                         * UDPv4 implies no checksum. In either case, invert to
                         * a value of all-1s.
                         */
                        *up = 0xffff;
                } else {
                        *up = cksum;
                }

                flags &= ~HCK_FULLCKSUM;
                flags |= HCK_FULLCKSUM_OK;
                goto success;
        }
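        /*
         * For the partial case below, the start/stuff offsets come from
         * the stack via DB_CKSUMSTART/DB_CKSUMSTUFF and are relative to
         * the start of the IP header. As an illustrative example, for
         * TCP over IPv4 with no IP options one would expect start = 20
         * (where the TCP header begins) and stuff = 36 (20 plus the
         * 16-byte offset of the checksum field within the TCP header).
         */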
        /* Calculate partial ULP checksum, if requested */
        if (do_ulp_cksum && ulp_prefer_partial) {
                uint32_t start, stuff, end, value;
                mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);

                ASSERT3S(end, >, start);

                /*
                 * The prior size checks against the header length data ensure
                 * that the mblk contains everything through at least the ULP
                 * header, but if the partial checksum (unexpectedly) requests
                 * its result be stored past that, we cannot continue.
                 */
                if (stuff + sizeof (uint16_t) > MBLKL(mp)) {
                        err = "partial csum request is out of bounds";
                        goto bail;
                }

                uchar_t *ipp = (uchar_t *)(mp->b_rptr + meoi.meoi_l2hlen);
                uint16_t *up = (uint16_t *)(ipp + stuff);

                const uint16_t partial = *up;
                *up = 0;
                const uint16_t cksum =
                    ~IP_CSUM_PARTIAL(mp, start + meoi.meoi_l2hlen, partial);
                *up = cksum != 0 ? cksum : ~cksum;

                flags &= ~HCK_PARTIALCKSUM;
                flags |= HCK_FULLCKSUM_OK;
        }

success:
        /*
         * With the checksum(s) calculated, store the updated flags to reflect
         * the current status, and zero out any of the partial-checksum fields
         * which would be irrelevant now.
         */
        mac_hcksum_set(mp, 0, 0, 0, 0, flags);

        /* Don't forget to reattach the header. */
        if (skipped_hdr != NULL) {
                ASSERT3P(skipped_hdr->b_cont, ==, mp);

                /*
                 * Duplicate the HCKSUM data into the header mblk.
                 *
                 * This mimics mac_add_vlan_tag() which ensures that both the
                 * first mblk _and_ the first data bearing mblk possess the
                 * HCKSUM information. Consumers like IP will end up discarding
                 * the ether_header mblk, so for now, it is important that the
                 * data be available in both places.
                 */
                mac_hcksum_clone(mp, skipped_hdr);
                mp = skipped_hdr;
        }
        return (mp);

bail:
        if (skipped_hdr != NULL) {
                ASSERT3P(skipped_hdr->b_cont, ==, mp);
                mp = skipped_hdr;
        }

        mac_drop_pkt(mp, err);
        return (NULL);
}

/*
 * Build a single data segment from an LSO packet. The mblk chain
 * returned, seg_head, represents the data segment and is always
 * exactly seg_len bytes long. The lso_mp and offset input/output
 * parameters track our position in the LSO packet. This function
 * exists solely as a helper to mac_sw_lso().
 *
 * Case A
 *
 *	The current lso_mp is larger than the requested seg_len. The
 *	beginning of seg_head may start at the beginning of lso_mp or
 *	offset into it. In either case, a single mblk is returned, and
 *	*offset is updated to reflect our new position in the current
 *	lso_mp.
 *
 *	    +----------------------------+
 *	    |  in *lso_mp / out *lso_mp  |
 *	    +----------------------------+
 *	    ^                        ^
 *	    |                        |
 *	    |                        |
 *	    |                        |
 *	    +------------------------+
 *	    |        seg_head        |
 *	    +------------------------+
 *	    ^                        ^
 *	    |                        |
 *	    in *offset = 0           out *offset = seg_len
 *
 *	    |------ seg_len ----|
 *
 *
 *	    +------------------------------+
 *	    |   in *lso_mp / out *lso_mp   |
 *	    +------------------------------+
 *	        ^                        ^
 *	        |                        |
 *	        |                        |
 *	        |                        |
 *	        +------------------------+
 *	        |        seg_head        |
 *	        +------------------------+
 *	        ^                        ^
 *	        |                        |
 *	        in *offset = N           out *offset = N + seg_len
 *
 *	        |------ seg_len ----|
 *
 *
 *
 * Case B
 *
 *	The requested seg_len consumes exactly the rest of the lso_mp.
 *	I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
 *	The seg_head may start at the beginning of the lso_mp or at some
 *	offset into it. In either case we return a single mblk, reset
 *	*offset to zero, and walk to the next lso_mp.
 *
 *	    +------------------------+           +------------------------+
 *	    |       in *lso_mp       |---------->|       out *lso_mp      |
 *	    +------------------------+           +------------------------+
 *	    ^                        ^           ^
 *	    |                        |           |
 *	    |                        |           out *offset = 0
 *	    |                        |
 *	    +------------------------+
 *	    |        seg_head        |
 *	    +------------------------+
 *	    ^
 *	    |
 *	    in *offset = 0
 *
 *	    |------ seg_len ----|
 *
 *
 *
 *	    +----------------------------+       +------------------------+
 *	    |         in *lso_mp         |------>|       out *lso_mp      |
 *	    +----------------------------+       +------------------------+
 *	        ^                        ^       ^
 *	        |                        |       |
 *	        |                        |       out *offset = 0
 *	        |                        |
 *	        +------------------------+
 *	        |        seg_head        |
 *	        +------------------------+
 *	        ^
 *	        |
 *	        in *offset = N
 *
 *	        |------ seg_len ----|
 *
 *
 * Case C
 *
 *	The requested seg_len is greater than the current lso_mp. In
 *	this case we must consume LSO mblks until we have enough data to
 *	satisfy either case (A) or (B) above. We will return multiple
 *	mblks linked via b_cont, offset will be set based on the cases
 *	above, and lso_mp will walk forward at least one mblk, but maybe
 *	more.
 *
 *	N.B. This diagram is not exhaustive. The seg_head may start on
 *	the beginning of an lso_mp. The seg_tail may end exactly on the
 *	boundary of an lso_mp. And there may be two (in this case the
 *	middle block wouldn't exist), three, or more mblks in the
 *	seg_head chain. This is meant as one example of what might
 *	happen. The main thing to remember is that the seg_tail mblk
 *	must be one of case (A) or (B) above.
 *
 *	    +------------------+    +----------------+    +------------------+
 *	    |    in *lso_mp    |--->|    *lso_mp     |--->|   out *lso_mp    |
 *	    +------------------+    +----------------+    +------------------+
 *	        ^            ^      ^              ^      ^            ^
 *	        |            |      |              |      |            |
 *	        |            |      |              |      |            |
 *	        |            |      |              |      |            |
 *	        +------------+      +----------------+    +------------+
 *	        |  seg_head  |----->|                |--->|  seg_tail  |
 *	        +------------+      +----------------+    +------------+
 *	        ^                                                      ^
 *	        |                                                      |
 *	        in *offset = N                 out *offset = MBLKL(seg_tail)
 *
 *	        |------------------- seg_len -------------------|
 *
 */
static mblk_t *
build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
{
        mblk_t *seg_head, *seg_tail, *seg_mp;

        ASSERT3P(*lso_mp, !=, NULL);
        ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);

        seg_mp = dupb(*lso_mp);
        if (seg_mp == NULL)
                return (NULL);

        seg_head = seg_mp;
        seg_tail = seg_mp;

        /* Continue where we left off from in the lso_mp. */
        seg_mp->b_rptr += *offset;

last_mblk:
        /* Case (A) */
        if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
                *offset += seg_len;
                seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
                return (seg_head);
        }

        /* Case (B) */
        if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
                *offset = 0;
                *lso_mp = (*lso_mp)->b_cont;
                return (seg_head);
        }

        /* Case (C) */
        ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);

        /*
         * The current LSO mblk doesn't have enough data to satisfy
         * seg_len -- continue peeling off LSO mblks to build the new
         * segment message. If allocation fails we free the previously
         * allocated segment mblks and return NULL.
         */
        while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
                ASSERT3U(MBLKL(seg_mp), <=, seg_len);
                seg_len -= MBLKL(seg_mp);
                *offset = 0;
                *lso_mp = (*lso_mp)->b_cont;
                seg_mp = dupb(*lso_mp);

                if (seg_mp == NULL) {
                        freemsgchain(seg_head);
                        return (NULL);
                }

                seg_tail->b_cont = seg_mp;
                seg_tail = seg_mp;
        }

        /*
         * We've walked enough LSO mblks that we can now satisfy the
         * remaining seg_len. At this point we need to jump back to
         * determine if we have arrived at case (A) or (B).
         */

        /* Just to be paranoid that we didn't underflow. */
        ASSERT3U(seg_len, <, IP_MAXPACKET);
        ASSERT3U(seg_len, >, 0);
        goto last_mblk;
}
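/*
 * A concrete (illustrative) walk of the cases: with an LSO packet
 * whose data sits in 1448-byte mblks, a request for seg_len = 1448 at
 * *offset = 0 is case (B), a request for seg_len = 1000 at *offset = 0
 * is case (A) leaving *offset = 1000, and a request for seg_len = 1448
 * at *offset = 1000 is case (C), duping the remaining 448 bytes and
 * then 1000 bytes from the next mblk.
 */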
/*
 * Perform software segmentation of a single LSO message. Take an LSO
 * message as input and return head/tail pointers as output. This
 * function should not be invoked directly but instead through
 * mac_hw_emul().
 *
 * The resulting chain is comprised of multiple (nsegs) MSS sized
 * segments. Each segment will consist of two or more mblks joined by
 * b_cont: a header and one or more data mblks. The header mblk is
 * allocated anew for each message. The first segment's header is used
 * as a template for the rest with adjustments made for things such as
 * ID, sequence, length, TCP flags, etc. The data mblks reference into
 * the existing LSO mblk (passed in as omp) by way of dupb(). Their
 * b_rptr/b_wptr values are adjusted to reference only the fraction of
 * the LSO message they are responsible for. At the successful
 * completion of this function the original mblk (omp) is freed,
 * leaving the newly created segment chain as the only remaining
 * reference to the data.
 */
static void
mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
    uint_t *count)
{
        uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
        uint32_t mss;
        uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen;
        uint32_t odatalen, oleft;
        uint_t nsegs, seg;
        int len;

        const void *oiph;
        const tcph_t *otcph;
        ipha_t *niph;
        tcph_t *ntcph;
        uint16_t ip_id;
        uint32_t tcp_seq, tcp_sum, otcp_sum;

        boolean_t is_v6 = B_FALSE;
        ip6_t *niph6;

        uint32_t offset = 0;
        mblk_t *odatamp;
        mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
        mblk_t *tmptail;

        mac_ether_offload_info_t meoi = { 0 };

        ASSERT3P(head, !=, NULL);
        ASSERT3P(tail, !=, NULL);
        ASSERT3P(count, !=, NULL);
        ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);

        /* Assume we are dealing with a single LSO message. */
        ASSERT3P(omp->b_next, ==, NULL);

        mac_ether_offload_info(omp, &meoi);
        opktlen = meoi.meoi_len;
        oehlen = meoi.meoi_l2hlen;
        oiphlen = meoi.meoi_l3hlen;
        otcphlen = meoi.meoi_l4hlen;
        ohdrslen = oehlen + oiphlen + otcphlen;

        /* Performing LSO requires that we successfully read fully up to L4 */
        if ((MEOI_L4INFO_SET & meoi.meoi_flags) == 0) {
                mac_drop_pkt(omp, "unable to fully parse packet to L4");
                goto fail;
        }

        if (meoi.meoi_l3proto != ETHERTYPE_IP &&
            meoi.meoi_l3proto != ETHERTYPE_IPV6) {
                mac_drop_pkt(omp, "LSO'd packet has non-IP L3 header: %x",
                    meoi.meoi_l3proto);
                goto fail;
        }

        if (meoi.meoi_l4proto != IPPROTO_TCP) {
                mac_drop_pkt(omp, "LSO unsupported protocol: %x",
                    meoi.meoi_l4proto);
                goto fail;
        }

        is_v6 = meoi.meoi_l3proto == ETHERTYPE_IPV6;

        mss = DB_LSOMSS(omp);
        if (mss == 0) {
                mac_drop_pkt(omp, "packet misconfigured for LSO (MSS == 0)");
                goto fail;
        }
        ASSERT3U(opktlen, <=, IP_MAXPACKET + oehlen);
        /*
         * Ensure the headers are contiguous. The IP header is used only for
         * the benefit of DTrace SDTs, whereas the TCP header is actively read.
         * This small pullup should only practically happen when
         * mac_add_vlan_tag is in play, which prepends a new mblk in front
         * containing the amended Ethernet header.
         */
        if (MBLKL(omp) < ohdrslen) {
                mblk_t *tmp = msgpullup(omp, ohdrslen);

                if (tmp == NULL) {
                        mac_drop_pkt(omp, "failed to pull up");
                        goto fail;
                }

                mac_hcksum_clone(omp, tmp);
                freemsg(omp);
                omp = tmp;
        }

        oiph = (void *)(omp->b_rptr + oehlen);
        otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);

        if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
                mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
                goto fail;
        }

        len = MBLKL(omp);
        /*
         * Either we have data in the first mblk or it's just the
         * header. In either case, we need to set rptr to the start of
         * the TCP data.
         */
        if (len > ohdrslen) {
                odatamp = omp;
                offset = ohdrslen;
        } else {
                ASSERT3U(len, ==, ohdrslen);
                odatamp = omp->b_cont;
                offset = 0;
        }

        /* Make sure we still have enough data. */
        odatalen = opktlen - ohdrslen;
        ASSERT3U(msgsize(odatamp), >=, odatalen);

        /*
         * If a MAC negotiated LSO then it must negotiate both
         * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
         * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
         * change during LSO segmentation (only the 3 fields of the
         * pseudo header checksum don't change: src, dst, proto). Thus
         * we would expect these flags (HCK_IPV4_HDRCKSUM |
         * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
         * function to emulate those checksums in software. However,
         * that assumes a world where we only expose LSO if the
         * underlying hardware exposes LSO. Moving forward the plan is
         * to assume LSO in the upper layers and have MAC perform
         * software LSO when the underlying provider doesn't support
         * it. In such a world, if the provider doesn't support LSO
         * but does support hardware checksum offload, then we could
         * simply perform the segmentation and allow the hardware to
         * calculate the checksums. To the hardware it's just another
         * chain of non-LSO packets.
         */
        ASSERT3S(DB_TYPE(omp), ==, M_DATA);
        ocsum_flags = DB_CKSUMFLAGS(omp);
        ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);

        /*
         * If hardware only provides partial checksum then software
         * must supply the pseudo-header checksum. In the case of LSO
         * we leave the TCP length at zero to be filled in by
         * hardware. This function must handle two scenarios.
         *
         * 1. Being called by a MAC client on the Rx path to segment
         *    an LSO packet and calculate the checksum.
         *
         * 2. Being called by a MAC provider to segment an LSO packet.
         *    In this case the LSO segmentation is performed in
         *    software (by this routine) but the MAC provider should
         *    still calculate the TCP/IP checksums in hardware.
         *
         *    To elaborate on the second case: we cannot have the
         *    scenario where IP sends LSO packets but the underlying HW
         *    doesn't support checksum offload -- because in that case
         *    TCP/IP would calculate the checksum in software (for the
         *    LSO packet) but then MAC would segment the packet and have
         *    to redo all the checksum work. So IP should never do LSO
         *    if HW doesn't support both IP and TCP checksum.
         */
        if (ocsum_flags & HCK_PARTIALCKSUM) {
                ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
                ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
        }

        /*
         * Subtract one to account for the case where the data length
         * is evenly divisible by the MSS. Add one to account for the
         * fact that the division will always result in one less
         * segment than needed.
         */
        nsegs = ((odatalen - 1) / mss) + 1;
        if (nsegs < 2) {
                mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
                goto fail;
        }
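        /*
         * For example (illustrative numbers): odatalen = 4096 with
         * mss = 1460 gives ((4096 - 1) / 1460) + 1 = 3 segments (1460,
         * 1460, and 1176 bytes), while an exact multiple like
         * odatalen = 2920 gives ((2920 - 1) / 1460) + 1 = 2 rather than
         * 3, which is what the subtract-one guards against.
         */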
        DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
            __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss,
            uint_t, nsegs);

        seg_chain = NULL;
        tmptail = seg_chain;
        oleft = odatalen;

        for (uint_t i = 0; i < nsegs; i++) {
                boolean_t last_seg = ((i + 1) == nsegs);
                uint32_t seg_len;

                /*
                 * If we fail to allocate, then drop the partially
                 * allocated chain as well as the LSO packet. Let the
                 * sender deal with the fallout.
                 */
                if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
                        freemsgchain(seg_chain);
                        mac_drop_pkt(omp, "failed to alloc segment header");
                        goto fail;
                }
                ASSERT3P(nhdrmp->b_cont, ==, NULL);

                /* Copy over the header stack. */
                bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
                nhdrmp->b_wptr += ohdrslen;

                if (seg_chain == NULL) {
                        seg_chain = nhdrmp;
                } else {
                        ASSERT3P(tmptail, !=, NULL);
                        tmptail->b_next = nhdrmp;
                }

                tmptail = nhdrmp;

                /*
                 * Calculate this segment's length. It's either the MSS
                 * or whatever remains for the last segment.
                 */
                seg_len = last_seg ? oleft : mss;
                ASSERT3U(seg_len, <=, mss);
                ndatamp = build_data_seg(&odatamp, &offset, seg_len);

                if (ndatamp == NULL) {
                        freemsgchain(seg_chain);
                        mac_drop_pkt(omp, "LSO failed to segment data");
                        goto fail;
                }

                /* Attach data mblk to header mblk. */
                nhdrmp->b_cont = ndatamp;
                DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
                ASSERT3U(seg_len, <=, oleft);
                oleft -= seg_len;

                /* Set up partial checksum offsets. */
                if (ocsum_flags & HCK_PARTIALCKSUM) {
                        DB_CKSUMSTART(nhdrmp) = ocsum_start;
                        DB_CKSUMEND(nhdrmp) = oiphlen + otcphlen + seg_len;
                        DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
                }
        }

        /* We should have consumed the entire LSO msg. */
        ASSERT3S(oleft, ==, 0);
        ASSERT3P(odatamp, ==, NULL);

        /*
         * All seg data mblks are referenced by the header mblks, null
         * out this pointer to catch any bad derefs.
         */
        ndatamp = NULL;

        /*
         * Set headers and checksum for first segment.
         */
        nhdrmp = seg_chain;
        ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);

        if (is_v6) {
                niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
                niph6->ip6_plen = htons(
                    (oiphlen - IPV6_HDR_LEN) + otcphlen + mss);
        } else {
                niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
                niph->ipha_length = htons(oiphlen + otcphlen + mss);
                /*
                 * If the v4 checksum was filled, we won't have a v4 offload
                 * flag. We can't write zero checksums without inserting said
                 * flag, but our output frames won't necessarily be rechecked
                 * by the caller! As a compromise, we need to force emulation
                 * to uphold the same contracts the packet already agreed to.
                 */
                if (niph->ipha_hdr_checksum != 0) {
                        emul |= MAC_IPCKSUM_EMUL;
                        ocsum_flags |= HCK_IPV4_HDRCKSUM;
                }
                niph->ipha_hdr_checksum = 0;
                ip_id = ntohs(niph->ipha_ident);
        }

        ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
        tcp_seq = BE32_TO_U32(ntcph->th_seq);
        tcp_seq += mss;

        /*
         * The first segment shouldn't:
         *
         *	o indicate end of data transmission (FIN),
         *	o indicate immediate handling of the data (PUSH).
         */
        ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
        DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);

        /*
         * If the underlying HW provides partial checksum, then make
         * sure to correct the pseudo header checksum before calling
         * mac_sw_cksum(). The native TCP stack doesn't include the
         * length field in the pseudo header when LSO is in play -- so
         * we need to calculate it here.
         */
        if (ocsum_flags & HCK_PARTIALCKSUM) {
                tcp_sum = BE16_TO_U16(ntcph->th_sum);
                otcp_sum = tcp_sum;
                tcp_sum += mss + otcphlen;
                tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
                U16_TO_BE16(tcp_sum, ntcph->th_sum);
        }
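        /*
         * To illustrate the fold above (example values): a stored
         * pseudo-header sum of 0xfff0 plus mss + otcphlen of 0x05f0
         * yields 0x105e0, which folds to 0x05e0 + 1 = 0x05e1. A single
         * fold suffices here because both addends fit in 16 bits.
         */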
        if ((ocsum_flags & HCK_TX_FLAGS) && (emul & MAC_HWCKSUM_EMULS)) {
                next_nhdrmp = nhdrmp->b_next;
                nhdrmp->b_next = NULL;
                nhdrmp = mac_sw_cksum(nhdrmp, emul);
                /*
                 * The mblk could be replaced (via pull-up) or freed (due to
                 * failure) during mac_sw_cksum(), so we must take care with
                 * the result here.
                 */
                if (nhdrmp != NULL) {
                        nhdrmp->b_next = next_nhdrmp;
                        next_nhdrmp = NULL;
                        seg_chain = nhdrmp;
                } else {
                        freemsgchain(next_nhdrmp);
                        /*
                         * nhdrmp referenced the head of seg_chain when it was
                         * freed, so further clean-up there is unnecessary.
                         */
                        seg_chain = NULL;
                        mac_drop_pkt(omp, "LSO cksum emulation failed");
                        goto fail;
                }
        }

        ASSERT3P(nhdrmp, !=, NULL);

        seg = 1;
        DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
            (is_v6 ? (void *)niph6 : (void *)niph),
            __dtrace_tcp_tcph_t *, ntcph, uint_t, mss, int_t, seg);
        seg++;

        /* There better be at least 2 segs. */
        ASSERT3P(nhdrmp->b_next, !=, NULL);
        prev_nhdrmp = nhdrmp;
        nhdrmp = nhdrmp->b_next;

        /*
         * Now adjust the headers of the middle segments. For each
         * header we need to adjust the following.
         *
         *	o IP ID
         *	o IP length
         *	o TCP sequence
         *	o TCP flags
         *	o cksum flags
         *	o cksum values (if MAC_HWCKSUM_EMUL is set)
         */
        for (; seg < nsegs; seg++) {
                /*
                 * We use seg_chain as a reference to the first seg
                 * header mblk -- this first header is a template for
                 * the rest of the segments. This copy will include
                 * the now updated checksum values from the first
                 * header. We must reset these checksum values to
                 * their original to make sure we produce the correct
                 * value.
                 */
                ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
                if (is_v6) {
                        niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
                        niph6->ip6_plen = htons(
                            (oiphlen - IPV6_HDR_LEN) + otcphlen + mss);
                } else {
                        niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
                        niph->ipha_ident = htons(++ip_id);
                        niph->ipha_length = htons(oiphlen + otcphlen + mss);
                        niph->ipha_hdr_checksum = 0;
                }
                ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
                U32_TO_BE32(tcp_seq, ntcph->th_seq);
                tcp_seq += mss;
                /*
                 * Just like the first segment, the middle segments
                 * shouldn't have these flags set.
                 */
                ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
                DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);

                /*
                 * First and middle segs have the same
                 * pseudo-header checksum.
                 */
                if (ocsum_flags & HCK_PARTIALCKSUM)
                        U16_TO_BE16(tcp_sum, ntcph->th_sum);

                if ((ocsum_flags & HCK_TX_FLAGS) &&
                    (emul & MAC_HWCKSUM_EMULS)) {
                        next_nhdrmp = nhdrmp->b_next;
                        nhdrmp->b_next = NULL;
                        nhdrmp = mac_sw_cksum(nhdrmp, emul);
                        /*
                         * Like above, handle cases where mac_sw_cksum() does a
                         * pull-up or drop of the mblk.
                         */
                        if (nhdrmp != NULL) {
                                nhdrmp->b_next = next_nhdrmp;
                                next_nhdrmp = NULL;
                                prev_nhdrmp->b_next = nhdrmp;
                        } else {
                                freemsgchain(next_nhdrmp);
                                /*
                                 * Critical to de-link the now-freed nhdrmp
                                 * before freeing the rest of the preceding
                                 * chain.
                                 */
                                prev_nhdrmp->b_next = NULL;
                                freemsgchain(seg_chain);
                                seg_chain = NULL;
                                mac_drop_pkt(omp,
                                    "LSO cksum emulation failed");
                                goto fail;
                        }
                }

                DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
                    (is_v6 ? (void *)niph6 : (void *)niph),
                    __dtrace_tcp_tcph_t *, ntcph, uint_t, mss, uint_t, seg);

                ASSERT3P(nhdrmp->b_next, !=, NULL);
                prev_nhdrmp = nhdrmp;
                nhdrmp = nhdrmp->b_next;
        }

        /* Make sure we are on the last segment. */
        ASSERT3U(seg, ==, nsegs);
        ASSERT3P(nhdrmp->b_next, ==, NULL);

        /*
         * Now we set the last segment header. The difference is
         * that the FIN/PSH/RST flags are allowed.
         */
        len = msgsize(nhdrmp->b_cont);
        ASSERT3S(len, >, 0);
        if (is_v6) {
                niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
                niph6->ip6_plen = htons(
                    (oiphlen - IPV6_HDR_LEN) + otcphlen + len);
        } else {
                niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
                niph->ipha_ident = htons(++ip_id);
                niph->ipha_length = htons(oiphlen + otcphlen + len);
                niph->ipha_hdr_checksum = 0;
        }
        ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
        U32_TO_BE32(tcp_seq, ntcph->th_seq);

        DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
        if (ocsum_flags & HCK_PARTIALCKSUM) {
                tcp_sum = otcp_sum;
                tcp_sum += len + otcphlen;
                tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
                U16_TO_BE16(tcp_sum, ntcph->th_sum);
        }

        if ((ocsum_flags & HCK_TX_FLAGS) && (emul & MAC_HWCKSUM_EMULS)) {
                /* This should be the last mblk. */
                ASSERT3P(nhdrmp->b_next, ==, NULL);
                nhdrmp = mac_sw_cksum(nhdrmp, emul);
                /*
                 * If the final mblk happens to be dropped as part of
                 * mac_sw_cksum(), that is unfortunate, but it need not be a
                 * show-stopper at this point. We can just pretend that final
                 * packet was dropped in transit.
                 */
                prev_nhdrmp->b_next = nhdrmp;
        }

        DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
            (is_v6 ? (void *)niph6 : (void *)niph),
            __dtrace_tcp_tcph_t *, ntcph, uint_t, len, uint_t, seg);

        /*
         * Free the reference to the original LSO message as it is
         * being replaced by seg_chain.
         */
        freemsg(omp);
        *head = seg_chain;
        *tail = nhdrmp;
        *count = nsegs;
        return;

fail:
        *head = NULL;
        *tail = NULL;
        *count = 0;
}

#define	HCK_NEEDED	(HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)

/*
 * Emulate various hardware offload features in software. Take a chain
 * of packets as input and emulate the hardware features specified in
 * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
 * pointer given as input, and its tail pointer is written to
 * '*otail'. The number of packets in the new chain is written to
 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
 * may be NULL. The 'mp_chain' argument may point to a NULL chain, in
 * which case 'mp_chain' will simply stay a NULL chain.
 *
 * While unlikely, it is technically possible that this function could
 * receive a non-NULL chain as input and return a NULL chain as output
 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
 * zero). This could happen if all the packets in the chain are
 * dropped or if we fail to allocate new mblks. In this case, there is
 * nothing for the caller to free. In any event, the caller shouldn't
 * assume that '*mp_chain' is non-NULL on return.
 *
 * This function was written with three main use cases in mind.
 *
 * 1. To emulate hardware offloads when traveling mac-loopback (two
 *    clients on the same mac). This is wired up in mac_tx_send().
 *
 * 2. To provide hardware offloads to the client when the underlying
 *    provider cannot. This is currently wired up in mac_tx() but we
 *    still only negotiate offloads when the underlying provider
 *    supports them.
 *
 * 3. To emulate real hardware in simnet.
 */
The number of packets in the new chain is written to 1205 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus 1206 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in 1207 * which case 'mp_chain' will simply stay a NULL chain. 1208 * 1209 * While unlikely, it is technically possible that this function could 1210 * receive a non-NULL chain as input and return a NULL chain as output 1211 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be 1212 * zero). This could happen if all the packets in the chain are 1213 * dropped or if we fail to allocate new mblks. In this case, there is 1214 * nothing for the caller to free. In any event, the caller shouldn't 1215 * assume that '*mp_chain' is non-NULL on return. 1216 * 1217 * This function was written with three main use cases in mind. 1218 * 1219 * 1. To emulate hardware offloads when traveling mac-loopback (two 1220 * clients on the same mac). This is wired up in mac_tx_send(). 1221 * 1222 * 2. To provide hardware offloads to the client when the underlying 1223 * provider cannot. This is currently wired up in mac_tx() but we 1224 * still only negotiate offloads when the underlying provider 1225 * supports them. 1226 * 1227 * 3. To emulate real hardware in simnet. 1228 */ 1229 void 1230 mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) 1231 { 1232 mblk_t *head = NULL, *tail = NULL; 1233 uint_t count = 0; 1234 1235 ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); 1236 ASSERT3P(mp_chain, !=, NULL); 1237 1238 for (mblk_t *mp = *mp_chain; mp != NULL; ) { 1239 mblk_t *tmp, *next, *tmphead, *tmptail; 1240 struct ether_header *ehp; 1241 uint32_t flags; 1242 uint_t len = MBLKL(mp), l2len; 1243 1244 /* Perform LSO/cksum one message at a time. */ 1245 next = mp->b_next; 1246 mp->b_next = NULL; 1247 1248 /* 1249 * For our sanity the first mblk should contain at 1250 * least the full L2 header. 1251 */ 1252 if (len < sizeof (struct ether_header)) { 1253 mac_drop_pkt(mp, "packet too short (A): %u", len); 1254 mp = next; 1255 continue; 1256 } 1257 1258 ehp = (struct ether_header *)mp->b_rptr; 1259 if (ntohs(ehp->ether_type) == VLAN_TPID) 1260 l2len = sizeof (struct ether_vlan_header); 1261 else 1262 l2len = sizeof (struct ether_header); 1263 1264 /* 1265 * If the first mblk is solely the L2 header, then 1266 * there better be more data. 1267 */ 1268 if (len < l2len || (len == l2len && mp->b_cont == NULL)) { 1269 mac_drop_pkt(mp, "packet too short (C): %u", len); 1270 mp = next; 1271 continue; 1272 } 1273 1274 DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); 1275 1276 /* 1277 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) 1278 * because we don't want to mask-out the LSO flag. 1279 */ 1280 flags = DB_CKSUMFLAGS(mp); 1281 1282 if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { 1283 uint_t tmpcount = 0; 1284 1285 /* 1286 * LSO fix-up handles checksum emulation 1287 * inline (if requested). It also frees mp. 1288 */ 1289 mac_sw_lso(mp, emul, &tmphead, &tmptail, 1290 &tmpcount); 1291 if (tmphead == NULL) { 1292 /* mac_sw_lso() freed the mp. */ 1293 mp = next; 1294 continue; 1295 } 1296 count += tmpcount; 1297 } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { 1298 tmp = mac_sw_cksum(mp, emul); 1299 if (tmp == NULL) { 1300 /* mac_sw_cksum() freed the mp. */ 1301 mp = next; 1302 continue; 1303 } 1304 tmphead = tmp; 1305 tmptail = tmp; 1306 count++; 1307 } else { 1308 /* There is nothing to emulate. 
/*
 * Add a VLAN tag to the specified mblk.
 */
mblk_t *
mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
{
        mblk_t *hmp;
        struct ether_vlan_header *evhp;
        struct ether_header *ehp;

        ASSERT(pri != 0 || vid != 0);

        /*
         * Allocate an mblk for the new tagged ethernet header,
         * and copy the MAC addresses and ethertype from the
         * original header.
         */

        hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
        if (hmp == NULL) {
                freemsg(mp);
                return (NULL);
        }

        evhp = (struct ether_vlan_header *)hmp->b_rptr;
        ehp = (struct ether_header *)mp->b_rptr;

        bcopy(ehp, evhp, (ETHERADDRL * 2));
        evhp->ether_type = ehp->ether_type;
        evhp->ether_tpid = htons(ETHERTYPE_VLAN);

        hmp->b_wptr += sizeof (struct ether_vlan_header);
        mp->b_rptr += sizeof (struct ether_header);

        /*
         * Free the original message if it's now empty. Link the
         * rest of the messages to the header message.
         */
        mac_hcksum_clone(mp, hmp);
        if (MBLKL(mp) == 0) {
                hmp->b_cont = mp->b_cont;
                freeb(mp);
        } else {
                hmp->b_cont = mp;
        }
        ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));

        /*
         * Initialize the new TCI (Tag Control Information).
         */
        evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));

        return (hmp);
}
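/*
 * The TCI layout is pri(3):cfi(1):vid(12), so with illustrative values
 * pri = 3 and vid = 5, VLAN_TCI(3, 0, 5) =
 * (3 << 13) | (0 << 12) | 5 = 0x6005.
 */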
/*
 * Adds a VLAN tag with the specified VID and priority to each mblk of
 * the specified chain.
 */
mblk_t *
mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
{
        mblk_t *next_mp, **prev, *mp;

        mp = mp_chain;
        prev = &mp_chain;

        while (mp != NULL) {
                next_mp = mp->b_next;
                mp->b_next = NULL;
                if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
                        freemsgchain(next_mp);
                        break;
                }
                *prev = mp;
                prev = &mp->b_next;
                mp = mp->b_next = next_mp;
        }

        return (mp_chain);
}

/*
 * Strip the VLAN tag from an mblk, if one is present.
 */
mblk_t *
mac_strip_vlan_tag(mblk_t *mp)
{
        mblk_t *newmp;
        struct ether_vlan_header *evhp;

        evhp = (struct ether_vlan_header *)mp->b_rptr;
        if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
                ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));

                if (DB_REF(mp) > 1) {
                        newmp = copymsg(mp);
                        if (newmp == NULL)
                                return (NULL);
                        freemsg(mp);
                        mp = newmp;
                }

                evhp = (struct ether_vlan_header *)mp->b_rptr;

                ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
                mp->b_rptr += VLAN_TAGSZ;
        }
        return (mp);
}

/*
 * Strip the VLAN tag from each mblk of the chain.
 */
mblk_t *
mac_strip_vlan_tag_chain(mblk_t *mp_chain)
{
        mblk_t *mp, *next_mp, **prev;

        mp = mp_chain;
        prev = &mp_chain;

        while (mp != NULL) {
                next_mp = mp->b_next;
                mp->b_next = NULL;
                if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
                        freemsgchain(next_mp);
                        break;
                }
                *prev = mp;
                prev = &mp->b_next;
                mp = mp->b_next = next_mp;
        }

        return (mp_chain);
}

/*
 * Default callback function. Used when the datapath is not yet initialized.
 */
/* ARGSUSED */
void
mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
    boolean_t loopback)
{
        freemsgchain(mp_chain);
}

/*
 * Determines the IPv6 header length accounting for all the optional IPv6
 * headers (hop-by-hop, destination, routing and fragment). The header length
 * and next header value (a transport header) are captured.
 *
 * Returns B_FALSE if all of the IP headers are not in the same mblk;
 * otherwise returns B_TRUE.
 */
boolean_t
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
        uint16_t length;
        uint_t ehdrlen;
        uint8_t *whereptr;
        uint8_t *nexthdrp;
        ip6_dest_t *desthdr;
        ip6_rthdr_t *rthdr;
        ip6_frag_t *fraghdr;

        if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
                return (B_FALSE);
        ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
        length = IPV6_HDR_LEN;
        whereptr = ((uint8_t *)&ip6h[1]);	/* point to next hdr */

        if (fragp != NULL)
                *fragp = NULL;

        nexthdrp = &ip6h->ip6_nxt;
        while (whereptr < endptr) {
                /* Is there enough left for len + nexthdr? */
                if (whereptr + MIN_EHDR_LEN > endptr)
                        break;

                switch (*nexthdrp) {
                case IPPROTO_HOPOPTS:
                case IPPROTO_DSTOPTS:
                        /* Assumes the headers are identical for hbh and dst */
                        desthdr = (ip6_dest_t *)whereptr;
                        ehdrlen = 8 * (desthdr->ip6d_len + 1);
                        if ((uchar_t *)desthdr + ehdrlen > endptr)
                                return (B_FALSE);
                        nexthdrp = &desthdr->ip6d_nxt;
                        break;
                case IPPROTO_ROUTING:
                        rthdr = (ip6_rthdr_t *)whereptr;
                        ehdrlen = 8 * (rthdr->ip6r_len + 1);
                        if ((uchar_t *)rthdr + ehdrlen > endptr)
                                return (B_FALSE);
                        nexthdrp = &rthdr->ip6r_nxt;
                        break;
                case IPPROTO_FRAGMENT:
                        fraghdr = (ip6_frag_t *)whereptr;
                        ehdrlen = sizeof (ip6_frag_t);
                        if ((uchar_t *)&fraghdr[1] > endptr)
                                return (B_FALSE);
                        nexthdrp = &fraghdr->ip6f_nxt;
                        if (fragp != NULL)
                                *fragp = fraghdr;
                        break;
                case IPPROTO_NONE:
                        /* No next header means we're finished */
                default:
                        *hdr_length = length;
                        *next_hdr = *nexthdrp;
                        return (B_TRUE);
                }
                length += ehdrlen;
                whereptr += ehdrlen;
                *hdr_length = length;
                *next_hdr = *nexthdrp;
        }
        switch (*nexthdrp) {
        case IPPROTO_HOPOPTS:
        case IPPROTO_DSTOPTS:
        case IPPROTO_ROUTING:
        case IPPROTO_FRAGMENT:
                /*
                 * If any known extension headers are still to be processed,
                 * the packet's malformed (or at least all of the IP headers
                 * are not in the same mblk -- and that should never happen).
                 */
                return (B_FALSE);

        default:
                /*
                 * If we get here, we know that all of the IP headers were in
                 * the same mblk, even if the ULP header is in the next mblk.
                 */
                *hdr_length = length;
                *next_hdr = *nexthdrp;
                return (B_TRUE);
        }
}
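/*
 * For example (illustrative layout): a packet of
 * ip6 | hop-by-hop (8 bytes) | fragment (8 bytes) | tcp ...
 * returns B_TRUE with *hdr_length = 40 + 8 + 8 = 56, *next_hdr =
 * IPPROTO_TCP, and *fragp pointing at the fragment header, provided
 * all of those headers sit below endptr in the same mblk.
 */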
/*
 * The following set of routines are there to take care of interrupt
 * re-targeting for legacy (fixed) interrupts. Some older versions
 * of the popular NICs like e1000g do not support MSI-X interrupts
 * and they reserve fixed interrupts for RX/TX rings. To re-target
 * these interrupts, PCITOOL ioctls need to be used.
 */
typedef struct mac_dladm_intr {
        int	ino;
        int	cpu_id;
        char	driver_path[MAXPATHLEN];
        char	nexus_path[MAXPATHLEN];
} mac_dladm_intr_t;
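/*
 * The overall flow, as a sketch: mac_client_set_intr_cpu() calls
 * mac_check_interrupt_binding(), which opens the nexus located by
 * mac_get_nexus_node() and uses mac_validate_intr() (built on
 * PCITOOL_DEVICE_GET_INTR) to find the device's ino and current CPU;
 * if the interrupt is not already on the requested CPU, mac_set_intr()
 * rebinds it via PCITOOL_DEVICE_SET_INTR.
 */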
/* Bind the interrupt to cpu_num */
static int
mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
{
        pcitool_intr_set_t iset;
        int err;

        iset.old_cpu = oldcpuid;
        iset.ino = ino;
        iset.cpu_id = cpu_num;
        iset.user_version = PCITOOL_VERSION;
        err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
            kcred, NULL);

        return (err);
}

/*
 * Search interrupt information. iget is filled in with the info to search
 */
static boolean_t
mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
{
        int i;
        char driver_path[2 * MAXPATHLEN];

        for (i = 0; i < iget_p->num_devs; i++) {
                (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
                (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
                    ":%s%d", iget_p->dev[i].driver_name,
                    iget_p->dev[i].dev_inst);
                /* Match this device's path against the one we are after */
                if (strcmp(driver_path, dln->driver_path) == 0) {
                        dln->ino = iget_p->ino;
                        dln->cpu_id = iget_p->cpu_id;
                        return (B_TRUE);
                }
        }
        return (B_FALSE);
}

/*
 * Get information about ino, i.e. if this is the interrupt for our
 * device and where it is bound etc.
 */
static boolean_t
mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
    mac_dladm_intr_t *dln)
{
        pcitool_intr_get_t *iget_p;
        int ipsz;
        int nipsz;
        int err;
        uint8_t inum;

        /*
         * Check if SLEEP is OK, i.e., if we could come here in response to
         * changing the fanout due to some callback from the driver, say
         * link speed changes.
         */
        ipsz = PCITOOL_IGET_SIZE(0);
        iget_p = kmem_zalloc(ipsz, KM_SLEEP);

        iget_p->num_devs_ret = 0;
        iget_p->user_version = PCITOOL_VERSION;
        iget_p->cpu_id = oldcpuid;
        iget_p->ino = ino;

        err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
            FKIOCTL, kcred, NULL);
        if (err != 0) {
                kmem_free(iget_p, ipsz);
                return (B_FALSE);
        }
        if (iget_p->num_devs == 0) {
                kmem_free(iget_p, ipsz);
                return (B_FALSE);
        }
        inum = iget_p->num_devs;
        if (iget_p->num_devs_ret < iget_p->num_devs) {
                /* Reallocate */
                nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);

                kmem_free(iget_p, ipsz);
                ipsz = nipsz;
                iget_p = kmem_zalloc(ipsz, KM_SLEEP);

                iget_p->num_devs_ret = inum;
                iget_p->cpu_id = oldcpuid;
                iget_p->ino = ino;
                iget_p->user_version = PCITOOL_VERSION;
                err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
                    FKIOCTL, kcred, NULL);
                if (err != 0) {
                        kmem_free(iget_p, ipsz);
                        return (B_FALSE);
                }
                /* defensive */
                if (iget_p->num_devs != iget_p->num_devs_ret) {
                        kmem_free(iget_p, ipsz);
                        return (B_FALSE);
                }
        }

        if (mac_search_intrinfo(iget_p, dln)) {
                kmem_free(iget_p, ipsz);
                return (B_TRUE);
        }
        kmem_free(iget_p, ipsz);
        return (B_FALSE);
}

/*
 * Get the interrupts and check each one to see if it is for our device.
 */
static int
mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
{
        pcitool_intr_info_t intr_info;
        int err;
        int ino;
        int oldcpuid;

        err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
            FKIOCTL, kcred, NULL);
        if (err != 0)
                return (-1);

        for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
                for (ino = 0; ino < intr_info.num_intr; ino++) {
                        if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
                                if (dln->cpu_id == cpuid)
                                        return (0);
                                return (1);
                        }
                }
        }
        return (-1);
}

/*
 * Obtain the nexus parent node info. for mdip.
 */
static dev_info_t *
mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
{
        struct dev_info *tdip = (struct dev_info *)mdip;
        struct ddi_minor_data *minordata;
        dev_info_t *pdip;
        char pathname[MAXPATHLEN];

        while (tdip != NULL) {
                /*
                 * The netboot code could call this function while walking the
                 * device tree so we need to use ndi_devi_tryenter() here to
                 * avoid deadlock.
                 */
                if (ndi_devi_tryenter((dev_info_t *)tdip) == 0)
                        break;

                for (minordata = tdip->devi_minor; minordata != NULL;
                    minordata = minordata->next) {
                        if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
                            strlen(DDI_NT_INTRCTL)) == 0) {
                                pdip = minordata->dip;
                                (void) ddi_pathname(pdip, pathname);
                                (void) snprintf(dln->nexus_path, MAXPATHLEN,
                                    "/devices%s:intr", pathname);
                                (void) ddi_pathname_minor(minordata, pathname);
                                ndi_devi_exit((dev_info_t *)tdip);
                                return (pdip);
                        }
                }
                ndi_devi_exit((dev_info_t *)tdip);
                tdip = tdip->devi_parent;
        }
        return (NULL);
}
/*
 * For a primary MAC client, if the user has set a list of CPUs or
 * we have obtained it implicitly, we try to retarget the interrupt
 * for that device on one of the CPUs in the list.
 * We assign the interrupt to the same CPU as the poll thread.
 */
static boolean_t
mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
{
        ldi_handle_t lh = NULL;
        ldi_ident_t li = NULL;
        int err;
        int ret;
        mac_dladm_intr_t dln;
        dev_info_t *dip;
        struct ddi_minor_data *minordata;

        dln.nexus_path[0] = '\0';
        dln.driver_path[0] = '\0';

        minordata = ((struct dev_info *)mdip)->devi_minor;
        while (minordata != NULL) {
                if (minordata->type == DDM_MINOR)
                        break;
                minordata = minordata->next;
        }
        if (minordata == NULL)
                return (B_FALSE);

        (void) ddi_pathname_minor(minordata, dln.driver_path);

        dip = mac_get_nexus_node(mdip, &dln);
        /* defensive */
        if (dip == NULL)
                return (B_FALSE);

        err = ldi_ident_from_major(ddi_driver_major(dip), &li);
        if (err != 0)
                return (B_FALSE);

        err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
        if (err != 0)
                return (B_FALSE);

        ret = mac_validate_intr(lh, &dln, cpuid);
        if (ret < 0) {
                (void) ldi_close(lh, FREAD|FWRITE, kcred);
                return (B_FALSE);
        }
        /* cmn_note? */
        if (ret != 0)
                if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
                    != 0) {
                        (void) ldi_close(lh, FREAD|FWRITE, kcred);
                        return (B_FALSE);
                }
        (void) ldi_close(lh, FREAD|FWRITE, kcred);
        return (B_TRUE);
}

void
mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
{
        dev_info_t *mdip = (dev_info_t *)arg;
        mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
        mac_resource_props_t *mrp;
        mac_perim_handle_t mph;
        flow_entry_t *flent = mcip->mci_flent;
        mac_soft_ring_set_t *rx_srs;
        mac_cpus_t *srs_cpu;

        if (!mac_check_interrupt_binding(mdip, cpuid))
                cpuid = -1;
        mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
        mrp = MCIP_RESOURCE_PROPS(mcip);
        mrp->mrp_rx_intr_cpu = cpuid;
        if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
                rx_srs = flent->fe_rx_srs[1];
                srs_cpu = &rx_srs->srs_cpu;
                srs_cpu->mc_rx_intr_cpu = cpuid;
        }
        mac_perim_exit(mph);
}

int32_t
mac_client_intr_cpu(mac_client_handle_t mch)
{
        mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
        mac_cpus_t *srs_cpu;
        mac_soft_ring_set_t *rx_srs;
        flow_entry_t *flent = mcip->mci_flent;
        mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
        mac_ring_t *ring;
        mac_intr_t *mintr;

        /*
         * Check if we need to retarget the interrupt. We do this only
         * for the primary MAC client. We do this if we have the only
         * exclusive ring in the group.
         */
        if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
                rx_srs = flent->fe_rx_srs[1];
                srs_cpu = &rx_srs->srs_cpu;
                ring = rx_srs->srs_ring;
                mintr = &ring->mr_info.mri_intr;
                /*
                 * If ddi_handle is present or the poll CPU is
                 * already bound to the interrupt CPU, return -1.
                 */
                if (mintr->mi_ddi_handle != NULL ||
                    ((mrp->mrp_ncpus != 0) &&
                    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
                        return (-1);
                }
                return (srs_cpu->mc_rx_pollid);
        }
        return (-1);
}

void *
mac_get_devinfo(mac_handle_t mh)
{
        mac_impl_t *mip = (mac_impl_t *)mh;

        return ((void *)mip->mi_dip);
}

#define	PKT_HASH_2BYTES(x)	((x)[0] ^ (x)[1])
#define	PKT_HASH_4BYTES(x)	((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
#define	PKT_HASH_MAC(x)	((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
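/*
 * These fold a field down to one byte by XOR. For example
 * (illustrative address): PKT_HASH_MAC on 02:08:20:ac:11:05 is
 * 0x02 ^ 0x08 ^ 0x20 ^ 0xac ^ 0x11 ^ 0x05 = 0x92.
 */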
uint64_t
mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
{
        struct ether_header *ehp;
        uint64_t hash = 0;
        uint16_t sap;
        uint_t skip_len;
        uint8_t proto;
        boolean_t ip_fragmented;

        /*
         * We may want to have one of these per MAC type plugin in the
         * future. For now it supports only Ethernet.
         */
        if (media != DL_ETHER)
                return (0L);

        /* for now we support only outbound packets */
        ASSERT(is_outbound);
        ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
        ASSERT(MBLKL(mp) >= sizeof (struct ether_header));

        /* compute L2 hash */

        ehp = (struct ether_header *)mp->b_rptr;

        if ((policy & MAC_PKT_HASH_L2) != 0) {
                uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
                uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
                hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
                policy &= ~MAC_PKT_HASH_L2;
        }

        if (policy == 0)
                goto done;

        /* skip ethernet header */

        sap = ntohs(ehp->ether_type);
        if (sap == ETHERTYPE_VLAN) {
                struct ether_vlan_header *evhp;
                mblk_t *newmp = NULL;

                skip_len = sizeof (struct ether_vlan_header);
                if (MBLKL(mp) < skip_len) {
                        /* the vlan tag is the payload, pull up first */
                        newmp = msgpullup(mp, -1);
                        if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
                                goto done;
                        }
                        evhp = (struct ether_vlan_header *)newmp->b_rptr;
                } else {
                        evhp = (struct ether_vlan_header *)mp->b_rptr;
                }

                sap = ntohs(evhp->ether_type);
                freemsg(newmp);
        } else {
                skip_len = sizeof (struct ether_header);
        }

        /* if ethernet header is in its own mblk, skip it */
        if (MBLKL(mp) <= skip_len) {
                skip_len -= MBLKL(mp);
                mp = mp->b_cont;
                if (mp == NULL)
                        goto done;
        }

        sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;

        /* compute IP src/dst addresses hash and skip IPv{4,6} header */

        switch (sap) {
        case ETHERTYPE_IP: {
                ipha_t *iphp;

                /*
                 * If the header is not aligned or the header doesn't fit
                 * in the mblk, bail now. Note that this may cause packet
                 * reordering.
                 */
                iphp = (ipha_t *)(mp->b_rptr + skip_len);
                if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
                    !OK_32PTR((char *)iphp))
                        goto done;

                proto = iphp->ipha_protocol;
                skip_len += IPH_HDR_LENGTH(iphp);

                /* Check if the packet is fragmented. */
                ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
                    IPH_OFFSET;

                /*
                 * For fragmented packets, use addresses in addition to
                 * the frag_id to generate the hash in order to get
                 * better distribution.
                 */
                if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
                        uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
                        uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);

                        hash ^= (PKT_HASH_4BYTES(ip_src) ^
                            PKT_HASH_4BYTES(ip_dst));
                        policy &= ~MAC_PKT_HASH_L3;
                }

                if (ip_fragmented) {
                        uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
                        hash ^= PKT_HASH_2BYTES(identp);
                        goto done;
                }
                break;
        }
        case ETHERTYPE_IPV6: {
                ip6_t *ip6hp;
                ip6_frag_t *frag = NULL;
                uint16_t hdr_length;

                /*
                 * If the header is not aligned or the header doesn't fit
                 * in the mblk, bail now. Note that this may cause packet
                 * reordering.
                 */

                ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
                if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
                    !OK_32PTR((char *)ip6hp))
                        goto done;

                if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
                    &proto, &frag))
                        goto done;
                skip_len += hdr_length;

                /*
                 * For fragmented packets, use addresses in addition to
                 * the frag_id to generate the hash in order to get
                 * better distribution.
                 */
                if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
                        uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
                        uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);

                        hash ^= (PKT_HASH_4BYTES(ip_src) ^
                            PKT_HASH_4BYTES(ip_dst));
                        policy &= ~MAC_PKT_HASH_L3;
                }

                if (frag != NULL) {
                        uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
                        hash ^= PKT_HASH_4BYTES(identp);
                        goto done;
                }
                break;
        }
        default:
                goto done;
        }

        if (policy == 0)
                goto done;

        /* if ip header is in its own mblk, skip it */
        if (MBLKL(mp) <= skip_len) {
                skip_len -= MBLKL(mp);
                mp = mp->b_cont;
                if (mp == NULL)
                        goto done;
        }

        /* parse ULP header */
again:
        switch (proto) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_ESP:
        case IPPROTO_SCTP:
                /*
                 * These Internet Protocols are intentionally designed
                 * for hashing from the get-go. Port numbers are in the
                 * first word for transports, SPI is first for ESP.
                 */
                if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
                        goto done;
                hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
                break;

        case IPPROTO_AH: {
                ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
                uint_t ah_length = AH_TOTAL_LEN(ah);

                if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
                        goto done;

                proto = ah->ah_nexthdr;
                skip_len += ah_length;

                /* if AH header is in its own mblk, skip it */
                if (MBLKL(mp) <= skip_len) {
                        skip_len -= MBLKL(mp);
                        mp = mp->b_cont;
                        if (mp == NULL)
                                goto done;
                }

                goto again;
        }
        }

done:
        return (hash);
}