1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2019 Joyent, Inc. 24 * Copyright 2025 Oxide Computer Company 25 */ 26 27 /* 28 * MAC Services Module - misc utilities 29 */ 30 31 #include <sys/types.h> 32 #include <sys/mac.h> 33 #include <sys/mac_impl.h> 34 #include <sys/mac_client_priv.h> 35 #include <sys/mac_client_impl.h> 36 #include <sys/mac_soft_ring.h> 37 #include <sys/strsubr.h> 38 #include <sys/strsun.h> 39 #include <sys/vlan.h> 40 #include <sys/pattr.h> 41 #include <sys/pci_tools.h> 42 #include <inet/ip.h> 43 #include <inet/ip_impl.h> 44 #include <inet/ip6.h> 45 #include <sys/vtrace.h> 46 #include <sys/dlpi.h> 47 #include <sys/sunndi.h> 48 #include <inet/ipsec_impl.h> 49 #include <inet/sadb.h> 50 #include <inet/ipsecesp.h> 51 #include <inet/ipsecah.h> 52 #include <inet/tcp.h> 53 #include <inet/sctp_ip.h> 54 55 /* 56 * The next two functions are used for dropping packets or chains of 57 * packets, respectively. We could use one function for both but 58 * separating the use cases allows us to specify intent and prevent 59 * dropping more data than intended. 
60 * 61 * The purpose of these functions is to aid the debugging effort, 62 * especially in production. Rather than use freemsg()/freemsgchain(), 63 * it's preferable to use these functions when dropping a packet in 64 * the MAC layer. These functions should only be used during 65 * unexpected conditions. That is, any time a packet is dropped 66 * outside of the regular, successful datapath. Consolidating all 67 * drops on these functions allows the user to trace one location and 68 * determine why the packet was dropped based on the msg. It also 69 * allows the user to inspect the packet before it is freed. Finally, 70 * it allows the user to avoid tracing freemsg()/freemsgchain() thus 71 * keeping the hot path running as efficiently as possible. 72 * 73 * NOTE: At this time not all MAC drops are aggregated on these 74 * functions; but that is the plan. This comment should be erased once 75 * completed. 76 */ 77 78 /*PRINTFLIKE2*/ 79 void 80 mac_drop_pkt(mblk_t *mp, const char *fmt, ...) 81 { 82 va_list adx; 83 char msg[128]; 84 char *msgp = msg; 85 86 ASSERT3P(mp->b_next, ==, NULL); 87 88 va_start(adx, fmt); 89 (void) vsnprintf(msgp, sizeof (msg), fmt, adx); 90 va_end(adx); 91 92 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); 93 freemsg(mp); 94 } 95 96 /*PRINTFLIKE2*/ 97 void 98 mac_drop_chain(mblk_t *chain, const char *fmt, ...) 99 { 100 va_list adx; 101 char msg[128]; 102 char *msgp = msg; 103 104 va_start(adx, fmt); 105 (void) vsnprintf(msgp, sizeof (msg), fmt, adx); 106 va_end(adx); 107 108 /* 109 * We could use freemsgchain() for the actual freeing but 110 * since we are already walking the chain to fire the dtrace 111 * probe we might as well free the msg here too. 112 */ 113 for (mblk_t *mp = chain, *next; mp != NULL; ) { 114 next = mp->b_next; 115 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); 116 mp->b_next = NULL; 117 freemsg(mp); 118 mp = next; 119 } 120 } 121 122 /* 123 * Copy an mblk, preserving its hardware checksum flags. 
 */
static mblk_t *
mac_copymsg_cksum(mblk_t *mp)
{
	mblk_t *mp1;

	mp1 = copymsg(mp);
	if (mp1 == NULL)
		return (NULL);

	/* Carry the HW checksum metadata over to the copy. */
	mac_hcksum_clone(mp, mp1);

	return (mp1);
}

/*
 * Copy an mblk chain, preserving the hardware checksum flags of the
 * individual mblks.
 */
mblk_t *
mac_copymsgchain_cksum(mblk_t *mp)
{
	mblk_t *nmp = NULL;
	mblk_t **nmpp = &nmp;

	for (; mp != NULL; mp = mp->b_next) {
		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
			/* On allocation failure, free all copies so far. */
			freemsgchain(nmp);
			return (NULL);
		}

		nmpp = &((*nmpp)->b_next);
	}

	return (nmp);
}

/*
 * Perform software checksum on a single message, if needed. The emulation
 * performed is determined by an intersection of the mblk's flags and the emul
 * flags requested. The emul flags are documented in mac.h.
 *
 * Returns the (possibly replaced) mblk on success, or NULL after
 * dropping the packet on failure.
 */
static mblk_t *
mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
	mac_ether_offload_info_t meoi = { 0 };
	const char *err = "";

	/*
	 * The only current caller is mac_hw_emul(), which handles any chaining
	 * of mblks prior to now.
	 */
	VERIFY3P(mp->b_next, ==, NULL);

	uint32_t flags = DB_CKSUMFLAGS(mp);

	/* Why call this if checksum emulation isn't needed? */
	ASSERT3U(flags & (HCK_FLAGS), !=, 0);
	/* But also, requesting both ULP cksum types is improper */
	if ((flags & HCK_FULLCKSUM) != 0 && (flags & HCK_PARTIALCKSUM) != 0) {
		err = "full and partial ULP cksum requested";
		goto bail;
	}

	/* Which emulations were both requested and flagged on the mblk? */
	const boolean_t do_v4_cksum = (emul & MAC_IPCKSUM_EMUL) != 0 &&
	    (flags & HCK_IPV4_HDRCKSUM) != 0;
	const boolean_t do_ulp_cksum = (emul & MAC_HWCKSUM_EMUL) != 0 &&
	    (flags & (HCK_FULLCKSUM | HCK_PARTIALCKSUM)) != 0;
	const boolean_t ulp_prefer_partial = (flags & HCK_PARTIALCKSUM) != 0;

	mac_ether_offload_info(mp, &meoi);
	if ((meoi.meoi_flags & MEOI_L2INFO_SET) == 0 ||
	    (meoi.meoi_l3proto != ETHERTYPE_IP &&
	    meoi.meoi_l3proto != ETHERTYPE_IPV6)) {
		/* Non-IP traffic (like ARP) is left alone */
		return (mp);
	}

	/*
	 * Ensure that requested checksum type(s) are supported by the
	 * protocols encoded in the packet headers.
	 */
	if (do_v4_cksum) {
		if (meoi.meoi_l3proto != ETHERTYPE_IP) {
			err = "IPv4 csum requested on non-IPv4 packet";
			goto bail;
		}
	}
	if (do_ulp_cksum) {
		if ((meoi.meoi_flags & MEOI_L4INFO_SET) == 0) {
			err = "missing ULP header";
			goto bail;
		}
		switch (meoi.meoi_l4proto) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_ICMP:
		case IPPROTO_ICMPV6:
		case IPPROTO_SCTP:
			break;
		default:
			err = "unexpected ULP";
			goto bail;
		}
	}

	/*
	 * If the first mblk of this packet contains only the Ethernet header,
	 * skip past it for now. Packets with their data contained in only a
	 * single mblk can then use the fastpaths tuned to that possibility.
	 */
	mblk_t *skipped_hdr = NULL;
	if (MBLKL(mp) == meoi.meoi_l2hlen) {
		meoi.meoi_len -= meoi.meoi_l2hlen;
		meoi.meoi_l2hlen = 0;
		skipped_hdr = mp;
		mp = mp->b_cont;

		ASSERT(mp != NULL);
	}

	/*
	 * Ensure that all of the headers we need to access are:
	 * 1. Collected in the first mblk
	 * 2. Held in a data-block which is safe for us to modify
	 *    (It must have a refcount of 1)
	 */
	const size_t hdr_len_reqd = (meoi.meoi_l2hlen + meoi.meoi_l3hlen) +
	    (do_ulp_cksum ? meoi.meoi_l4hlen : 0);
	if (MBLKL(mp) < hdr_len_reqd || DB_REF(mp) > 1) {
		mblk_t *hdrmp = msgpullup(mp, hdr_len_reqd);

		if (hdrmp == NULL) {
			err = "could not pullup msg headers";
			goto bail;
		}

		/* Preserve the HW checksum metadata on the new mblk. */
		mac_hcksum_clone(mp, hdrmp);
		if (skipped_hdr != NULL) {
			ASSERT3P(skipped_hdr->b_cont, ==, mp);
			skipped_hdr->b_cont = hdrmp;
		}
		freemsg(mp);
		mp = hdrmp;
	}

	/* Calculate IPv4 header checksum, if requested */
	if (do_v4_cksum) {
		/*
		 * While unlikely, it's possible to write code that might end up
		 * calling mac_sw_cksum() twice on the same mblk (performing
		 * both LSO and checksum emulation in a single mblk chain loop
		 * -- the LSO emulation inserts a new chain into the existing
		 * chain and then the loop iterates back over the new segments
		 * and emulates the checksum a second time). Normally this
		 * wouldn't be a problem, because the HCK_*_OK flags are
		 * supposed to indicate that we don't need to perform the
		 * work. But HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
		 * same value; so we cannot use these flags to determine if the
		 * IP header checksum has already been calculated or not. For
		 * this reason, we zero out the checksum first. In the
		 * future, we should fix the HCK_* flags.
		 */
		ipha_t *ipha = (ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
		flags &= ~HCK_IPV4_HDRCKSUM;
		flags |= HCK_IPV4_HDRCKSUM_OK;
	}

	/*
	 * The SCTP is different from all the other protocols in that it uses
	 * CRC32 for its checksum, rather than ones' complement.
	 */
	if (do_ulp_cksum && meoi.meoi_l4proto == IPPROTO_SCTP) {
		if (ulp_prefer_partial) {
			err = "SCTP does not support partial checksum";
			goto bail;
		}

		const uint_t ulp_off = meoi.meoi_l2hlen + meoi.meoi_l3hlen;
		sctp_hdr_t *sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_off);

		/* Zero the field before computing over the packet. */
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_off);

		flags &= ~HCK_FULLCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		goto success;
	}

	/* Calculate full ULP checksum, if requested */
	if (do_ulp_cksum && !ulp_prefer_partial) {
		/*
		 * Calculate address and length portions of pseudo-header csum
		 */
		uint32_t cksum = 0;
		if (meoi.meoi_l3proto == ETHERTYPE_IP) {
			const ipha_t *ipha =
			    (const ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
			const uint16_t *ipp =
			    (const uint16_t *)(&ipha->ipha_src);

			/* src (2 words) and dst (2 words) addresses */
			cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3];

			/*
			 * While it is tempting to calculate the payload length
			 * solely from `meoi`, like as done below for IPv6,
			 * doing so is a trap. Packets shorter than 60 bytes
			 * will get padded out to that length in order to meet
			 * the minimums for Ethernet. Instead, we pull the
			 * length from the IP header.
			 */
			const uint16_t payload_len =
			    ntohs(ipha->ipha_length) - meoi.meoi_l3hlen;
			cksum += htons(payload_len);
		} else if (meoi.meoi_l3proto == ETHERTYPE_IPV6) {
			const ip6_t *ip6h =
			    (const ip6_t *)(mp->b_rptr + meoi.meoi_l2hlen);
			const uint16_t *ipp =
			    (const uint16_t *)(&ip6h->ip6_src);

			/* src (8 words) and dst (8 words) addresses */
			cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3] +
			    ipp[4] + ipp[5] + ipp[6] + ipp[7];
			cksum += ipp[8] + ipp[9] + ipp[10] + ipp[11] +
			    ipp[12] + ipp[13] + ipp[14] + ipp[15];

			const uint16_t payload_len = meoi.meoi_len -
			    ((uint16_t)meoi.meoi_l2hlen + meoi.meoi_l3hlen);
			cksum += htons(payload_len);
		} else {
			/*
			 * Since we already checked for recognized L3 protocols
			 * earlier, this should not be reachable.
			 */
			panic("L3 protocol unexpectedly changed");
		}

		/* protocol portion of pseudo-header */
		uint_t cksum_off;
		switch (meoi.meoi_l4proto) {
		case IPPROTO_TCP:
			cksum += IP_TCP_CSUM_COMP;
			cksum_off = TCP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_UDP:
			cksum += IP_UDP_CSUM_COMP;
			cksum_off = UDP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_ICMP:
			/* ICMP cksum does not include pseudo-header contents */
			cksum = 0;
			cksum_off = ICMP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_ICMPV6:
			cksum += IP_ICMPV6_CSUM_COMP;
			cksum_off = ICMPV6_CHECKSUM_OFFSET;
			break;
		default:
			err = "unrecognized L4 protocol";
			goto bail;
		}

		/*
		 * With IP_CSUM() taking into account the pseudo-header
		 * checksum, make sure the ULP checksum field is zeroed before
		 * computing the rest.
		 */
		const uint_t l4_off = meoi.meoi_l3hlen + meoi.meoi_l2hlen;
		uint16_t *up = (uint16_t *)(mp->b_rptr + l4_off + cksum_off);
		*up = 0;
		cksum = IP_CSUM(mp, l4_off, cksum);

		if (meoi.meoi_l4proto == IPPROTO_UDP && cksum == 0) {
			/*
			 * A zero checksum is not allowed on UDPv6, and on UDPv4
			 * implies no checksum. In either case, invert to a
			 * value of all ones.
			 */
			*up = 0xffff;
		} else {
			*up = cksum;
		}

		flags &= ~HCK_FULLCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		goto success;
	}

	/* Calculate partial ULP checksum, if requested */
	if (do_ulp_cksum && ulp_prefer_partial) {
		uint32_t start, stuff, end, value;
		mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);

		ASSERT3S(end, >, start);

		/*
		 * The prior size checks against the header length data ensure
		 * that the mblk contains everything through at least the ULP
		 * header, but if the partial checksum (unexpectedly) requests
		 * its result be stored past that, we cannot continue.
		 */
		if (stuff + sizeof (uint16_t) > MBLKL(mp)) {
			err = "partial csum request is out of bounds";
			goto bail;
		}

		/* start/stuff offsets are relative to the L3 header. */
		uchar_t *ipp = (uchar_t *)(mp->b_rptr + meoi.meoi_l2hlen);
		uint16_t *up = (uint16_t *)(ipp + stuff);

		const uint16_t partial = *up;
		*up = 0;
		const uint16_t cksum =
		    ~IP_CSUM_PARTIAL(mp, start + meoi.meoi_l2hlen, partial);
		/* Avoid storing 0; an all-ones value is equivalent. */
		*up = cksum != 0 ? cksum : ~cksum;

		/* The stored result is now a complete checksum. */
		flags &= ~HCK_PARTIALCKSUM;
		flags |= HCK_FULLCKSUM_OK;
	}

success:
	/*
	 * With the checksum(s) calculated, store the updated flags to reflect
	 * the current status, and zero out any of the partial-checksum fields
	 * which would be irrelevant now.
	 */
	mac_hcksum_set(mp, 0, 0, 0, 0, flags);

	/* Don't forget to reattach the header. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);

		/*
		 * Duplicate the HCKSUM data into the header mblk.
		 *
		 * This mimics mac_add_vlan_tag() which ensures that both the
		 * first mblk _and_ the first data bearing mblk possess the
		 * HCKSUM information. Consumers like IP will end up discarding
		 * the ether_header mblk, so for now, it is important that the
		 * data be available in both places.
		 */
		mac_hcksum_clone(mp, skipped_hdr);
		mp = skipped_hdr;
	}
	return (mp);

bail:
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);
		mp = skipped_hdr;
	}

	mac_drop_pkt(mp, err);
	return (NULL);
}

/*
 * Build a single data segment from an LSO packet. The mblk chain
 * returned, seg_head, represents the data segment and is always
 * exactly seg_len bytes long. The lso_mp and offset input/output
 * parameters track our position in the LSO packet. This function
 * exists solely as a helper to mac_sw_lso().
 *
 * Case A
 *
 *   The current lso_mp is larger than the requested seg_len. The
 *   beginning of seg_head may start at the beginning of lso_mp or
 *   offset into it. In either case, a single mblk is returned, and
 *   *offset is updated to reflect our new position in the current
 *   lso_mp.
 *
 *     +----------------------------+
 *     |  in *lso_mp / out *lso_mp  |
 *     +----------------------------+
 *     ^                        ^
 *     |                        |
 *     |                        |
 *     |                        |
 *     +------------------------+
 *     |         seg_head       |
 *     +------------------------+
 *     ^                        ^
 *     |                        |
 *     in *offset = 0           out *offset = seg_len
 *
 *     |------ seg_len ----|
 *
 *
 *       +------------------------------+
 *       |   in *lso_mp / out *lso_mp   |
 *       +------------------------------+
 *           ^                        ^
 *           |                        |
 *           |                        |
 *           |                        |
 *           +------------------------+
 *           |         seg_head       |
 *           +------------------------+
 *           ^                        ^
 *           |                        |
 *           in *offset = N           out *offset = N + seg_len
 *
 *           |------ seg_len ----|
 *
 *
 * Case B
 *
 *   The requested seg_len consumes exactly the rest of the lso_mp.
 *   I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
 *   The seg_head may start at the beginning of the lso_mp or at some
 *   offset into it. In either case we return a single mblk, reset
 *   *offset to zero, and walk to the next lso_mp.
 *
 *     +------------------------+           +------------------------+
 *     |       in *lso_mp       |---------->|       out *lso_mp      |
 *     +------------------------+           +------------------------+
 *     ^                        ^           ^
 *     |                        |           |
 *     |                        |           out *offset = 0
 *     |                        |
 *     +------------------------+
 *     |        seg_head        |
 *     +------------------------+
 *     ^
 *     |
 *     in *offset = 0
 *
 *     |------ seg_len ----|
 *
 *
 *
 *     +----------------------------+       +------------------------+
 *     |         in *lso_mp         |------>|       out *lso_mp      |
 *     +----------------------------+       +------------------------+
 *         ^                        ^       ^
 *         |                        |       |
 *         |                        |       out *offset = 0
 *         |                        |
 *         +------------------------+
 *         |        seg_head        |
 *         +------------------------+
 *         ^
 *         |
 *         in *offset = N
 *
 *         |------ seg_len ----|
 *
 *
 * Case C
 *
 *   The requested seg_len is greater than the current lso_mp. In
 *   this case we must consume LSO mblks until we have enough data to
 *   satisfy either case (A) or (B) above. We will return multiple
 *   mblks linked via b_cont, offset will be set based on the cases
 *   above, and lso_mp will walk forward at least one mblk, but maybe
 *   more.
 *
 *   N.B. This diagram is not exhaustive. The seg_head may start on
 *   the beginning of an lso_mp. The seg_tail may end exactly on the
 *   boundary of an lso_mp. And there may be two (in this case the
 *   middle block wouldn't exist), three, or more mblks in the
 *   seg_head chain. This is meant as one example of what might
 *   happen. The main thing to remember is that the seg_tail mblk
 *   must be one of case (A) or (B) above.
 *
 *     +------------------+    +----------------+    +------------------+
 *     |    in *lso_mp    |--->|    *lso_mp     |--->|   out *lso_mp    |
 *     +------------------+    +----------------+    +------------------+
 *           ^            ^    ^                ^    ^            ^
 *           |            |    |                |    |            |
 *           |            |    |                |    |            |
 *           |            |    |                |    |            |
 *           |            |    |                |    |            |
 *           +------------+    +----------------+    +------------+
 *           |  seg_head  |--->|                |--->|  seg_tail  |
 *           +------------+    +----------------+    +------------+
 *           ^                                                    ^
 *           |                                                    |
 *           in *offset = N                out *offset = MBLKL(seg_tail)
 *
 *           |------------------- seg_len -------------------|
 *
 */
static mblk_t *
build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
{
	mblk_t *seg_head, *seg_tail, *seg_mp;

	ASSERT3P(*lso_mp, !=, NULL);
	/* The current offset must leave at least one byte to consume. */
	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);

	/* dupb() shares the data block rather than copying it. */
	seg_mp = dupb(*lso_mp);
	if (seg_mp == NULL)
		return (NULL);

	seg_head = seg_mp;
	seg_tail = seg_mp;

	/* Continue where we left off from in the lso_mp. */
	seg_mp->b_rptr += *offset;

last_mblk:
	/* Case (A) */
	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
		*offset += seg_len;
		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
		return (seg_head);
	}

	/* Case (B) */
	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		return (seg_head);
	}

	/* Case (C) */
	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);

	/*
	 * The current LSO mblk doesn't have enough data to satisfy
	 * seg_len -- continue peeling off LSO mblks to build the new
	 * segment message. If allocation fails we free the previously
	 * allocated segment mblks and return NULL.
	 */
	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
		seg_len -= MBLKL(seg_mp);
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		seg_mp = dupb(*lso_mp);

		if (seg_mp == NULL) {
			freemsgchain(seg_head);
			return (NULL);
		}

		seg_tail->b_cont = seg_mp;
		seg_tail = seg_mp;
	}

	/*
	 * We've walked enough LSO mblks that we can now satisfy the
	 * remaining seg_len. At this point we need to jump back to
	 * determine if we have arrived at case (A) or (B).
	 */

	/* Just to be paranoid that we didn't underflow. */
	ASSERT3U(seg_len, <, IP_MAXPACKET);
	ASSERT3U(seg_len, >, 0);
	goto last_mblk;
}

/*
 * Perform software segmentation of a single LSO message. Take an LSO
 * message as input and return head/tail pointers as output. This
 * function should not be invoked directly but instead through
 * mac_hw_emul().
 *
 * The resulting chain is comprised of multiple (nsegs) MSS sized
 * segments. Each segment will consist of two or more mblks joined by
 * b_cont: a header and one or more data mblks. The header mblk is
 * allocated anew for each message. The first segment's header is used
 * as a template for the rest with adjustments made for things such as
 * ID, sequence, length, TCP flags, etc. The data mblks reference into
 * the existing LSO mblk (passed in as omp) by way of dupb(). Their
 * b_rptr/b_wptr values are adjusted to reference only the fraction of
 * the LSO message they are responsible for. At the successful
 * completion of this function the original mblk (omp) is freed,
 * leaving the newly created segment chain as the only remaining
 * reference to the data.
696 */ 697 static void 698 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, 699 uint_t *count) 700 { 701 uint32_t ocsum_flags, ocsum_start, ocsum_stuff; 702 uint32_t mss; 703 uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; 704 uint32_t oleft; 705 uint_t nsegs, seg; 706 int len; 707 708 struct ether_vlan_header *oevh; 709 const ipha_t *oiph; 710 const tcph_t *otcph; 711 ipha_t *niph; 712 tcph_t *ntcph; 713 uint16_t ip_id; 714 uint32_t tcp_seq, tcp_sum, otcp_sum; 715 716 uint32_t offset; 717 mblk_t *odatamp; 718 mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; 719 mblk_t *tmptail; 720 721 ASSERT3P(head, !=, NULL); 722 ASSERT3P(tail, !=, NULL); 723 ASSERT3P(count, !=, NULL); 724 ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); 725 726 /* Assume we are dealing with a single LSO message. */ 727 ASSERT3P(omp->b_next, ==, NULL); 728 729 /* 730 * XXX: This is a hack to deal with mac_add_vlan_tag(). 731 * 732 * When VLANs are in play, mac_add_vlan_tag() creates a new 733 * mblk with just the ether_vlan_header and tacks it onto the 734 * front of 'omp'. This breaks the assumptions made below; 735 * namely that the TCP/IP headers are in the first mblk. In 736 * this case, since we already have to pay the cost of LSO 737 * emulation, we simply pull up everything. While this might 738 * seem irksome, keep in mind this will only apply in a couple 739 * of scenarios: a) an LSO-capable VLAN client sending to a 740 * non-LSO-capable client over the "MAC/bridge loopback" 741 * datapath or b) an LSO-capable VLAN client is sending to a 742 * client that, for whatever reason, doesn't have DLS-bypass 743 * enabled. Finally, we have to check for both a tagged and 744 * untagged sized mblk depending on if the mblk came via 745 * mac_promisc_dispatch() or mac_rx_deliver(). 746 * 747 * In the future, two things should be done: 748 * 749 * 1. This function should make use of some yet to be 750 * implemented "mblk helpers". 
These helper functions would 751 * perform all the b_cont walking for us and guarantee safe 752 * access to the mblk data. 753 * 754 * 2. We should add some slop to the mblks so that 755 * mac_add_vlan_tag() can just edit the first mblk instead 756 * of allocating on the hot path. 757 */ 758 if (MBLKL(omp) == sizeof (struct ether_vlan_header) || 759 MBLKL(omp) == sizeof (struct ether_header)) { 760 mblk_t *tmp = msgpullup(omp, -1); 761 762 if (tmp == NULL) { 763 mac_drop_pkt(omp, "failed to pull up"); 764 goto fail; 765 } 766 767 mac_hcksum_clone(omp, tmp); 768 freemsg(omp); 769 omp = tmp; 770 } 771 772 mss = DB_LSOMSS(omp); 773 ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + 774 sizeof (struct ether_vlan_header)); 775 opktlen = msgsize(omp); 776 777 /* 778 * First, get references to the IP and TCP headers and 779 * determine the total TCP length (header + data). 780 * 781 * Thanks to mac_hw_emul() we know that the first mblk must 782 * contain (at minimum) the full L2 header. However, this 783 * function assumes more than that. It assumes the L2/L3/L4 784 * headers are all contained in the first mblk of a message 785 * (i.e., no b_cont walking for headers). While this is a 786 * current reality (our native TCP stack and viona both 787 * enforce this) things may become more nuanced in the future 788 * (e.g. when introducing encap support or adding new 789 * clients). For now we guard against this case by dropping 790 * the packet. 
791 */ 792 oevh = (struct ether_vlan_header *)omp->b_rptr; 793 if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) 794 oehlen = sizeof (struct ether_vlan_header); 795 else 796 oehlen = sizeof (struct ether_header); 797 798 ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); 799 if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { 800 mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); 801 goto fail; 802 } 803 804 oiph = (ipha_t *)(omp->b_rptr + oehlen); 805 oiphlen = IPH_HDR_LENGTH(oiph); 806 otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); 807 otcphlen = TCP_HDR_LENGTH(otcph); 808 809 /* 810 * Currently we only support LSO for TCP/IPv4. 811 */ 812 if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { 813 mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", 814 IPH_HDR_VERSION(oiph)); 815 goto fail; 816 } 817 818 if (oiph->ipha_protocol != IPPROTO_TCP) { 819 mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", 820 oiph->ipha_protocol); 821 goto fail; 822 } 823 824 if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { 825 mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); 826 goto fail; 827 } 828 829 ohdrslen = oehlen + oiphlen + otcphlen; 830 if ((len = MBLKL(omp)) < ohdrslen) { 831 mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, 832 ohdrslen); 833 goto fail; 834 } 835 836 /* 837 * Either we have data in the first mblk or it's just the 838 * header. In either case, we need to set rptr to the start of 839 * the TCP data. 840 */ 841 if (len > ohdrslen) { 842 odatamp = omp; 843 offset = ohdrslen; 844 } else { 845 ASSERT3U(len, ==, ohdrslen); 846 odatamp = omp->b_cont; 847 offset = 0; 848 } 849 850 /* Make sure we still have enough data. 
*/ 851 ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); 852 853 /* 854 * If a MAC negotiated LSO then it must negotioate both 855 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or 856 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers 857 * change during LSO segmentation (only the 3 fields of the 858 * pseudo header checksum don't change: src, dst, proto). Thus 859 * we would expect these flags (HCK_IPV4_HDRCKSUM | 860 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this 861 * function to emulate those checksums in software. However, 862 * that assumes a world where we only expose LSO if the 863 * underlying hardware exposes LSO. Moving forward the plan is 864 * to assume LSO in the upper layers and have MAC perform 865 * software LSO when the underlying provider doesn't support 866 * it. In such a world, if the provider doesn't support LSO 867 * but does support hardware checksum offload, then we could 868 * simply perform the segmentation and allow the hardware to 869 * calculate the checksums. To the hardware it's just another 870 * chain of non-LSO packets. 871 */ 872 ASSERT3S(DB_TYPE(omp), ==, M_DATA); 873 ocsum_flags = DB_CKSUMFLAGS(omp); 874 ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); 875 ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); 876 877 /* 878 * If hardware only provides partial checksum then software 879 * must supply the pseudo-header checksum. In the case of LSO 880 * we leave the TCP length at zero to be filled in by 881 * hardware. This function must handle two scenarios. 882 * 883 * 1. Being called by a MAC client on the Rx path to segment 884 * an LSO packet and calculate the checksum. 885 * 886 * 2. Being called by a MAC provider to segment an LSO packet. 887 * In this case the LSO segmentation is performed in 888 * software (by this routine) but the MAC provider should 889 * still calculate the TCP/IP checksums in hardware. 
890 * 891 * To elaborate on the second case: we cannot have the 892 * scenario where IP sends LSO packets but the underlying HW 893 * doesn't support checksum offload -- because in that case 894 * TCP/IP would calculate the checksum in software (for the 895 * LSO packet) but then MAC would segment the packet and have 896 * to redo all the checksum work. So IP should never do LSO 897 * if HW doesn't support both IP and TCP checksum. 898 */ 899 if (ocsum_flags & HCK_PARTIALCKSUM) { 900 ocsum_start = (uint32_t)DB_CKSUMSTART(omp); 901 ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); 902 } 903 904 odatalen = opktlen - ohdrslen; 905 906 /* 907 * Subtract one to account for the case where the data length 908 * is evenly divisble by the MSS. Add one to account for the 909 * fact that the division will always result in one less 910 * segment than needed. 911 */ 912 nsegs = ((odatalen - 1) / mss) + 1; 913 if (nsegs < 2) { 914 mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); 915 goto fail; 916 } 917 918 DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, 919 __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, 920 nsegs); 921 922 seg_chain = NULL; 923 tmptail = seg_chain; 924 oleft = odatalen; 925 926 for (uint_t i = 0; i < nsegs; i++) { 927 boolean_t last_seg = ((i + 1) == nsegs); 928 uint32_t seg_len; 929 930 /* 931 * If we fail to allocate, then drop the partially 932 * allocated chain as well as the LSO packet. Let the 933 * sender deal with the fallout. 934 */ 935 if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { 936 freemsgchain(seg_chain); 937 mac_drop_pkt(omp, "failed to alloc segment header"); 938 goto fail; 939 } 940 ASSERT3P(nhdrmp->b_cont, ==, NULL); 941 942 if (seg_chain == NULL) { 943 seg_chain = nhdrmp; 944 } else { 945 ASSERT3P(tmptail, !=, NULL); 946 tmptail->b_next = nhdrmp; 947 } 948 949 tmptail = nhdrmp; 950 951 /* 952 * Calculate this segment's lengh. It's either the MSS 953 * or whatever remains for the last segment. 
954 */ 955 seg_len = last_seg ? oleft : mss; 956 ASSERT3U(seg_len, <=, mss); 957 ndatamp = build_data_seg(&odatamp, &offset, seg_len); 958 959 if (ndatamp == NULL) { 960 freemsgchain(seg_chain); 961 mac_drop_pkt(omp, "LSO failed to segment data"); 962 goto fail; 963 } 964 965 /* Attach data mblk to header mblk. */ 966 nhdrmp->b_cont = ndatamp; 967 DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; 968 ASSERT3U(seg_len, <=, oleft); 969 oleft -= seg_len; 970 } 971 972 /* We should have consumed entire LSO msg. */ 973 ASSERT3S(oleft, ==, 0); 974 ASSERT3P(odatamp, ==, NULL); 975 976 /* 977 * All seg data mblks are referenced by the header mblks, null 978 * out this pointer to catch any bad derefs. 979 */ 980 ndatamp = NULL; 981 982 /* 983 * Set headers and checksum for first segment. 984 */ 985 nhdrmp = seg_chain; 986 bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); 987 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 988 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 989 ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); 990 niph->ipha_length = htons(oiphlen + otcphlen + mss); 991 niph->ipha_hdr_checksum = 0; 992 ip_id = ntohs(niph->ipha_ident); 993 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 994 tcp_seq = BE32_TO_U32(ntcph->th_seq); 995 tcp_seq += mss; 996 997 /* 998 * The first segment shouldn't: 999 * 1000 * o indicate end of data transmission (FIN), 1001 * o indicate immediate handling of the data (PUSH). 1002 */ 1003 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); 1004 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1005 1006 /* 1007 * If the underlying HW provides partial checksum, then make 1008 * sure to correct the pseudo header checksum before calling 1009 * mac_sw_cksum(). The native TCP stack doesn't include the 1010 * length field in the pseudo header when LSO is in play -- so 1011 * we need to calculate it here. 
1012 */ 1013 if (ocsum_flags & HCK_PARTIALCKSUM) { 1014 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1015 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1016 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1017 tcp_sum = BE16_TO_U16(ntcph->th_sum); 1018 otcp_sum = tcp_sum; 1019 tcp_sum += mss + otcphlen; 1020 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); 1021 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1022 } 1023 1024 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1025 (emul & MAC_HWCKSUM_EMULS)) { 1026 next_nhdrmp = nhdrmp->b_next; 1027 nhdrmp->b_next = NULL; 1028 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1029 nhdrmp->b_next = next_nhdrmp; 1030 next_nhdrmp = NULL; 1031 1032 /* 1033 * We may have freed the nhdrmp argument during 1034 * checksum emulation, make sure that seg_chain 1035 * references a valid mblk. 1036 */ 1037 seg_chain = nhdrmp; 1038 } 1039 1040 ASSERT3P(nhdrmp, !=, NULL); 1041 1042 seg = 1; 1043 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1044 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1045 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, 1046 uint_t, seg); 1047 seg++; 1048 1049 /* There better be at least 2 segs. */ 1050 ASSERT3P(nhdrmp->b_next, !=, NULL); 1051 prev_nhdrmp = nhdrmp; 1052 nhdrmp = nhdrmp->b_next; 1053 1054 /* 1055 * Now adjust the headers of the middle segments. For each 1056 * header we need to adjust the following. 1057 * 1058 * o IP ID 1059 * o IP length 1060 * o TCP sequence 1061 * o TCP flags 1062 * o cksum flags 1063 * o cksum values (if MAC_HWCKSUM_EMUL is set) 1064 */ 1065 for (; seg < nsegs; seg++) { 1066 /* 1067 * We use seg_chain as a reference to the first seg 1068 * header mblk -- this first header is a template for 1069 * the rest of the segments. This copy will include 1070 * the now updated checksum values from the first 1071 * header. We must reset these checksum values to 1072 * their original to make sure we produce the correct 1073 * value. 
1074 */ 1075 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); 1076 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1077 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1078 niph->ipha_ident = htons(++ip_id); 1079 ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); 1080 niph->ipha_length = htons(oiphlen + otcphlen + mss); 1081 niph->ipha_hdr_checksum = 0; 1082 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1083 U32_TO_BE32(tcp_seq, ntcph->th_seq); 1084 tcp_seq += mss; 1085 /* 1086 * Just like the first segment, the middle segments 1087 * shouldn't have these flags set. 1088 */ 1089 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); 1090 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1091 1092 if (ocsum_flags & HCK_PARTIALCKSUM) { 1093 /* 1094 * First and middle segs have same 1095 * pseudo-header checksum. 1096 */ 1097 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1098 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1099 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1100 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1101 } 1102 1103 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1104 (emul & MAC_HWCKSUM_EMULS)) { 1105 next_nhdrmp = nhdrmp->b_next; 1106 nhdrmp->b_next = NULL; 1107 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1108 nhdrmp->b_next = next_nhdrmp; 1109 next_nhdrmp = NULL; 1110 /* We may have freed the original nhdrmp. */ 1111 prev_nhdrmp->b_next = nhdrmp; 1112 } 1113 1114 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1115 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1116 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), 1117 uint_t, mss, uint_t, seg); 1118 1119 ASSERT3P(nhdrmp->b_next, !=, NULL); 1120 prev_nhdrmp = nhdrmp; 1121 nhdrmp = nhdrmp->b_next; 1122 } 1123 1124 /* Make sure we are on the last segment. */ 1125 ASSERT3U(seg, ==, nsegs); 1126 ASSERT3P(nhdrmp->b_next, ==, NULL); 1127 1128 /* 1129 * Now we set the last segment header. The difference being 1130 * that FIN/PSH/RST flags are allowed. 
1131 */ 1132 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); 1133 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1134 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1135 niph->ipha_ident = htons(++ip_id); 1136 len = msgsize(nhdrmp->b_cont); 1137 ASSERT3S(len, >, 0); 1138 niph->ipha_length = htons(oiphlen + otcphlen + len); 1139 niph->ipha_hdr_checksum = 0; 1140 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1141 U32_TO_BE32(tcp_seq, ntcph->th_seq); 1142 1143 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1144 if (ocsum_flags & HCK_PARTIALCKSUM) { 1145 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1146 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1147 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1148 tcp_sum = otcp_sum; 1149 tcp_sum += len + otcphlen; 1150 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); 1151 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1152 } 1153 1154 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1155 (emul & MAC_HWCKSUM_EMULS)) { 1156 /* This should be the last mblk. */ 1157 ASSERT3P(nhdrmp->b_next, ==, NULL); 1158 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1159 prev_nhdrmp->b_next = nhdrmp; 1160 } 1161 1162 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1163 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1164 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, 1165 uint_t, seg); 1166 1167 /* 1168 * Free the reference to the original LSO message as it is 1169 * being replaced by seg_cahin. 1170 */ 1171 freemsg(omp); 1172 *head = seg_chain; 1173 *tail = nhdrmp; 1174 *count = nsegs; 1175 return; 1176 1177 fail: 1178 *head = NULL; 1179 *tail = NULL; 1180 *count = 0; 1181 } 1182 1183 #define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) 1184 1185 /* 1186 * Emulate various hardware offload features in software. Take a chain 1187 * of packets as input and emulate the hardware features specified in 1188 * 'emul'. 
The resulting chain's head pointer replaces the 'mp_chain' 1189 * pointer given as input, and its tail pointer is written to 1190 * '*otail'. The number of packets in the new chain is written to 1191 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus 1192 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in 1193 * which case 'mp_chain' will simply stay a NULL chain. 1194 * 1195 * While unlikely, it is technically possible that this function could 1196 * receive a non-NULL chain as input and return a NULL chain as output 1197 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be 1198 * zero). This could happen if all the packets in the chain are 1199 * dropped or if we fail to allocate new mblks. In this case, there is 1200 * nothing for the caller to free. In any event, the caller shouldn't 1201 * assume that '*mp_chain' is non-NULL on return. 1202 * 1203 * This function was written with three main use cases in mind. 1204 * 1205 * 1. To emulate hardware offloads when traveling mac-loopback (two 1206 * clients on the same mac). This is wired up in mac_tx_send(). 1207 * 1208 * 2. To provide hardware offloads to the client when the underlying 1209 * provider cannot. This is currently wired up in mac_tx() but we 1210 * still only negotiate offloads when the underlying provider 1211 * supports them. 1212 * 1213 * 3. To emulate real hardware in simnet. 1214 */ 1215 void 1216 mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) 1217 { 1218 mblk_t *head = NULL, *tail = NULL; 1219 uint_t count = 0; 1220 1221 ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); 1222 ASSERT3P(mp_chain, !=, NULL); 1223 1224 for (mblk_t *mp = *mp_chain; mp != NULL; ) { 1225 mblk_t *tmp, *next, *tmphead, *tmptail; 1226 struct ether_header *ehp; 1227 uint32_t flags; 1228 uint_t len = MBLKL(mp), l2len; 1229 1230 /* Perform LSO/cksum one message at a time. 
*/ 1231 next = mp->b_next; 1232 mp->b_next = NULL; 1233 1234 /* 1235 * For our sanity the first mblk should contain at 1236 * least the full L2 header. 1237 */ 1238 if (len < sizeof (struct ether_header)) { 1239 mac_drop_pkt(mp, "packet too short (A): %u", len); 1240 mp = next; 1241 continue; 1242 } 1243 1244 ehp = (struct ether_header *)mp->b_rptr; 1245 if (ntohs(ehp->ether_type) == VLAN_TPID) 1246 l2len = sizeof (struct ether_vlan_header); 1247 else 1248 l2len = sizeof (struct ether_header); 1249 1250 /* 1251 * If the first mblk is solely the L2 header, then 1252 * there better be more data. 1253 */ 1254 if (len < l2len || (len == l2len && mp->b_cont == NULL)) { 1255 mac_drop_pkt(mp, "packet too short (C): %u", len); 1256 mp = next; 1257 continue; 1258 } 1259 1260 DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); 1261 1262 /* 1263 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) 1264 * because we don't want to mask-out the LSO flag. 1265 */ 1266 flags = DB_CKSUMFLAGS(mp); 1267 1268 if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { 1269 uint_t tmpcount = 0; 1270 1271 /* 1272 * LSO fix-up handles checksum emulation 1273 * inline (if requested). It also frees mp. 1274 */ 1275 mac_sw_lso(mp, emul, &tmphead, &tmptail, 1276 &tmpcount); 1277 if (tmphead == NULL) { 1278 /* mac_sw_lso() freed the mp. */ 1279 mp = next; 1280 continue; 1281 } 1282 count += tmpcount; 1283 } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { 1284 tmp = mac_sw_cksum(mp, emul); 1285 if (tmp == NULL) { 1286 /* mac_sw_cksum() freed the mp. */ 1287 mp = next; 1288 continue; 1289 } 1290 tmphead = tmp; 1291 tmptail = tmp; 1292 count++; 1293 } else { 1294 /* There is nothing to emulate. */ 1295 tmp = mp; 1296 tmphead = tmp; 1297 tmptail = tmp; 1298 count++; 1299 } 1300 1301 /* 1302 * The tmp mblk chain is either the start of the new 1303 * chain or added to the tail of the new chain. 
1304 */ 1305 if (head == NULL) { 1306 head = tmphead; 1307 tail = tmptail; 1308 } else { 1309 /* Attach the new mblk to the end of the new chain. */ 1310 tail->b_next = tmphead; 1311 tail = tmptail; 1312 } 1313 1314 mp = next; 1315 } 1316 1317 *mp_chain = head; 1318 1319 if (otail != NULL) 1320 *otail = tail; 1321 1322 if (ocount != NULL) 1323 *ocount = count; 1324 } 1325 1326 /* 1327 * Add VLAN tag to the specified mblk. 1328 */ 1329 mblk_t * 1330 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) 1331 { 1332 mblk_t *hmp; 1333 struct ether_vlan_header *evhp; 1334 struct ether_header *ehp; 1335 1336 ASSERT(pri != 0 || vid != 0); 1337 1338 /* 1339 * Allocate an mblk for the new tagged ethernet header, 1340 * and copy the MAC addresses and ethertype from the 1341 * original header. 1342 */ 1343 1344 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); 1345 if (hmp == NULL) { 1346 freemsg(mp); 1347 return (NULL); 1348 } 1349 1350 evhp = (struct ether_vlan_header *)hmp->b_rptr; 1351 ehp = (struct ether_header *)mp->b_rptr; 1352 1353 bcopy(ehp, evhp, (ETHERADDRL * 2)); 1354 evhp->ether_type = ehp->ether_type; 1355 evhp->ether_tpid = htons(ETHERTYPE_VLAN); 1356 1357 hmp->b_wptr += sizeof (struct ether_vlan_header); 1358 mp->b_rptr += sizeof (struct ether_header); 1359 1360 /* 1361 * Free the original message if it's now empty. Link the 1362 * rest of messages to the header message. 1363 */ 1364 mac_hcksum_clone(mp, hmp); 1365 if (MBLKL(mp) == 0) { 1366 hmp->b_cont = mp->b_cont; 1367 freeb(mp); 1368 } else { 1369 hmp->b_cont = mp; 1370 } 1371 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); 1372 1373 /* 1374 * Initialize the new TCI (Tag Control Information). 1375 */ 1376 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); 1377 1378 return (hmp); 1379 } 1380 1381 /* 1382 * Adds a VLAN tag with the specified VID and priority to each mblk of 1383 * the specified chain. 
1384 */ 1385 mblk_t * 1386 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) 1387 { 1388 mblk_t *next_mp, **prev, *mp; 1389 1390 mp = mp_chain; 1391 prev = &mp_chain; 1392 1393 while (mp != NULL) { 1394 next_mp = mp->b_next; 1395 mp->b_next = NULL; 1396 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { 1397 freemsgchain(next_mp); 1398 break; 1399 } 1400 *prev = mp; 1401 prev = &mp->b_next; 1402 mp = mp->b_next = next_mp; 1403 } 1404 1405 return (mp_chain); 1406 } 1407 1408 /* 1409 * Strip VLAN tag 1410 */ 1411 mblk_t * 1412 mac_strip_vlan_tag(mblk_t *mp) 1413 { 1414 mblk_t *newmp; 1415 struct ether_vlan_header *evhp; 1416 1417 evhp = (struct ether_vlan_header *)mp->b_rptr; 1418 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { 1419 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); 1420 1421 if (DB_REF(mp) > 1) { 1422 newmp = copymsg(mp); 1423 if (newmp == NULL) 1424 return (NULL); 1425 freemsg(mp); 1426 mp = newmp; 1427 } 1428 1429 evhp = (struct ether_vlan_header *)mp->b_rptr; 1430 1431 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); 1432 mp->b_rptr += VLAN_TAGSZ; 1433 } 1434 return (mp); 1435 } 1436 1437 /* 1438 * Strip VLAN tag from each mblk of the chain. 1439 */ 1440 mblk_t * 1441 mac_strip_vlan_tag_chain(mblk_t *mp_chain) 1442 { 1443 mblk_t *mp, *next_mp, **prev; 1444 1445 mp = mp_chain; 1446 prev = &mp_chain; 1447 1448 while (mp != NULL) { 1449 next_mp = mp->b_next; 1450 mp->b_next = NULL; 1451 if ((mp = mac_strip_vlan_tag(mp)) == NULL) { 1452 freemsgchain(next_mp); 1453 break; 1454 } 1455 *prev = mp; 1456 prev = &mp->b_next; 1457 mp = mp->b_next = next_mp; 1458 } 1459 1460 return (mp_chain); 1461 } 1462 1463 /* 1464 * Default callback function. Used when the datapath is not yet initialized. 
1465 */ 1466 /* ARGSUSED */ 1467 void 1468 mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain, 1469 boolean_t loopback) 1470 { 1471 freemsgchain(mp_chain); 1472 } 1473 1474 /* 1475 * Determines the IPv6 header length accounting for all the optional IPv6 1476 * headers (hop-by-hop, destination, routing and fragment). The header length 1477 * and next header value (a transport header) is captured. 1478 * 1479 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise 1480 * returns B_TRUE. 1481 */ 1482 boolean_t 1483 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length, 1484 uint8_t *next_hdr, ip6_frag_t **fragp) 1485 { 1486 uint16_t length; 1487 uint_t ehdrlen; 1488 uint8_t *whereptr; 1489 uint8_t *nexthdrp; 1490 ip6_dest_t *desthdr; 1491 ip6_rthdr_t *rthdr; 1492 ip6_frag_t *fraghdr; 1493 1494 if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) 1495 return (B_FALSE); 1496 ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); 1497 length = IPV6_HDR_LEN; 1498 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ 1499 1500 if (fragp != NULL) 1501 *fragp = NULL; 1502 1503 nexthdrp = &ip6h->ip6_nxt; 1504 while (whereptr < endptr) { 1505 /* Is there enough left for len + nexthdr? 
*/ 1506 if (whereptr + MIN_EHDR_LEN > endptr) 1507 break; 1508 1509 switch (*nexthdrp) { 1510 case IPPROTO_HOPOPTS: 1511 case IPPROTO_DSTOPTS: 1512 /* Assumes the headers are identical for hbh and dst */ 1513 desthdr = (ip6_dest_t *)whereptr; 1514 ehdrlen = 8 * (desthdr->ip6d_len + 1); 1515 if ((uchar_t *)desthdr + ehdrlen > endptr) 1516 return (B_FALSE); 1517 nexthdrp = &desthdr->ip6d_nxt; 1518 break; 1519 case IPPROTO_ROUTING: 1520 rthdr = (ip6_rthdr_t *)whereptr; 1521 ehdrlen = 8 * (rthdr->ip6r_len + 1); 1522 if ((uchar_t *)rthdr + ehdrlen > endptr) 1523 return (B_FALSE); 1524 nexthdrp = &rthdr->ip6r_nxt; 1525 break; 1526 case IPPROTO_FRAGMENT: 1527 fraghdr = (ip6_frag_t *)whereptr; 1528 ehdrlen = sizeof (ip6_frag_t); 1529 if ((uchar_t *)&fraghdr[1] > endptr) 1530 return (B_FALSE); 1531 nexthdrp = &fraghdr->ip6f_nxt; 1532 if (fragp != NULL) 1533 *fragp = fraghdr; 1534 break; 1535 case IPPROTO_NONE: 1536 /* No next header means we're finished */ 1537 default: 1538 *hdr_length = length; 1539 *next_hdr = *nexthdrp; 1540 return (B_TRUE); 1541 } 1542 length += ehdrlen; 1543 whereptr += ehdrlen; 1544 *hdr_length = length; 1545 *next_hdr = *nexthdrp; 1546 } 1547 switch (*nexthdrp) { 1548 case IPPROTO_HOPOPTS: 1549 case IPPROTO_DSTOPTS: 1550 case IPPROTO_ROUTING: 1551 case IPPROTO_FRAGMENT: 1552 /* 1553 * If any know extension headers are still to be processed, 1554 * the packet's malformed (or at least all the IP header(s) are 1555 * not in the same mblk - and that should never happen. 1556 */ 1557 return (B_FALSE); 1558 1559 default: 1560 /* 1561 * If we get here, we know that all of the IP headers were in 1562 * the same mblk, even if the ULP header is in the next mblk. 1563 */ 1564 *hdr_length = length; 1565 *next_hdr = *nexthdrp; 1566 return (B_TRUE); 1567 } 1568 } 1569 1570 /* 1571 * The following set of routines are there to take care of interrupt 1572 * re-targeting for legacy (fixed) interrupts. 
Some older versions 1573 * of the popular NICs like e1000g do not support MSI-X interrupts 1574 * and they reserve fixed interrupts for RX/TX rings. To re-target 1575 * these interrupts, PCITOOL ioctls need to be used. 1576 */ 1577 typedef struct mac_dladm_intr { 1578 int ino; 1579 int cpu_id; 1580 char driver_path[MAXPATHLEN]; 1581 char nexus_path[MAXPATHLEN]; 1582 } mac_dladm_intr_t; 1583 1584 /* Bind the interrupt to cpu_num */ 1585 static int 1586 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino) 1587 { 1588 pcitool_intr_set_t iset; 1589 int err; 1590 1591 iset.old_cpu = oldcpuid; 1592 iset.ino = ino; 1593 iset.cpu_id = cpu_num; 1594 iset.user_version = PCITOOL_VERSION; 1595 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL, 1596 kcred, NULL); 1597 1598 return (err); 1599 } 1600 1601 /* 1602 * Search interrupt information. iget is filled in with the info to search 1603 */ 1604 static boolean_t 1605 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) 1606 { 1607 int i; 1608 char driver_path[2 * MAXPATHLEN]; 1609 1610 for (i = 0; i < iget_p->num_devs; i++) { 1611 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); 1612 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, 1613 ":%s%d", iget_p->dev[i].driver_name, 1614 iget_p->dev[i].dev_inst); 1615 /* Match the device path for the device path */ 1616 if (strcmp(driver_path, dln->driver_path) == 0) { 1617 dln->ino = iget_p->ino; 1618 dln->cpu_id = iget_p->cpu_id; 1619 return (B_TRUE); 1620 } 1621 } 1622 return (B_FALSE); 1623 } 1624 1625 /* 1626 * Get information about ino, i.e. if this is the interrupt for our 1627 * device and where it is bound etc. 
1628 */ 1629 static boolean_t 1630 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino, 1631 mac_dladm_intr_t *dln) 1632 { 1633 pcitool_intr_get_t *iget_p; 1634 int ipsz; 1635 int nipsz; 1636 int err; 1637 uint8_t inum; 1638 1639 /* 1640 * Check if SLEEP is OK, i.e if could come here in response to 1641 * changing the fanout due to some callback from the driver, say 1642 * link speed changes. 1643 */ 1644 ipsz = PCITOOL_IGET_SIZE(0); 1645 iget_p = kmem_zalloc(ipsz, KM_SLEEP); 1646 1647 iget_p->num_devs_ret = 0; 1648 iget_p->user_version = PCITOOL_VERSION; 1649 iget_p->cpu_id = oldcpuid; 1650 iget_p->ino = ino; 1651 1652 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, 1653 FKIOCTL, kcred, NULL); 1654 if (err != 0) { 1655 kmem_free(iget_p, ipsz); 1656 return (B_FALSE); 1657 } 1658 if (iget_p->num_devs == 0) { 1659 kmem_free(iget_p, ipsz); 1660 return (B_FALSE); 1661 } 1662 inum = iget_p->num_devs; 1663 if (iget_p->num_devs_ret < iget_p->num_devs) { 1664 /* Reallocate */ 1665 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs); 1666 1667 kmem_free(iget_p, ipsz); 1668 ipsz = nipsz; 1669 iget_p = kmem_zalloc(ipsz, KM_SLEEP); 1670 1671 iget_p->num_devs_ret = inum; 1672 iget_p->cpu_id = oldcpuid; 1673 iget_p->ino = ino; 1674 iget_p->user_version = PCITOOL_VERSION; 1675 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, 1676 FKIOCTL, kcred, NULL); 1677 if (err != 0) { 1678 kmem_free(iget_p, ipsz); 1679 return (B_FALSE); 1680 } 1681 /* defensive */ 1682 if (iget_p->num_devs != iget_p->num_devs_ret) { 1683 kmem_free(iget_p, ipsz); 1684 return (B_FALSE); 1685 } 1686 } 1687 1688 if (mac_search_intrinfo(iget_p, dln)) { 1689 kmem_free(iget_p, ipsz); 1690 return (B_TRUE); 1691 } 1692 kmem_free(iget_p, ipsz); 1693 return (B_FALSE); 1694 } 1695 1696 /* 1697 * Get the interrupts and check each one to see if it is for our device. 
1698 */ 1699 static int 1700 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) 1701 { 1702 pcitool_intr_info_t intr_info; 1703 int err; 1704 int ino; 1705 int oldcpuid; 1706 1707 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, 1708 FKIOCTL, kcred, NULL); 1709 if (err != 0) 1710 return (-1); 1711 1712 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) { 1713 for (ino = 0; ino < intr_info.num_intr; ino++) { 1714 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) { 1715 if (dln->cpu_id == cpuid) 1716 return (0); 1717 return (1); 1718 } 1719 } 1720 } 1721 return (-1); 1722 } 1723 1724 /* 1725 * Obtain the nexus parent node info. for mdip. 1726 */ 1727 static dev_info_t * 1728 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln) 1729 { 1730 struct dev_info *tdip = (struct dev_info *)mdip; 1731 struct ddi_minor_data *minordata; 1732 dev_info_t *pdip; 1733 char pathname[MAXPATHLEN]; 1734 1735 while (tdip != NULL) { 1736 /* 1737 * The netboot code could call this function while walking the 1738 * device tree so we need to use ndi_devi_tryenter() here to 1739 * avoid deadlock. 
1740 */ 1741 if (ndi_devi_tryenter((dev_info_t *)tdip) == 0) 1742 break; 1743 1744 for (minordata = tdip->devi_minor; minordata != NULL; 1745 minordata = minordata->next) { 1746 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL, 1747 strlen(DDI_NT_INTRCTL)) == 0) { 1748 pdip = minordata->dip; 1749 (void) ddi_pathname(pdip, pathname); 1750 (void) snprintf(dln->nexus_path, MAXPATHLEN, 1751 "/devices%s:intr", pathname); 1752 (void) ddi_pathname_minor(minordata, pathname); 1753 ndi_devi_exit((dev_info_t *)tdip); 1754 return (pdip); 1755 } 1756 } 1757 ndi_devi_exit((dev_info_t *)tdip); 1758 tdip = tdip->devi_parent; 1759 } 1760 return (NULL); 1761 } 1762 1763 /* 1764 * For a primary MAC client, if the user has set a list or CPUs or 1765 * we have obtained it implicitly, we try to retarget the interrupt 1766 * for that device on one of the CPUs in the list. 1767 * We assign the interrupt to the same CPU as the poll thread. 1768 */ 1769 static boolean_t 1770 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) 1771 { 1772 ldi_handle_t lh = NULL; 1773 ldi_ident_t li = NULL; 1774 int err; 1775 int ret; 1776 mac_dladm_intr_t dln; 1777 dev_info_t *dip; 1778 struct ddi_minor_data *minordata; 1779 1780 dln.nexus_path[0] = '\0'; 1781 dln.driver_path[0] = '\0'; 1782 1783 minordata = ((struct dev_info *)mdip)->devi_minor; 1784 while (minordata != NULL) { 1785 if (minordata->type == DDM_MINOR) 1786 break; 1787 minordata = minordata->next; 1788 } 1789 if (minordata == NULL) 1790 return (B_FALSE); 1791 1792 (void) ddi_pathname_minor(minordata, dln.driver_path); 1793 1794 dip = mac_get_nexus_node(mdip, &dln); 1795 /* defensive */ 1796 if (dip == NULL) 1797 return (B_FALSE); 1798 1799 err = ldi_ident_from_major(ddi_driver_major(dip), &li); 1800 if (err != 0) 1801 return (B_FALSE); 1802 1803 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); 1804 if (err != 0) 1805 return (B_FALSE); 1806 1807 ret = mac_validate_intr(lh, &dln, cpuid); 1808 if (ret < 0) { 
1809 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1810 return (B_FALSE); 1811 } 1812 /* cmn_note? */ 1813 if (ret != 0) 1814 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino))) 1815 != 0) { 1816 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1817 return (B_FALSE); 1818 } 1819 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1820 return (B_TRUE); 1821 } 1822 1823 void 1824 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) 1825 { 1826 dev_info_t *mdip = (dev_info_t *)arg; 1827 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1828 mac_resource_props_t *mrp; 1829 mac_perim_handle_t mph; 1830 flow_entry_t *flent = mcip->mci_flent; 1831 mac_soft_ring_set_t *rx_srs; 1832 mac_cpus_t *srs_cpu; 1833 1834 if (!mac_check_interrupt_binding(mdip, cpuid)) 1835 cpuid = -1; 1836 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); 1837 mrp = MCIP_RESOURCE_PROPS(mcip); 1838 mrp->mrp_rx_intr_cpu = cpuid; 1839 if (flent != NULL && flent->fe_rx_srs_cnt == 2) { 1840 rx_srs = flent->fe_rx_srs[1]; 1841 srs_cpu = &rx_srs->srs_cpu; 1842 srs_cpu->mc_rx_intr_cpu = cpuid; 1843 } 1844 mac_perim_exit(mph); 1845 } 1846 1847 int32_t 1848 mac_client_intr_cpu(mac_client_handle_t mch) 1849 { 1850 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1851 mac_cpus_t *srs_cpu; 1852 mac_soft_ring_set_t *rx_srs; 1853 flow_entry_t *flent = mcip->mci_flent; 1854 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); 1855 mac_ring_t *ring; 1856 mac_intr_t *mintr; 1857 1858 /* 1859 * Check if we need to retarget the interrupt. We do this only 1860 * for the primary MAC client. We do this if we have the only 1861 * exclusive ring in the group. 1862 */ 1863 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { 1864 rx_srs = flent->fe_rx_srs[1]; 1865 srs_cpu = &rx_srs->srs_cpu; 1866 ring = rx_srs->srs_ring; 1867 mintr = &ring->mr_info.mri_intr; 1868 /* 1869 * If ddi_handle is present or the poll CPU is 1870 * already bound to the interrupt CPU, return -1. 
1871 */ 1872 if (mintr->mi_ddi_handle != NULL || 1873 ((mrp->mrp_ncpus != 0) && 1874 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) { 1875 return (-1); 1876 } 1877 return (srs_cpu->mc_rx_pollid); 1878 } 1879 return (-1); 1880 } 1881 1882 void * 1883 mac_get_devinfo(mac_handle_t mh) 1884 { 1885 mac_impl_t *mip = (mac_impl_t *)mh; 1886 1887 return ((void *)mip->mi_dip); 1888 } 1889 1890 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1]) 1891 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) 1892 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) 1893 1894 uint64_t 1895 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound) 1896 { 1897 struct ether_header *ehp; 1898 uint64_t hash = 0; 1899 uint16_t sap; 1900 uint_t skip_len; 1901 uint8_t proto; 1902 boolean_t ip_fragmented; 1903 1904 /* 1905 * We may want to have one of these per MAC type plugin in the 1906 * future. For now supports only ethernet. 1907 */ 1908 if (media != DL_ETHER) 1909 return (0L); 1910 1911 /* for now we support only outbound packets */ 1912 ASSERT(is_outbound); 1913 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); 1914 ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); 1915 1916 /* compute L2 hash */ 1917 1918 ehp = (struct ether_header *)mp->b_rptr; 1919 1920 if ((policy & MAC_PKT_HASH_L2) != 0) { 1921 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; 1922 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; 1923 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); 1924 policy &= ~MAC_PKT_HASH_L2; 1925 } 1926 1927 if (policy == 0) 1928 goto done; 1929 1930 /* skip ethernet header */ 1931 1932 sap = ntohs(ehp->ether_type); 1933 if (sap == ETHERTYPE_VLAN) { 1934 struct ether_vlan_header *evhp; 1935 mblk_t *newmp = NULL; 1936 1937 skip_len = sizeof (struct ether_vlan_header); 1938 if (MBLKL(mp) < skip_len) { 1939 /* the vlan tag is the payload, pull up first */ 1940 newmp = msgpullup(mp, -1); 1941 if ((newmp == NULL) || 
(MBLKL(newmp) < skip_len)) { 1942 goto done; 1943 } 1944 evhp = (struct ether_vlan_header *)newmp->b_rptr; 1945 } else { 1946 evhp = (struct ether_vlan_header *)mp->b_rptr; 1947 } 1948 1949 sap = ntohs(evhp->ether_type); 1950 freemsg(newmp); 1951 } else { 1952 skip_len = sizeof (struct ether_header); 1953 } 1954 1955 /* if ethernet header is in its own mblk, skip it */ 1956 if (MBLKL(mp) <= skip_len) { 1957 skip_len -= MBLKL(mp); 1958 mp = mp->b_cont; 1959 if (mp == NULL) 1960 goto done; 1961 } 1962 1963 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; 1964 1965 /* compute IP src/dst addresses hash and skip IPv{4,6} header */ 1966 1967 switch (sap) { 1968 case ETHERTYPE_IP: { 1969 ipha_t *iphp; 1970 1971 /* 1972 * If the header is not aligned or the header doesn't fit 1973 * in the mblk, bail now. Note that this may cause packets 1974 * reordering. 1975 */ 1976 iphp = (ipha_t *)(mp->b_rptr + skip_len); 1977 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || 1978 !OK_32PTR((char *)iphp)) 1979 goto done; 1980 1981 proto = iphp->ipha_protocol; 1982 skip_len += IPH_HDR_LENGTH(iphp); 1983 1984 /* Check if the packet is fragmented. */ 1985 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) & 1986 IPH_OFFSET; 1987 1988 /* 1989 * For fragmented packets, use addresses in addition to 1990 * the frag_id to generate the hash inorder to get 1991 * better distribution. 
1992 */ 1993 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) { 1994 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); 1995 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); 1996 1997 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 1998 PKT_HASH_4BYTES(ip_dst)); 1999 policy &= ~MAC_PKT_HASH_L3; 2000 } 2001 2002 if (ip_fragmented) { 2003 uint8_t *identp = (uint8_t *)&iphp->ipha_ident; 2004 hash ^= PKT_HASH_2BYTES(identp); 2005 goto done; 2006 } 2007 break; 2008 } 2009 case ETHERTYPE_IPV6: { 2010 ip6_t *ip6hp; 2011 ip6_frag_t *frag = NULL; 2012 uint16_t hdr_length; 2013 2014 /* 2015 * If the header is not aligned or the header doesn't fit 2016 * in the mblk, bail now. Note that this may cause packets 2017 * reordering. 2018 */ 2019 2020 ip6hp = (ip6_t *)(mp->b_rptr + skip_len); 2021 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || 2022 !OK_32PTR((char *)ip6hp)) 2023 goto done; 2024 2025 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length, 2026 &proto, &frag)) 2027 goto done; 2028 skip_len += hdr_length; 2029 2030 /* 2031 * For fragmented packets, use addresses in addition to 2032 * the frag_id to generate the hash inorder to get 2033 * better distribution. 
2034 */ 2035 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) { 2036 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); 2037 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); 2038 2039 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 2040 PKT_HASH_4BYTES(ip_dst)); 2041 policy &= ~MAC_PKT_HASH_L3; 2042 } 2043 2044 if (frag != NULL) { 2045 uint8_t *identp = (uint8_t *)&frag->ip6f_ident; 2046 hash ^= PKT_HASH_4BYTES(identp); 2047 goto done; 2048 } 2049 break; 2050 } 2051 default: 2052 goto done; 2053 } 2054 2055 if (policy == 0) 2056 goto done; 2057 2058 /* if ip header is in its own mblk, skip it */ 2059 if (MBLKL(mp) <= skip_len) { 2060 skip_len -= MBLKL(mp); 2061 mp = mp->b_cont; 2062 if (mp == NULL) 2063 goto done; 2064 } 2065 2066 /* parse ULP header */ 2067 again: 2068 switch (proto) { 2069 case IPPROTO_TCP: 2070 case IPPROTO_UDP: 2071 case IPPROTO_ESP: 2072 case IPPROTO_SCTP: 2073 /* 2074 * These Internet Protocols are intentionally designed 2075 * for hashing from the git-go. Port numbers are in the first 2076 * word for transports, SPI is first for ESP. 2077 */ 2078 if (mp->b_rptr + skip_len + 4 > mp->b_wptr) 2079 goto done; 2080 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); 2081 break; 2082 2083 case IPPROTO_AH: { 2084 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); 2085 uint_t ah_length = AH_TOTAL_LEN(ah); 2086 2087 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) 2088 goto done; 2089 2090 proto = ah->ah_nexthdr; 2091 skip_len += ah_length; 2092 2093 /* if AH header is in its own mblk, skip it */ 2094 if (MBLKL(mp) <= skip_len) { 2095 skip_len -= MBLKL(mp); 2096 mp = mp->b_cont; 2097 if (mp == NULL) 2098 goto done; 2099 } 2100 2101 goto again; 2102 } 2103 } 2104 2105 done: 2106 return (hash); 2107 } 2108