1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2019 Joyent, Inc. 24 * Copyright 2025 Oxide Computer Company 25 */ 26 27 /* 28 * MAC Services Module - misc utilities 29 */ 30 31 #include <sys/types.h> 32 #include <sys/mac.h> 33 #include <sys/mac_impl.h> 34 #include <sys/mac_client_priv.h> 35 #include <sys/mac_client_impl.h> 36 #include <sys/mac_soft_ring.h> 37 #include <sys/strsubr.h> 38 #include <sys/strsun.h> 39 #include <sys/vlan.h> 40 #include <sys/pattr.h> 41 #include <sys/pci_tools.h> 42 #include <inet/ip.h> 43 #include <inet/ip_impl.h> 44 #include <inet/ip6.h> 45 #include <sys/vtrace.h> 46 #include <sys/dlpi.h> 47 #include <sys/sunndi.h> 48 #include <inet/ipsec_impl.h> 49 #include <inet/sadb.h> 50 #include <inet/ipsecesp.h> 51 #include <inet/ipsecah.h> 52 #include <inet/tcp.h> 53 #include <inet/sctp_ip.h> 54 55 /* 56 * The next two functions are used for dropping packets or chains of 57 * packets, respectively. We could use one function for both but 58 * separating the use cases allows us to specify intent and prevent 59 * dropping more data than intended. 
60 * 61 * The purpose of these functions is to aid the debugging effort, 62 * especially in production. Rather than use freemsg()/freemsgchain(), 63 * it's preferable to use these functions when dropping a packet in 64 * the MAC layer. These functions should only be used during 65 * unexpected conditions. That is, any time a packet is dropped 66 * outside of the regular, successful datapath. Consolidating all 67 * drops on these functions allows the user to trace one location and 68 * determine why the packet was dropped based on the msg. It also 69 * allows the user to inspect the packet before it is freed. Finally, 70 * it allows the user to avoid tracing freemsg()/freemsgchain() thus 71 * keeping the hot path running as efficiently as possible. 72 * 73 * NOTE: At this time not all MAC drops are aggregated on these 74 * functions; but that is the plan. This comment should be erased once 75 * completed. 76 */ 77 78 /*PRINTFLIKE2*/ 79 void 80 mac_drop_pkt(mblk_t *mp, const char *fmt, ...) 81 { 82 va_list adx; 83 char msg[128]; 84 char *msgp = msg; 85 86 ASSERT3P(mp->b_next, ==, NULL); 87 88 va_start(adx, fmt); 89 (void) vsnprintf(msgp, sizeof (msg), fmt, adx); 90 va_end(adx); 91 92 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); 93 freemsg(mp); 94 } 95 96 /*PRINTFLIKE2*/ 97 void 98 mac_drop_chain(mblk_t *chain, const char *fmt, ...) 99 { 100 va_list adx; 101 char msg[128]; 102 char *msgp = msg; 103 104 va_start(adx, fmt); 105 (void) vsnprintf(msgp, sizeof (msg), fmt, adx); 106 va_end(adx); 107 108 /* 109 * We could use freemsgchain() for the actual freeing but 110 * since we are already walking the chain to fire the dtrace 111 * probe we might as well free the msg here too. 112 */ 113 for (mblk_t *mp = chain, *next; mp != NULL; ) { 114 next = mp->b_next; 115 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); 116 mp->b_next = NULL; 117 freemsg(mp); 118 mp = next; 119 } 120 } 121 122 /* 123 * Copy an mblk, preserving its hardware checksum flags. 
 */
static mblk_t *
mac_copymsg_cksum(mblk_t *mp)
{
	mblk_t *mp1;

	mp1 = copymsg(mp);
	if (mp1 == NULL)
		return (NULL);

	/* Carry the hardware checksum metadata over to the new copy. */
	mac_hcksum_clone(mp, mp1);

	return (mp1);
}

/*
 * Copy an mblk chain, preserving the hardware checksum flags of the
 * individual mblks. Returns the new chain, or NULL (with the partial
 * copy freed) if any single copy fails.
 */
mblk_t *
mac_copymsgchain_cksum(mblk_t *mp)
{
	mblk_t *nmp = NULL;
	mblk_t **nmpp = &nmp;

	for (; mp != NULL; mp = mp->b_next) {
		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
			/* Allocation failed: discard what we built so far. */
			freemsgchain(nmp);
			return (NULL);
		}

		nmpp = &((*nmpp)->b_next);
	}

	return (nmp);
}

/*
 * Perform software checksum on a single message, if needed. The emulation
 * performed is determined by an intersection of the mblk's flags and the emul
 * flags requested. The emul flags are documented in mac.h.
 *
 * Returns the packet (possibly replaced via pullup) on success, or NULL
 * after dropping the packet if it was malformed or a pullup failed.
 */
static mblk_t *
mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
	mac_ether_offload_info_t meoi = { 0 };
	const char *err = "";

	/*
	 * The only current caller is mac_hw_emul(), which handles any chaining
	 * of mblks prior to now.
	 */
	VERIFY3P(mp->b_next, ==, NULL);

	uint32_t flags = DB_CKSUMFLAGS(mp);

	/* Why call this if checksum emulation isn't needed? */
	ASSERT3U(flags & (HCK_FLAGS), !=, 0);
	/* But also, requesting both ULP cksum types is improper */
	if ((flags & HCK_FULLCKSUM) != 0 && (flags & HCK_PARTIALCKSUM) != 0) {
		err = "full and partial ULP cksum requested";
		goto bail;
	}

	const boolean_t do_v4_cksum = (emul & MAC_IPCKSUM_EMUL) != 0 &&
	    (flags & HCK_IPV4_HDRCKSUM) != 0;
	const boolean_t do_ulp_cksum = (emul & MAC_HWCKSUM_EMUL) != 0 &&
	    (flags & (HCK_FULLCKSUM | HCK_PARTIALCKSUM)) != 0;
	const boolean_t ulp_prefer_partial = (flags & HCK_PARTIALCKSUM) != 0;

	/* Parse the L2/L3/L4 headers to learn offsets and protocols. */
	mac_ether_offload_info(mp, &meoi);
	if ((meoi.meoi_flags & MEOI_L2INFO_SET) == 0 ||
	    (meoi.meoi_l3proto != ETHERTYPE_IP &&
	    meoi.meoi_l3proto != ETHERTYPE_IPV6)) {
		/* Non-IP traffic (like ARP) is left alone */
		return (mp);
	}

	/*
	 * Ensure that requested checksum type(s) are supported by the
	 * protocols encoded in the packet headers.
	 */
	if (do_v4_cksum) {
		if (meoi.meoi_l3proto != ETHERTYPE_IP) {
			err = "IPv4 csum requested on non-IPv4 packet";
			goto bail;
		}
	}
	if (do_ulp_cksum) {
		if ((meoi.meoi_flags & MEOI_L4INFO_SET) == 0) {
			err = "missing ULP header";
			goto bail;
		}
		switch (meoi.meoi_l4proto) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_ICMP:
		case IPPROTO_ICMPV6:
		case IPPROTO_SCTP:
			break;
		default:
			err = "unexpected ULP";
			goto bail;
		}
	}

	/*
	 * If the first mblk of this packet contains only the Ethernet header,
	 * skip past it for now. Packets with their data contained in only a
	 * single mblk can then use the fastpaths tuned to that possibility.
	 */
	mblk_t *skipped_hdr = NULL;
	if (MBLKL(mp) == meoi.meoi_l2hlen) {
		meoi.meoi_len -= meoi.meoi_l2hlen;
		meoi.meoi_l2hlen = 0;
		skipped_hdr = mp;
		mp = mp->b_cont;

		ASSERT(mp != NULL);
	}

	/*
	 * Ensure that all of the headers we need to access are:
	 * 1. Collected in the first mblk
	 * 2. Held in a data-block which is safe for us to modify
	 *    (It must have a refcount of 1)
	 */
	const size_t hdr_len_reqd = (meoi.meoi_l2hlen + meoi.meoi_l3hlen) +
	    (do_ulp_cksum ? meoi.meoi_l4hlen : 0);
	if (MBLKL(mp) < hdr_len_reqd || DB_REF(mp) > 1) {
		/* Copy the headers into a fresh (private) mblk. */
		mblk_t *hdrmp = msgpullup(mp, hdr_len_reqd);

		if (hdrmp == NULL) {
			err = "could not pullup msg headers";
			goto bail;
		}

		mac_hcksum_clone(mp, hdrmp);
		if (skipped_hdr != NULL) {
			ASSERT3P(skipped_hdr->b_cont, ==, mp);
			skipped_hdr->b_cont = hdrmp;
		}
		freemsg(mp);
		mp = hdrmp;
	}

	/* Calculate IPv4 header checksum, if requested */
	if (do_v4_cksum) {
		/*
		 * While unlikely, it's possible to write code that might end up
		 * calling mac_sw_cksum() twice on the same mblk (performing
		 * both LSO and checksum emulation in a single mblk chain loop
		 * -- the LSO emulation inserts a new chain into the existing
		 * chain and then the loop iterates back over the new segments
		 * and emulates the checksum a second time). Normally this
		 * wouldn't be a problem, because the HCK_*_OK flags are
		 * supposed to indicate that we don't need to perform the
		 * work. But HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
		 * same value; so we cannot use these flags to determine if the
		 * IP header checksum has already been calculated or not. For
		 * this reason, we zero out the checksum first. In the
		 * future, we should fix the HCK_* flags.
		 */
		ipha_t *ipha = (ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
		flags &= ~HCK_IPV4_HDRCKSUM;
		flags |= HCK_IPV4_HDRCKSUM_OK;
	}

	/*
	 * SCTP is different from all the other protocols in that it uses
	 * CRC32 for its checksum, rather than ones' complement.
	 */
	if (do_ulp_cksum && meoi.meoi_l4proto == IPPROTO_SCTP) {
		if (ulp_prefer_partial) {
			err = "SCTP does not support partial checksum";
			goto bail;
		}

		const uint_t ulp_off = meoi.meoi_l2hlen + meoi.meoi_l3hlen;
		sctp_hdr_t *sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_off);

		/* Zero the field first so it doesn't taint the new CRC. */
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_off);

		flags &= ~HCK_FULLCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		goto success;
	}

	/* Calculate full ULP checksum, if requested */
	if (do_ulp_cksum && !ulp_prefer_partial) {
		/*
		 * Calculate address and length portions of pseudo-header csum
		 */
		uint32_t cksum = 0;
		if (meoi.meoi_l3proto == ETHERTYPE_IP) {
			const ipha_t *ipha =
			    (const ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
			const uint16_t *ipp =
			    (const uint16_t *)(&ipha->ipha_src);

			/* src + dst addresses, as four 16-bit words */
			cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3];

			/*
			 * While it is tempting to calculate the payload length
			 * solely from `meoi`, like as done below for IPv6,
			 * doing so is a trap. Packets shorter than 60 bytes
			 * will get padded out to that length in order to meet
			 * the minimums for Ethernet. Instead, we pull the
			 * length from the IP header.
			 */
			const uint16_t payload_len =
			    ntohs(ipha->ipha_length) - meoi.meoi_l3hlen;
			cksum += htons(payload_len);
		} else if (meoi.meoi_l3proto == ETHERTYPE_IPV6) {
			const ip6_t *ip6h =
			    (const ip6_t *)(mp->b_rptr + meoi.meoi_l2hlen);
			const uint16_t *ipp =
			    (const uint16_t *)(&ip6h->ip6_src);

			/* src + dst addresses, as sixteen 16-bit words */
			cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3] +
			    ipp[4] + ipp[5] + ipp[6] + ipp[7];
			cksum += ipp[8] + ipp[9] + ipp[10] + ipp[11] +
			    ipp[12] + ipp[13] + ipp[14] + ipp[15];

			const uint16_t payload_len = meoi.meoi_len -
			    ((uint16_t)meoi.meoi_l2hlen + meoi.meoi_l3hlen);
			cksum += htons(payload_len);
		} else {
			/*
			 * Since we already checked for recognized L3 protocols
			 * earlier, this should not be reachable.
			 */
			panic("L3 protocol unexpectedly changed");
		}

		/* protocol portion of pseudo-header */
		uint_t cksum_off;
		switch (meoi.meoi_l4proto) {
		case IPPROTO_TCP:
			cksum += IP_TCP_CSUM_COMP;
			cksum_off = TCP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_UDP:
			cksum += IP_UDP_CSUM_COMP;
			cksum_off = UDP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_ICMP:
			/* ICMP cksum does not include pseudo-header contents */
			cksum = 0;
			cksum_off = ICMP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_ICMPV6:
			cksum += IP_ICMPV6_CSUM_COMP;
			cksum_off = ICMPV6_CHECKSUM_OFFSET;
			break;
		default:
			err = "unrecognized L4 protocol";
			goto bail;
		}

		/*
		 * With IP_CSUM() taking into account the pseudo-header
		 * checksum, make sure the ULP checksum field is zeroed before
		 * computing the rest.
		 */
		const uint_t l4_off = meoi.meoi_l3hlen + meoi.meoi_l2hlen;
		uint16_t *up = (uint16_t *)(mp->b_rptr + l4_off + cksum_off);
		*up = 0;
		cksum = IP_CSUM(mp, l4_off, cksum);

		if (meoi.meoi_l4proto == IPPROTO_UDP && cksum == 0) {
			/*
			 * A zero checksum is not allowed on UDPv6, and on UDPv4
			 * implies no checksum. In either case, invert to a
			 * value of all-1s.
			 */
			*up = 0xffff;
		} else {
			*up = cksum;
		}

		flags &= ~HCK_FULLCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		goto success;
	}

	/* Calculate partial ULP checksum, if requested */
	if (do_ulp_cksum && ulp_prefer_partial) {
		uint32_t start, stuff, end, value;
		mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);

		ASSERT3S(end, >, start);

		/*
		 * The prior size checks against the header length data ensure
		 * that the mblk contains everything through at least the ULP
		 * header, but if the partial checksum (unexpectedly) requests
		 * its result be stored past that, we cannot continue.
		 */
		if (stuff + sizeof (uint16_t) > MBLKL(mp)) {
			err = "partial csum request is out of bounds";
			goto bail;
		}

		/* `stuff` is relative to the start of the L3 header. */
		uchar_t *ipp = (uchar_t *)(mp->b_rptr + meoi.meoi_l2hlen);
		uint16_t *up = (uint16_t *)(ipp + stuff);

		const uint16_t partial = *up;
		*up = 0;
		const uint16_t cksum =
		    ~IP_CSUM_PARTIAL(mp, start + meoi.meoi_l2hlen, partial);
		/* A result of zero is stored as all-1s (equivalent value). */
		*up = cksum != 0 ? cksum : ~cksum;

		flags &= ~HCK_PARTIALCKSUM;
		flags |= HCK_FULLCKSUM_OK;
	}

success:
	/*
	 * With the checksum(s) calculated, store the updated flags to reflect
	 * the current status, and zero out any of the partial-checksum fields
	 * which would be irrelevant now.
	 */
	mac_hcksum_set(mp, 0, 0, 0, 0, flags);

	/* Don't forget to reattach the header. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);

		/*
		 * Duplicate the HCKSUM data into the header mblk.
		 *
		 * This mimics mac_add_vlan_tag() which ensures that both the
		 * first mblk _and_ the first data bearing mblk possess the
		 * HCKSUM information. Consumers like IP will end up discarding
		 * the ether_header mblk, so for now, it is important that the
		 * data be available in both places.
		 */
		mac_hcksum_clone(mp, skipped_hdr);
		mp = skipped_hdr;
	}
	return (mp);

bail:
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);
		mp = skipped_hdr;
	}

	/*
	 * `err` is always one of the fixed strings assigned above, none of
	 * which contain conversion specifiers, so it is safe as a format.
	 */
	mac_drop_pkt(mp, err);
	return (NULL);
}

/*
 * Build a single data segment from an LSO packet. The mblk chain
 * returned, seg_head, represents the data segment and is always
 * exactly seg_len bytes long. The lso_mp and offset input/output
 * parameters track our position in the LSO packet. This function
 * exists solely as a helper to mac_sw_lso().
 *
 * Case A
 *
 *	The current lso_mp is larger than the requested seg_len. The
 *	beginning of seg_head may start at the beginning of lso_mp or
 *	offset into it. In either case, a single mblk is returned, and
 *	*offset is updated to reflect our new position in the current
 *	lso_mp.
 *
 *	+----------------------------+
 *	|  in *lso_mp / out *lso_mp  |
 *	+----------------------------+
 *	^                        ^
 *	|                        |
 *	|                        |
 *	|                        |
 *	+------------------------+
 *	|        seg_head        |
 *	+------------------------+
 *	^                        ^
 *	|                        |
 *	in *offset = 0           out *offset = seg_len
 *
 *	|------ seg_len ----|
 *
 *
 *	+------------------------------+
 *	|   in *lso_mp / out *lso_mp   |
 *	+------------------------------+
 *	    ^                        ^
 *	    |                        |
 *	    |                        |
 *	    |                        |
 *	    +------------------------+
 *	    |        seg_head        |
 *	    +------------------------+
 *	    ^                        ^
 *	    |                        |
 *	    in *offset = N           out *offset = N + seg_len
 *
 *	    |------ seg_len ----|
 *
 *
 *
 * Case B
 *
 *	The requested seg_len consumes exactly the rest of the lso_mp.
 *	I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
 *	The seg_head may start at the beginning of the lso_mp or at some
 *	offset into it. In either case we return a single mblk, reset
 *	*offset to zero, and walk to the next lso_mp.
 *
 *	+------------------------+           +------------------------+
 *	|       in *lso_mp       |---------->|       out *lso_mp      |
 *	+------------------------+           +------------------------+
 *	^                        ^           ^
 *	|                        |           |
 *	|                        |           out *offset = 0
 *	|                        |
 *	+------------------------+
 *	|        seg_head        |
 *	+------------------------+
 *	^
 *	|
 *	in *offset = 0
 *
 *	|------ seg_len ----|
 *
 *
 *
 *	+----------------------------+       +------------------------+
 *	|         in *lso_mp         |------>|       out *lso_mp      |
 *	+----------------------------+       +------------------------+
 *	    ^                        ^       ^
 *	    |                        |       |
 *	    |                        |       out *offset = 0
 *	    |                        |
 *	    +------------------------+
 *	    |        seg_head        |
 *	    +------------------------+
 *	    ^
 *	    |
 *	    in *offset = N
 *
 *	    |------ seg_len ----|
 *
 *
 * Case C
 *
 *	The requested seg_len is greater than the current lso_mp. In
 *	this case we must consume LSO mblks until we have enough data to
 *	satisfy either case (A) or (B) above. We will return multiple
 *	mblks linked via b_cont, offset will be set based on the cases
 *	above, and lso_mp will walk forward at least one mblk, but maybe
 *	more.
 *
 *	N.B. This diagram is not exhaustive. The seg_head may start on
 *	the beginning of an lso_mp. The seg_tail may end exactly on the
 *	boundary of an lso_mp. And there may be two (in this case the
 *	middle block wouldn't exist), three, or more mblks in the
 *	seg_head chain. This is meant as one example of what might
 *	happen. The main thing to remember is that the seg_tail mblk
 *	must be one of case (A) or (B) above.
 *
 *	+------------------+   +----------------+   +------------------+
 *	|    in *lso_mp    |-->|     *lso_mp    |-->|   out *lso_mp    |
 *	+------------------+   +----------------+   +------------------+
 *	      ^          ^     ^                ^   ^           ^
 *	      |          |     |                |   |           |
 *	      |          |     |                |   |           |
 *	      |          |     |                |   |           |
 *	      +----------+     +----------------+   +-----------+
 *	      | seg_head |-->  |                |-->| seg_tail  |
 *	      +----------+     +----------------+   +-----------+
 *	      ^                                                 ^
 *	      |                                                 |
 *	      in *offset = N            out *offset = MBLKL(seg_tail)
 *
 *	      |------------------- seg_len -------------------|
 *
 */
static mblk_t *
build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
{
	mblk_t *seg_head, *seg_tail, *seg_mp;

	/* Caller guarantees *offset still points inside this mblk. */
	ASSERT3P(*lso_mp, !=, NULL);
	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);

	/* Reference (not copy) the LSO data via dupb(). */
	seg_mp = dupb(*lso_mp);
	if (seg_mp == NULL)
		return (NULL);

	seg_head = seg_mp;
	seg_tail = seg_mp;

	/* Continue where we left off from in the lso_mp. */
	seg_mp->b_rptr += *offset;

last_mblk:
	/* Case (A) */
	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
		*offset += seg_len;
		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
		return (seg_head);
	}

	/* Case (B) */
	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		return (seg_head);
	}

	/* Case (C) */
	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);

	/*
	 * The current LSO mblk doesn't have enough data to satisfy
	 * seg_len -- continue peeling off LSO mblks to build the new
	 * segment message. If allocation fails we free the previously
	 * allocated segment mblks and return NULL.
	 */
	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
		seg_len -= MBLKL(seg_mp);
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		seg_mp = dupb(*lso_mp);

		if (seg_mp == NULL) {
			/* Frees the whole b_cont chain under seg_head. */
			freemsgchain(seg_head);
			return (NULL);
		}

		seg_tail->b_cont = seg_mp;
		seg_tail = seg_mp;
	}

	/*
	 * We've walked enough LSO mblks that we can now satisfy the
	 * remaining seg_len. At this point we need to jump back to
	 * determine if we have arrived at case (A) or (B).
	 */

	/* Just to be paranoid that we didn't underflow. */
	ASSERT3U(seg_len, <, IP_MAXPACKET);
	ASSERT3U(seg_len, >, 0);
	goto last_mblk;
}

/*
 * Perform software segmentation of a single LSO message. Take an LSO
 * message as input and return head/tail pointers as output. This
 * function should not be invoked directly but instead through
 * mac_hw_emul().
 *
 * The resulting chain is comprised of multiple (nsegs) MSS sized
 * segments. Each segment will consist of two or more mblks joined by
 * b_cont: a header and one or more data mblks. The header mblk is
 * allocated anew for each message. The first segment's header is used
 * as a template for the rest with adjustments made for things such as
 * ID, sequence, length, TCP flags, etc. The data mblks reference into
 * the existing LSO mblk (passed in as omp) by way of dupb(). Their
 * b_rptr/b_wptr values are adjusted to reference only the fraction of
 * the LSO message they are responsible for. At the successful
 * completion of this function the original mblk (omp) is freed,
 * leaving the newly created segment chain as the only remaining
 * reference to the data.
696 */ 697 static void 698 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, 699 uint_t *count) 700 { 701 uint32_t ocsum_flags, ocsum_start, ocsum_stuff; 702 uint32_t mss; 703 uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; 704 uint32_t oleft; 705 uint_t nsegs, seg; 706 int len; 707 708 struct ether_vlan_header *oevh; 709 const ipha_t *oiph; 710 const tcph_t *otcph; 711 ipha_t *niph; 712 tcph_t *ntcph; 713 uint16_t ip_id; 714 uint32_t tcp_seq, tcp_sum, otcp_sum; 715 716 uint32_t offset; 717 mblk_t *odatamp; 718 mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; 719 mblk_t *tmptail; 720 721 ASSERT3P(head, !=, NULL); 722 ASSERT3P(tail, !=, NULL); 723 ASSERT3P(count, !=, NULL); 724 ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); 725 726 /* Assume we are dealing with a single LSO message. */ 727 ASSERT3P(omp->b_next, ==, NULL); 728 729 /* 730 * XXX: This is a hack to deal with mac_add_vlan_tag(). 731 * 732 * When VLANs are in play, mac_add_vlan_tag() creates a new 733 * mblk with just the ether_vlan_header and tacks it onto the 734 * front of 'omp'. This breaks the assumptions made below; 735 * namely that the TCP/IP headers are in the first mblk. In 736 * this case, since we already have to pay the cost of LSO 737 * emulation, we simply pull up everything. While this might 738 * seem irksome, keep in mind this will only apply in a couple 739 * of scenarios: a) an LSO-capable VLAN client sending to a 740 * non-LSO-capable client over the "MAC/bridge loopback" 741 * datapath or b) an LSO-capable VLAN client is sending to a 742 * client that, for whatever reason, doesn't have DLS-bypass 743 * enabled. Finally, we have to check for both a tagged and 744 * untagged sized mblk depending on if the mblk came via 745 * mac_promisc_dispatch() or mac_rx_deliver(). 746 * 747 * In the future, two things should be done: 748 * 749 * 1. This function should make use of some yet to be 750 * implemented "mblk helpers". 
These helper functions would 751 * perform all the b_cont walking for us and guarantee safe 752 * access to the mblk data. 753 * 754 * 2. We should add some slop to the mblks so that 755 * mac_add_vlan_tag() can just edit the first mblk instead 756 * of allocating on the hot path. 757 */ 758 if (MBLKL(omp) == sizeof (struct ether_vlan_header) || 759 MBLKL(omp) == sizeof (struct ether_header)) { 760 mblk_t *tmp = msgpullup(omp, -1); 761 762 if (tmp == NULL) { 763 mac_drop_pkt(omp, "failed to pull up"); 764 goto fail; 765 } 766 767 mac_hcksum_clone(omp, tmp); 768 freemsg(omp); 769 omp = tmp; 770 } 771 772 mss = DB_LSOMSS(omp); 773 ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + 774 sizeof (struct ether_vlan_header)); 775 opktlen = msgsize(omp); 776 777 /* 778 * First, get references to the IP and TCP headers and 779 * determine the total TCP length (header + data). 780 * 781 * Thanks to mac_hw_emul() we know that the first mblk must 782 * contain (at minimum) the full L2 header. However, this 783 * function assumes more than that. It assumes the L2/L3/L4 784 * headers are all contained in the first mblk of a message 785 * (i.e., no b_cont walking for headers). While this is a 786 * current reality (our native TCP stack and viona both 787 * enforce this) things may become more nuanced in the future 788 * (e.g. when introducing encap support or adding new 789 * clients). For now we guard against this case by dropping 790 * the packet. 
791 */ 792 oevh = (struct ether_vlan_header *)omp->b_rptr; 793 if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) 794 oehlen = sizeof (struct ether_vlan_header); 795 else 796 oehlen = sizeof (struct ether_header); 797 798 ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); 799 if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { 800 mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); 801 goto fail; 802 } 803 804 oiph = (ipha_t *)(omp->b_rptr + oehlen); 805 oiphlen = IPH_HDR_LENGTH(oiph); 806 otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); 807 otcphlen = TCP_HDR_LENGTH(otcph); 808 809 /* 810 * Currently we only support LSO for TCP/IPv4. 811 */ 812 if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { 813 mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", 814 IPH_HDR_VERSION(oiph)); 815 goto fail; 816 } 817 818 if (oiph->ipha_protocol != IPPROTO_TCP) { 819 mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", 820 oiph->ipha_protocol); 821 goto fail; 822 } 823 824 if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { 825 mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); 826 goto fail; 827 } 828 829 ohdrslen = oehlen + oiphlen + otcphlen; 830 if ((len = MBLKL(omp)) < ohdrslen) { 831 mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, 832 ohdrslen); 833 goto fail; 834 } 835 836 /* 837 * Either we have data in the first mblk or it's just the 838 * header. In either case, we need to set rptr to the start of 839 * the TCP data. 840 */ 841 if (len > ohdrslen) { 842 odatamp = omp; 843 offset = ohdrslen; 844 } else { 845 ASSERT3U(len, ==, ohdrslen); 846 odatamp = omp->b_cont; 847 offset = 0; 848 } 849 850 /* Make sure we still have enough data. 
*/ 851 ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); 852 853 /* 854 * If a MAC negotiated LSO then it must negotioate both 855 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or 856 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers 857 * change during LSO segmentation (only the 3 fields of the 858 * pseudo header checksum don't change: src, dst, proto). Thus 859 * we would expect these flags (HCK_IPV4_HDRCKSUM | 860 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this 861 * function to emulate those checksums in software. However, 862 * that assumes a world where we only expose LSO if the 863 * underlying hardware exposes LSO. Moving forward the plan is 864 * to assume LSO in the upper layers and have MAC perform 865 * software LSO when the underlying provider doesn't support 866 * it. In such a world, if the provider doesn't support LSO 867 * but does support hardware checksum offload, then we could 868 * simply perform the segmentation and allow the hardware to 869 * calculate the checksums. To the hardware it's just another 870 * chain of non-LSO packets. 871 */ 872 ASSERT3S(DB_TYPE(omp), ==, M_DATA); 873 ocsum_flags = DB_CKSUMFLAGS(omp); 874 ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); 875 ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); 876 877 /* 878 * If hardware only provides partial checksum then software 879 * must supply the pseudo-header checksum. In the case of LSO 880 * we leave the TCP length at zero to be filled in by 881 * hardware. This function must handle two scenarios. 882 * 883 * 1. Being called by a MAC client on the Rx path to segment 884 * an LSO packet and calculate the checksum. 885 * 886 * 2. Being called by a MAC provider to segment an LSO packet. 887 * In this case the LSO segmentation is performed in 888 * software (by this routine) but the MAC provider should 889 * still calculate the TCP/IP checksums in hardware. 
890 * 891 * To elaborate on the second case: we cannot have the 892 * scenario where IP sends LSO packets but the underlying HW 893 * doesn't support checksum offload -- because in that case 894 * TCP/IP would calculate the checksum in software (for the 895 * LSO packet) but then MAC would segment the packet and have 896 * to redo all the checksum work. So IP should never do LSO 897 * if HW doesn't support both IP and TCP checksum. 898 */ 899 if (ocsum_flags & HCK_PARTIALCKSUM) { 900 ocsum_start = (uint32_t)DB_CKSUMSTART(omp); 901 ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); 902 } 903 904 odatalen = opktlen - ohdrslen; 905 906 /* 907 * Subtract one to account for the case where the data length 908 * is evenly divisble by the MSS. Add one to account for the 909 * fact that the division will always result in one less 910 * segment than needed. 911 */ 912 nsegs = ((odatalen - 1) / mss) + 1; 913 if (nsegs < 2) { 914 mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); 915 goto fail; 916 } 917 918 DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, 919 __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, 920 nsegs); 921 922 seg_chain = NULL; 923 tmptail = seg_chain; 924 oleft = odatalen; 925 926 for (uint_t i = 0; i < nsegs; i++) { 927 boolean_t last_seg = ((i + 1) == nsegs); 928 uint32_t seg_len; 929 930 /* 931 * If we fail to allocate, then drop the partially 932 * allocated chain as well as the LSO packet. Let the 933 * sender deal with the fallout. 934 */ 935 if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { 936 freemsgchain(seg_chain); 937 mac_drop_pkt(omp, "failed to alloc segment header"); 938 goto fail; 939 } 940 ASSERT3P(nhdrmp->b_cont, ==, NULL); 941 942 if (seg_chain == NULL) { 943 seg_chain = nhdrmp; 944 } else { 945 ASSERT3P(tmptail, !=, NULL); 946 tmptail->b_next = nhdrmp; 947 } 948 949 tmptail = nhdrmp; 950 951 /* 952 * Calculate this segment's lengh. It's either the MSS 953 * or whatever remains for the last segment. 
954 */ 955 seg_len = last_seg ? oleft : mss; 956 ASSERT3U(seg_len, <=, mss); 957 ndatamp = build_data_seg(&odatamp, &offset, seg_len); 958 959 if (ndatamp == NULL) { 960 freemsgchain(seg_chain); 961 mac_drop_pkt(omp, "LSO failed to segment data"); 962 goto fail; 963 } 964 965 /* Attach data mblk to header mblk. */ 966 nhdrmp->b_cont = ndatamp; 967 DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; 968 ASSERT3U(seg_len, <=, oleft); 969 oleft -= seg_len; 970 } 971 972 /* We should have consumed entire LSO msg. */ 973 ASSERT3S(oleft, ==, 0); 974 ASSERT3P(odatamp, ==, NULL); 975 976 /* 977 * All seg data mblks are referenced by the header mblks, null 978 * out this pointer to catch any bad derefs. 979 */ 980 ndatamp = NULL; 981 982 /* 983 * Set headers and checksum for first segment. 984 */ 985 nhdrmp = seg_chain; 986 bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); 987 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 988 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 989 ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); 990 niph->ipha_length = htons(oiphlen + otcphlen + mss); 991 niph->ipha_hdr_checksum = 0; 992 ip_id = ntohs(niph->ipha_ident); 993 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 994 tcp_seq = BE32_TO_U32(ntcph->th_seq); 995 tcp_seq += mss; 996 997 /* 998 * The first segment shouldn't: 999 * 1000 * o indicate end of data transmission (FIN), 1001 * o indicate immediate handling of the data (PUSH). 1002 */ 1003 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); 1004 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1005 1006 /* 1007 * If the underlying HW provides partial checksum, then make 1008 * sure to correct the pseudo header checksum before calling 1009 * mac_sw_cksum(). The native TCP stack doesn't include the 1010 * length field in the pseudo header when LSO is in play -- so 1011 * we need to calculate it here. 
1012 */ 1013 if (ocsum_flags & HCK_PARTIALCKSUM) { 1014 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1015 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1016 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1017 tcp_sum = BE16_TO_U16(ntcph->th_sum); 1018 otcp_sum = tcp_sum; 1019 tcp_sum += mss + otcphlen; 1020 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); 1021 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1022 } 1023 1024 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1025 (emul & MAC_HWCKSUM_EMULS)) { 1026 next_nhdrmp = nhdrmp->b_next; 1027 nhdrmp->b_next = NULL; 1028 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1029 /* 1030 * The mblk could be replaced (via pull-up) or freed (due to 1031 * failure) during mac_sw_cksum(), so we must take care with the 1032 * result here. 1033 */ 1034 if (nhdrmp != NULL) { 1035 nhdrmp->b_next = next_nhdrmp; 1036 next_nhdrmp = NULL; 1037 seg_chain = nhdrmp; 1038 } else { 1039 freemsgchain(next_nhdrmp); 1040 /* 1041 * nhdrmp referenced the head of seg_chain when it was 1042 * freed, so further clean-up there is unnecessary 1043 */ 1044 seg_chain = NULL; 1045 mac_drop_pkt(omp, "LSO cksum emulation failed"); 1046 goto fail; 1047 } 1048 } 1049 1050 ASSERT3P(nhdrmp, !=, NULL); 1051 1052 seg = 1; 1053 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1054 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1055 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, 1056 uint_t, seg); 1057 seg++; 1058 1059 /* There better be at least 2 segs. */ 1060 ASSERT3P(nhdrmp->b_next, !=, NULL); 1061 prev_nhdrmp = nhdrmp; 1062 nhdrmp = nhdrmp->b_next; 1063 1064 /* 1065 * Now adjust the headers of the middle segments. For each 1066 * header we need to adjust the following. 
1067 * 1068 * o IP ID 1069 * o IP length 1070 * o TCP sequence 1071 * o TCP flags 1072 * o cksum flags 1073 * o cksum values (if MAC_HWCKSUM_EMUL is set) 1074 */ 1075 for (; seg < nsegs; seg++) { 1076 /* 1077 * We use seg_chain as a reference to the first seg 1078 * header mblk -- this first header is a template for 1079 * the rest of the segments. This copy will include 1080 * the now updated checksum values from the first 1081 * header. We must reset these checksum values to 1082 * their original to make sure we produce the correct 1083 * value. 1084 */ 1085 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); 1086 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1087 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1088 niph->ipha_ident = htons(++ip_id); 1089 ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); 1090 niph->ipha_length = htons(oiphlen + otcphlen + mss); 1091 niph->ipha_hdr_checksum = 0; 1092 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1093 U32_TO_BE32(tcp_seq, ntcph->th_seq); 1094 tcp_seq += mss; 1095 /* 1096 * Just like the first segment, the middle segments 1097 * shouldn't have these flags set. 1098 */ 1099 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); 1100 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1101 1102 if (ocsum_flags & HCK_PARTIALCKSUM) { 1103 /* 1104 * First and middle segs have same 1105 * pseudo-header checksum. 1106 */ 1107 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1108 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1109 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1110 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1111 } 1112 1113 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1114 (emul & MAC_HWCKSUM_EMULS)) { 1115 next_nhdrmp = nhdrmp->b_next; 1116 nhdrmp->b_next = NULL; 1117 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1118 /* 1119 * Like above, handle cases where mac_sw_cksum() does a 1120 * pull-up or drop of the mblk. 
1121 */ 1122 if (nhdrmp != NULL) { 1123 nhdrmp->b_next = next_nhdrmp; 1124 next_nhdrmp = NULL; 1125 prev_nhdrmp->b_next = nhdrmp; 1126 } else { 1127 freemsgchain(next_nhdrmp); 1128 /* 1129 * Critical to de-link the now-freed nhdrmp 1130 * before freeing the rest of the preceding 1131 * chain. 1132 */ 1133 prev_nhdrmp->b_next = NULL; 1134 freemsgchain(seg_chain); 1135 seg_chain = NULL; 1136 mac_drop_pkt(omp, "LSO cksum emulation failed"); 1137 goto fail; 1138 } 1139 } 1140 1141 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1142 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1143 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), 1144 uint_t, mss, uint_t, seg); 1145 1146 ASSERT3P(nhdrmp->b_next, !=, NULL); 1147 prev_nhdrmp = nhdrmp; 1148 nhdrmp = nhdrmp->b_next; 1149 } 1150 1151 /* Make sure we are on the last segment. */ 1152 ASSERT3U(seg, ==, nsegs); 1153 ASSERT3P(nhdrmp->b_next, ==, NULL); 1154 1155 /* 1156 * Now we set the last segment header. The difference being 1157 * that FIN/PSH/RST flags are allowed. 
1158 */ 1159 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); 1160 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1161 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1162 niph->ipha_ident = htons(++ip_id); 1163 len = msgsize(nhdrmp->b_cont); 1164 ASSERT3S(len, >, 0); 1165 niph->ipha_length = htons(oiphlen + otcphlen + len); 1166 niph->ipha_hdr_checksum = 0; 1167 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1168 U32_TO_BE32(tcp_seq, ntcph->th_seq); 1169 1170 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1171 if (ocsum_flags & HCK_PARTIALCKSUM) { 1172 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1173 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1174 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1175 tcp_sum = otcp_sum; 1176 tcp_sum += len + otcphlen; 1177 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); 1178 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1179 } 1180 1181 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1182 (emul & MAC_HWCKSUM_EMULS)) { 1183 /* This should be the last mblk. */ 1184 ASSERT3P(nhdrmp->b_next, ==, NULL); 1185 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1186 /* 1187 * If the final mblk happens to be dropped as part of 1188 * mac_sw_cksum(), that is unfortunate, but it need not be a 1189 * show-stopper at this point. We can just pretend that final 1190 * packet was dropped in transit. 1191 */ 1192 prev_nhdrmp->b_next = nhdrmp; 1193 } 1194 1195 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1196 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1197 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, 1198 uint_t, seg); 1199 1200 /* 1201 * Free the reference to the original LSO message as it is 1202 * being replaced by seg_cahin. 
1203 */ 1204 freemsg(omp); 1205 *head = seg_chain; 1206 *tail = nhdrmp; 1207 *count = nsegs; 1208 return; 1209 1210 fail: 1211 *head = NULL; 1212 *tail = NULL; 1213 *count = 0; 1214 } 1215 1216 #define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) 1217 1218 /* 1219 * Emulate various hardware offload features in software. Take a chain 1220 * of packets as input and emulate the hardware features specified in 1221 * 'emul'. The resulting chain's head pointer replaces the 'mp_chain' 1222 * pointer given as input, and its tail pointer is written to 1223 * '*otail'. The number of packets in the new chain is written to 1224 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus 1225 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in 1226 * which case 'mp_chain' will simply stay a NULL chain. 1227 * 1228 * While unlikely, it is technically possible that this function could 1229 * receive a non-NULL chain as input and return a NULL chain as output 1230 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be 1231 * zero). This could happen if all the packets in the chain are 1232 * dropped or if we fail to allocate new mblks. In this case, there is 1233 * nothing for the caller to free. In any event, the caller shouldn't 1234 * assume that '*mp_chain' is non-NULL on return. 1235 * 1236 * This function was written with three main use cases in mind. 1237 * 1238 * 1. To emulate hardware offloads when traveling mac-loopback (two 1239 * clients on the same mac). This is wired up in mac_tx_send(). 1240 * 1241 * 2. To provide hardware offloads to the client when the underlying 1242 * provider cannot. This is currently wired up in mac_tx() but we 1243 * still only negotiate offloads when the underlying provider 1244 * supports them. 1245 * 1246 * 3. To emulate real hardware in simnet. 
1247 */ 1248 void 1249 mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) 1250 { 1251 mblk_t *head = NULL, *tail = NULL; 1252 uint_t count = 0; 1253 1254 ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); 1255 ASSERT3P(mp_chain, !=, NULL); 1256 1257 for (mblk_t *mp = *mp_chain; mp != NULL; ) { 1258 mblk_t *tmp, *next, *tmphead, *tmptail; 1259 struct ether_header *ehp; 1260 uint32_t flags; 1261 uint_t len = MBLKL(mp), l2len; 1262 1263 /* Perform LSO/cksum one message at a time. */ 1264 next = mp->b_next; 1265 mp->b_next = NULL; 1266 1267 /* 1268 * For our sanity the first mblk should contain at 1269 * least the full L2 header. 1270 */ 1271 if (len < sizeof (struct ether_header)) { 1272 mac_drop_pkt(mp, "packet too short (A): %u", len); 1273 mp = next; 1274 continue; 1275 } 1276 1277 ehp = (struct ether_header *)mp->b_rptr; 1278 if (ntohs(ehp->ether_type) == VLAN_TPID) 1279 l2len = sizeof (struct ether_vlan_header); 1280 else 1281 l2len = sizeof (struct ether_header); 1282 1283 /* 1284 * If the first mblk is solely the L2 header, then 1285 * there better be more data. 1286 */ 1287 if (len < l2len || (len == l2len && mp->b_cont == NULL)) { 1288 mac_drop_pkt(mp, "packet too short (C): %u", len); 1289 mp = next; 1290 continue; 1291 } 1292 1293 DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); 1294 1295 /* 1296 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) 1297 * because we don't want to mask-out the LSO flag. 1298 */ 1299 flags = DB_CKSUMFLAGS(mp); 1300 1301 if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { 1302 uint_t tmpcount = 0; 1303 1304 /* 1305 * LSO fix-up handles checksum emulation 1306 * inline (if requested). It also frees mp. 1307 */ 1308 mac_sw_lso(mp, emul, &tmphead, &tmptail, 1309 &tmpcount); 1310 if (tmphead == NULL) { 1311 /* mac_sw_lso() freed the mp. 
*/ 1312 mp = next; 1313 continue; 1314 } 1315 count += tmpcount; 1316 } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { 1317 tmp = mac_sw_cksum(mp, emul); 1318 if (tmp == NULL) { 1319 /* mac_sw_cksum() freed the mp. */ 1320 mp = next; 1321 continue; 1322 } 1323 tmphead = tmp; 1324 tmptail = tmp; 1325 count++; 1326 } else { 1327 /* There is nothing to emulate. */ 1328 tmp = mp; 1329 tmphead = tmp; 1330 tmptail = tmp; 1331 count++; 1332 } 1333 1334 /* 1335 * The tmp mblk chain is either the start of the new 1336 * chain or added to the tail of the new chain. 1337 */ 1338 if (head == NULL) { 1339 head = tmphead; 1340 tail = tmptail; 1341 } else { 1342 /* Attach the new mblk to the end of the new chain. */ 1343 tail->b_next = tmphead; 1344 tail = tmptail; 1345 } 1346 1347 mp = next; 1348 } 1349 1350 *mp_chain = head; 1351 1352 if (otail != NULL) 1353 *otail = tail; 1354 1355 if (ocount != NULL) 1356 *ocount = count; 1357 } 1358 1359 /* 1360 * Add VLAN tag to the specified mblk. 1361 */ 1362 mblk_t * 1363 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) 1364 { 1365 mblk_t *hmp; 1366 struct ether_vlan_header *evhp; 1367 struct ether_header *ehp; 1368 1369 ASSERT(pri != 0 || vid != 0); 1370 1371 /* 1372 * Allocate an mblk for the new tagged ethernet header, 1373 * and copy the MAC addresses and ethertype from the 1374 * original header. 1375 */ 1376 1377 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); 1378 if (hmp == NULL) { 1379 freemsg(mp); 1380 return (NULL); 1381 } 1382 1383 evhp = (struct ether_vlan_header *)hmp->b_rptr; 1384 ehp = (struct ether_header *)mp->b_rptr; 1385 1386 bcopy(ehp, evhp, (ETHERADDRL * 2)); 1387 evhp->ether_type = ehp->ether_type; 1388 evhp->ether_tpid = htons(ETHERTYPE_VLAN); 1389 1390 hmp->b_wptr += sizeof (struct ether_vlan_header); 1391 mp->b_rptr += sizeof (struct ether_header); 1392 1393 /* 1394 * Free the original message if it's now empty. Link the 1395 * rest of messages to the header message. 
1396 */ 1397 mac_hcksum_clone(mp, hmp); 1398 if (MBLKL(mp) == 0) { 1399 hmp->b_cont = mp->b_cont; 1400 freeb(mp); 1401 } else { 1402 hmp->b_cont = mp; 1403 } 1404 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); 1405 1406 /* 1407 * Initialize the new TCI (Tag Control Information). 1408 */ 1409 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); 1410 1411 return (hmp); 1412 } 1413 1414 /* 1415 * Adds a VLAN tag with the specified VID and priority to each mblk of 1416 * the specified chain. 1417 */ 1418 mblk_t * 1419 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) 1420 { 1421 mblk_t *next_mp, **prev, *mp; 1422 1423 mp = mp_chain; 1424 prev = &mp_chain; 1425 1426 while (mp != NULL) { 1427 next_mp = mp->b_next; 1428 mp->b_next = NULL; 1429 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { 1430 freemsgchain(next_mp); 1431 break; 1432 } 1433 *prev = mp; 1434 prev = &mp->b_next; 1435 mp = mp->b_next = next_mp; 1436 } 1437 1438 return (mp_chain); 1439 } 1440 1441 /* 1442 * Strip VLAN tag 1443 */ 1444 mblk_t * 1445 mac_strip_vlan_tag(mblk_t *mp) 1446 { 1447 mblk_t *newmp; 1448 struct ether_vlan_header *evhp; 1449 1450 evhp = (struct ether_vlan_header *)mp->b_rptr; 1451 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { 1452 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); 1453 1454 if (DB_REF(mp) > 1) { 1455 newmp = copymsg(mp); 1456 if (newmp == NULL) 1457 return (NULL); 1458 freemsg(mp); 1459 mp = newmp; 1460 } 1461 1462 evhp = (struct ether_vlan_header *)mp->b_rptr; 1463 1464 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); 1465 mp->b_rptr += VLAN_TAGSZ; 1466 } 1467 return (mp); 1468 } 1469 1470 /* 1471 * Strip VLAN tag from each mblk of the chain. 
 */
mblk_t *
mac_strip_vlan_tag_chain(mblk_t *mp_chain)
{
	mblk_t *mp, *next_mp, **prev;

	mp = mp_chain;
	prev = &mp_chain;

	while (mp != NULL) {
		next_mp = mp->b_next;
		mp->b_next = NULL;
		/*
		 * mac_strip_vlan_tag() fails only when copymsg()
		 * fails; in that case the current (still valid,
		 * still tagged) mblk remains linked via '*prev' and
		 * the rest of the chain is freed.
		 */
		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
			freemsgchain(next_mp);
			break;
		}
		*prev = mp;
		prev = &mp->b_next;
		mp = mp->b_next = next_mp;
	}

	return (mp_chain);
}

/*
 * Default callback function. Used when the datapath is not yet initialized.
 */
/* ARGSUSED */
void
mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
    boolean_t loopback)
{
	freemsgchain(mp_chain);
}

/*
 * Determines the IPv6 header length accounting for all the optional IPv6
 * headers (hop-by-hop, destination, routing and fragment). The header length
 * and next header value (a transport header) is captured.
 *
 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
 * returns B_TRUE.
 */
boolean_t
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
	uint16_t length;
	uint_t ehdrlen;
	uint8_t *whereptr;
	uint8_t *nexthdrp;
	ip6_dest_t *desthdr;
	ip6_rthdr_t *rthdr;
	ip6_frag_t *fraghdr;

	/* The fixed v6 header must fit entirely before endptr. */
	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
		return (B_FALSE);
	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
	length = IPV6_HDR_LEN;
	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */

	if (fragp != NULL)
		*fragp = NULL;

	nexthdrp = &ip6h->ip6_nxt;
	while (whereptr < endptr) {
		/* Is there enough left for len + nexthdr? */
		if (whereptr + MIN_EHDR_LEN > endptr)
			break;

		switch (*nexthdrp) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
			/* Assumes the headers are identical for hbh and dst */
			desthdr = (ip6_dest_t *)whereptr;
			/* Extension header length is in 8-byte units. */
			ehdrlen = 8 * (desthdr->ip6d_len + 1);
			if ((uchar_t *)desthdr + ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &desthdr->ip6d_nxt;
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			ehdrlen = 8 * (rthdr->ip6r_len + 1);
			if ((uchar_t *)rthdr + ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &rthdr->ip6r_nxt;
			break;
		case IPPROTO_FRAGMENT:
			fraghdr = (ip6_frag_t *)whereptr;
			ehdrlen = sizeof (ip6_frag_t);
			if ((uchar_t *)&fraghdr[1] > endptr)
				return (B_FALSE);
			nexthdrp = &fraghdr->ip6f_nxt;
			/* Report the fragment header if asked for. */
			if (fragp != NULL)
				*fragp = fraghdr;
			break;
		case IPPROTO_NONE:
			/* No next header means we're finished */
		default:
			*hdr_length = length;
			*next_hdr = *nexthdrp;
			return (B_TRUE);
		}
		length += ehdrlen;
		whereptr += ehdrlen;
		/* Keep outputs current in case the loop exits early. */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
	}
	switch (*nexthdrp) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_FRAGMENT:
		/*
		 * If any known extension headers are still to be processed,
		 * the packet's malformed (or at least all the IP header(s)
		 * are not in the same mblk) - and that should never happen.
		 */
		return (B_FALSE);

	default:
		/*
		 * If we get here, we know that all of the IP headers were in
		 * the same mblk, even if the ULP header is in the next mblk.
		 */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
		return (B_TRUE);
	}
}

/*
 * The following set of routines are there to take care of interrupt
 * re-targeting for legacy (fixed) interrupts.
Some older versions 1606 * of the popular NICs like e1000g do not support MSI-X interrupts 1607 * and they reserve fixed interrupts for RX/TX rings. To re-target 1608 * these interrupts, PCITOOL ioctls need to be used. 1609 */ 1610 typedef struct mac_dladm_intr { 1611 int ino; 1612 int cpu_id; 1613 char driver_path[MAXPATHLEN]; 1614 char nexus_path[MAXPATHLEN]; 1615 } mac_dladm_intr_t; 1616 1617 /* Bind the interrupt to cpu_num */ 1618 static int 1619 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino) 1620 { 1621 pcitool_intr_set_t iset; 1622 int err; 1623 1624 iset.old_cpu = oldcpuid; 1625 iset.ino = ino; 1626 iset.cpu_id = cpu_num; 1627 iset.user_version = PCITOOL_VERSION; 1628 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL, 1629 kcred, NULL); 1630 1631 return (err); 1632 } 1633 1634 /* 1635 * Search interrupt information. iget is filled in with the info to search 1636 */ 1637 static boolean_t 1638 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) 1639 { 1640 int i; 1641 char driver_path[2 * MAXPATHLEN]; 1642 1643 for (i = 0; i < iget_p->num_devs; i++) { 1644 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); 1645 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, 1646 ":%s%d", iget_p->dev[i].driver_name, 1647 iget_p->dev[i].dev_inst); 1648 /* Match the device path for the device path */ 1649 if (strcmp(driver_path, dln->driver_path) == 0) { 1650 dln->ino = iget_p->ino; 1651 dln->cpu_id = iget_p->cpu_id; 1652 return (B_TRUE); 1653 } 1654 } 1655 return (B_FALSE); 1656 } 1657 1658 /* 1659 * Get information about ino, i.e. if this is the interrupt for our 1660 * device and where it is bound etc. 
 */
static boolean_t
mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
    mac_dladm_intr_t *dln)
{
	pcitool_intr_get_t *iget_p;
	int ipsz;
	int nipsz;
	int err;
	uint8_t inum;

	/*
	 * Check if SLEEP is OK, i.e if could come here in response to
	 * changing the fanout due to some callback from the driver, say
	 * link speed changes.
	 */
	/* First pass: a zero-device buffer just to learn num_devs. */
	ipsz = PCITOOL_IGET_SIZE(0);
	iget_p = kmem_zalloc(ipsz, KM_SLEEP);

	iget_p->num_devs_ret = 0;
	iget_p->user_version = PCITOOL_VERSION;
	iget_p->cpu_id = oldcpuid;
	iget_p->ino = ino;

	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
	    FKIOCTL, kcred, NULL);
	if (err != 0) {
		kmem_free(iget_p, ipsz);
		return (B_FALSE);
	}
	if (iget_p->num_devs == 0) {
		kmem_free(iget_p, ipsz);
		return (B_FALSE);
	}
	/*
	 * NOTE(review): inum is uint8_t; if num_devs ever exceeded 255
	 * this would truncate and the defensive re-check below would
	 * reject the result -- confirm num_devs is bounded accordingly.
	 */
	inum = iget_p->num_devs;
	if (iget_p->num_devs_ret < iget_p->num_devs) {
		/* Reallocate with room for every device, then retry. */
		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);

		kmem_free(iget_p, ipsz);
		ipsz = nipsz;
		iget_p = kmem_zalloc(ipsz, KM_SLEEP);

		iget_p->num_devs_ret = inum;
		iget_p->cpu_id = oldcpuid;
		iget_p->ino = ino;
		iget_p->user_version = PCITOOL_VERSION;
		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
		    FKIOCTL, kcred, NULL);
		if (err != 0) {
			kmem_free(iget_p, ipsz);
			return (B_FALSE);
		}
		/* defensive */
		if (iget_p->num_devs != iget_p->num_devs_ret) {
			kmem_free(iget_p, ipsz);
			return (B_FALSE);
		}
	}

	/* Search the returned devices for our driver path. */
	if (mac_search_intrinfo(iget_p, dln)) {
		kmem_free(iget_p, ipsz);
		return (B_TRUE);
	}
	kmem_free(iget_p, ipsz);
	return (B_FALSE);
}

/*
 * Get the interrupts and check each one to see if it is for our device.
1731 */ 1732 static int 1733 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) 1734 { 1735 pcitool_intr_info_t intr_info; 1736 int err; 1737 int ino; 1738 int oldcpuid; 1739 1740 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, 1741 FKIOCTL, kcred, NULL); 1742 if (err != 0) 1743 return (-1); 1744 1745 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) { 1746 for (ino = 0; ino < intr_info.num_intr; ino++) { 1747 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) { 1748 if (dln->cpu_id == cpuid) 1749 return (0); 1750 return (1); 1751 } 1752 } 1753 } 1754 return (-1); 1755 } 1756 1757 /* 1758 * Obtain the nexus parent node info. for mdip. 1759 */ 1760 static dev_info_t * 1761 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln) 1762 { 1763 struct dev_info *tdip = (struct dev_info *)mdip; 1764 struct ddi_minor_data *minordata; 1765 dev_info_t *pdip; 1766 char pathname[MAXPATHLEN]; 1767 1768 while (tdip != NULL) { 1769 /* 1770 * The netboot code could call this function while walking the 1771 * device tree so we need to use ndi_devi_tryenter() here to 1772 * avoid deadlock. 
1773 */ 1774 if (ndi_devi_tryenter((dev_info_t *)tdip) == 0) 1775 break; 1776 1777 for (minordata = tdip->devi_minor; minordata != NULL; 1778 minordata = minordata->next) { 1779 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL, 1780 strlen(DDI_NT_INTRCTL)) == 0) { 1781 pdip = minordata->dip; 1782 (void) ddi_pathname(pdip, pathname); 1783 (void) snprintf(dln->nexus_path, MAXPATHLEN, 1784 "/devices%s:intr", pathname); 1785 (void) ddi_pathname_minor(minordata, pathname); 1786 ndi_devi_exit((dev_info_t *)tdip); 1787 return (pdip); 1788 } 1789 } 1790 ndi_devi_exit((dev_info_t *)tdip); 1791 tdip = tdip->devi_parent; 1792 } 1793 return (NULL); 1794 } 1795 1796 /* 1797 * For a primary MAC client, if the user has set a list or CPUs or 1798 * we have obtained it implicitly, we try to retarget the interrupt 1799 * for that device on one of the CPUs in the list. 1800 * We assign the interrupt to the same CPU as the poll thread. 1801 */ 1802 static boolean_t 1803 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) 1804 { 1805 ldi_handle_t lh = NULL; 1806 ldi_ident_t li = NULL; 1807 int err; 1808 int ret; 1809 mac_dladm_intr_t dln; 1810 dev_info_t *dip; 1811 struct ddi_minor_data *minordata; 1812 1813 dln.nexus_path[0] = '\0'; 1814 dln.driver_path[0] = '\0'; 1815 1816 minordata = ((struct dev_info *)mdip)->devi_minor; 1817 while (minordata != NULL) { 1818 if (minordata->type == DDM_MINOR) 1819 break; 1820 minordata = minordata->next; 1821 } 1822 if (minordata == NULL) 1823 return (B_FALSE); 1824 1825 (void) ddi_pathname_minor(minordata, dln.driver_path); 1826 1827 dip = mac_get_nexus_node(mdip, &dln); 1828 /* defensive */ 1829 if (dip == NULL) 1830 return (B_FALSE); 1831 1832 err = ldi_ident_from_major(ddi_driver_major(dip), &li); 1833 if (err != 0) 1834 return (B_FALSE); 1835 1836 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); 1837 if (err != 0) 1838 return (B_FALSE); 1839 1840 ret = mac_validate_intr(lh, &dln, cpuid); 1841 if (ret < 0) { 
1842 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1843 return (B_FALSE); 1844 } 1845 /* cmn_note? */ 1846 if (ret != 0) 1847 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino))) 1848 != 0) { 1849 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1850 return (B_FALSE); 1851 } 1852 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1853 return (B_TRUE); 1854 } 1855 1856 void 1857 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) 1858 { 1859 dev_info_t *mdip = (dev_info_t *)arg; 1860 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1861 mac_resource_props_t *mrp; 1862 mac_perim_handle_t mph; 1863 flow_entry_t *flent = mcip->mci_flent; 1864 mac_soft_ring_set_t *rx_srs; 1865 mac_cpus_t *srs_cpu; 1866 1867 if (!mac_check_interrupt_binding(mdip, cpuid)) 1868 cpuid = -1; 1869 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); 1870 mrp = MCIP_RESOURCE_PROPS(mcip); 1871 mrp->mrp_rx_intr_cpu = cpuid; 1872 if (flent != NULL && flent->fe_rx_srs_cnt == 2) { 1873 rx_srs = flent->fe_rx_srs[1]; 1874 srs_cpu = &rx_srs->srs_cpu; 1875 srs_cpu->mc_rx_intr_cpu = cpuid; 1876 } 1877 mac_perim_exit(mph); 1878 } 1879 1880 int32_t 1881 mac_client_intr_cpu(mac_client_handle_t mch) 1882 { 1883 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1884 mac_cpus_t *srs_cpu; 1885 mac_soft_ring_set_t *rx_srs; 1886 flow_entry_t *flent = mcip->mci_flent; 1887 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); 1888 mac_ring_t *ring; 1889 mac_intr_t *mintr; 1890 1891 /* 1892 * Check if we need to retarget the interrupt. We do this only 1893 * for the primary MAC client. We do this if we have the only 1894 * exclusive ring in the group. 1895 */ 1896 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { 1897 rx_srs = flent->fe_rx_srs[1]; 1898 srs_cpu = &rx_srs->srs_cpu; 1899 ring = rx_srs->srs_ring; 1900 mintr = &ring->mr_info.mri_intr; 1901 /* 1902 * If ddi_handle is present or the poll CPU is 1903 * already bound to the interrupt CPU, return -1. 
1904 */ 1905 if (mintr->mi_ddi_handle != NULL || 1906 ((mrp->mrp_ncpus != 0) && 1907 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) { 1908 return (-1); 1909 } 1910 return (srs_cpu->mc_rx_pollid); 1911 } 1912 return (-1); 1913 } 1914 1915 void * 1916 mac_get_devinfo(mac_handle_t mh) 1917 { 1918 mac_impl_t *mip = (mac_impl_t *)mh; 1919 1920 return ((void *)mip->mi_dip); 1921 } 1922 1923 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1]) 1924 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) 1925 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) 1926 1927 uint64_t 1928 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound) 1929 { 1930 struct ether_header *ehp; 1931 uint64_t hash = 0; 1932 uint16_t sap; 1933 uint_t skip_len; 1934 uint8_t proto; 1935 boolean_t ip_fragmented; 1936 1937 /* 1938 * We may want to have one of these per MAC type plugin in the 1939 * future. For now supports only ethernet. 1940 */ 1941 if (media != DL_ETHER) 1942 return (0L); 1943 1944 /* for now we support only outbound packets */ 1945 ASSERT(is_outbound); 1946 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); 1947 ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); 1948 1949 /* compute L2 hash */ 1950 1951 ehp = (struct ether_header *)mp->b_rptr; 1952 1953 if ((policy & MAC_PKT_HASH_L2) != 0) { 1954 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; 1955 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; 1956 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); 1957 policy &= ~MAC_PKT_HASH_L2; 1958 } 1959 1960 if (policy == 0) 1961 goto done; 1962 1963 /* skip ethernet header */ 1964 1965 sap = ntohs(ehp->ether_type); 1966 if (sap == ETHERTYPE_VLAN) { 1967 struct ether_vlan_header *evhp; 1968 mblk_t *newmp = NULL; 1969 1970 skip_len = sizeof (struct ether_vlan_header); 1971 if (MBLKL(mp) < skip_len) { 1972 /* the vlan tag is the payload, pull up first */ 1973 newmp = msgpullup(mp, -1); 1974 if ((newmp == NULL) || 
(MBLKL(newmp) < skip_len)) { 1975 goto done; 1976 } 1977 evhp = (struct ether_vlan_header *)newmp->b_rptr; 1978 } else { 1979 evhp = (struct ether_vlan_header *)mp->b_rptr; 1980 } 1981 1982 sap = ntohs(evhp->ether_type); 1983 freemsg(newmp); 1984 } else { 1985 skip_len = sizeof (struct ether_header); 1986 } 1987 1988 /* if ethernet header is in its own mblk, skip it */ 1989 if (MBLKL(mp) <= skip_len) { 1990 skip_len -= MBLKL(mp); 1991 mp = mp->b_cont; 1992 if (mp == NULL) 1993 goto done; 1994 } 1995 1996 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; 1997 1998 /* compute IP src/dst addresses hash and skip IPv{4,6} header */ 1999 2000 switch (sap) { 2001 case ETHERTYPE_IP: { 2002 ipha_t *iphp; 2003 2004 /* 2005 * If the header is not aligned or the header doesn't fit 2006 * in the mblk, bail now. Note that this may cause packets 2007 * reordering. 2008 */ 2009 iphp = (ipha_t *)(mp->b_rptr + skip_len); 2010 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || 2011 !OK_32PTR((char *)iphp)) 2012 goto done; 2013 2014 proto = iphp->ipha_protocol; 2015 skip_len += IPH_HDR_LENGTH(iphp); 2016 2017 /* Check if the packet is fragmented. */ 2018 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) & 2019 IPH_OFFSET; 2020 2021 /* 2022 * For fragmented packets, use addresses in addition to 2023 * the frag_id to generate the hash inorder to get 2024 * better distribution. 
2025 */ 2026 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) { 2027 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); 2028 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); 2029 2030 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 2031 PKT_HASH_4BYTES(ip_dst)); 2032 policy &= ~MAC_PKT_HASH_L3; 2033 } 2034 2035 if (ip_fragmented) { 2036 uint8_t *identp = (uint8_t *)&iphp->ipha_ident; 2037 hash ^= PKT_HASH_2BYTES(identp); 2038 goto done; 2039 } 2040 break; 2041 } 2042 case ETHERTYPE_IPV6: { 2043 ip6_t *ip6hp; 2044 ip6_frag_t *frag = NULL; 2045 uint16_t hdr_length; 2046 2047 /* 2048 * If the header is not aligned or the header doesn't fit 2049 * in the mblk, bail now. Note that this may cause packets 2050 * reordering. 2051 */ 2052 2053 ip6hp = (ip6_t *)(mp->b_rptr + skip_len); 2054 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || 2055 !OK_32PTR((char *)ip6hp)) 2056 goto done; 2057 2058 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length, 2059 &proto, &frag)) 2060 goto done; 2061 skip_len += hdr_length; 2062 2063 /* 2064 * For fragmented packets, use addresses in addition to 2065 * the frag_id to generate the hash inorder to get 2066 * better distribution. 
2067 */ 2068 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) { 2069 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); 2070 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); 2071 2072 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 2073 PKT_HASH_4BYTES(ip_dst)); 2074 policy &= ~MAC_PKT_HASH_L3; 2075 } 2076 2077 if (frag != NULL) { 2078 uint8_t *identp = (uint8_t *)&frag->ip6f_ident; 2079 hash ^= PKT_HASH_4BYTES(identp); 2080 goto done; 2081 } 2082 break; 2083 } 2084 default: 2085 goto done; 2086 } 2087 2088 if (policy == 0) 2089 goto done; 2090 2091 /* if ip header is in its own mblk, skip it */ 2092 if (MBLKL(mp) <= skip_len) { 2093 skip_len -= MBLKL(mp); 2094 mp = mp->b_cont; 2095 if (mp == NULL) 2096 goto done; 2097 } 2098 2099 /* parse ULP header */ 2100 again: 2101 switch (proto) { 2102 case IPPROTO_TCP: 2103 case IPPROTO_UDP: 2104 case IPPROTO_ESP: 2105 case IPPROTO_SCTP: 2106 /* 2107 * These Internet Protocols are intentionally designed 2108 * for hashing from the git-go. Port numbers are in the first 2109 * word for transports, SPI is first for ESP. 2110 */ 2111 if (mp->b_rptr + skip_len + 4 > mp->b_wptr) 2112 goto done; 2113 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); 2114 break; 2115 2116 case IPPROTO_AH: { 2117 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); 2118 uint_t ah_length = AH_TOTAL_LEN(ah); 2119 2120 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) 2121 goto done; 2122 2123 proto = ah->ah_nexthdr; 2124 skip_len += ah_length; 2125 2126 /* if AH header is in its own mblk, skip it */ 2127 if (MBLKL(mp) <= skip_len) { 2128 skip_len -= MBLKL(mp); 2129 mp = mp->b_cont; 2130 if (mp == NULL) 2131 goto done; 2132 } 2133 2134 goto again; 2135 } 2136 } 2137 2138 done: 2139 return (hash); 2140 } 2141