1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2019 Joyent, Inc. 24 * Copyright 2023 Oxide Computer Company 25 */ 26 27 /* 28 * MAC Services Module - misc utilities 29 */ 30 31 #include <sys/types.h> 32 #include <sys/mac.h> 33 #include <sys/mac_impl.h> 34 #include <sys/mac_client_priv.h> 35 #include <sys/mac_client_impl.h> 36 #include <sys/mac_soft_ring.h> 37 #include <sys/strsubr.h> 38 #include <sys/strsun.h> 39 #include <sys/vlan.h> 40 #include <sys/pattr.h> 41 #include <sys/pci_tools.h> 42 #include <inet/ip.h> 43 #include <inet/ip_impl.h> 44 #include <inet/ip6.h> 45 #include <sys/vtrace.h> 46 #include <sys/dlpi.h> 47 #include <sys/sunndi.h> 48 #include <inet/ipsec_impl.h> 49 #include <inet/sadb.h> 50 #include <inet/ipsecesp.h> 51 #include <inet/ipsecah.h> 52 #include <inet/tcp.h> 53 #include <inet/udp_impl.h> 54 #include <inet/sctp_ip.h> 55 56 /* 57 * The next two functions are used for dropping packets or chains of 58 * packets, respectively. 
We could use one function for both but 59 * separating the use cases allows us to specify intent and prevent 60 * dropping more data than intended. 61 * 62 * The purpose of these functions is to aid the debugging effort, 63 * especially in production. Rather than use freemsg()/freemsgchain(), 64 * it's preferable to use these functions when dropping a packet in 65 * the MAC layer. These functions should only be used during 66 * unexpected conditions. That is, any time a packet is dropped 67 * outside of the regular, successful datapath. Consolidating all 68 * drops on these functions allows the user to trace one location and 69 * determine why the packet was dropped based on the msg. It also 70 * allows the user to inspect the packet before it is freed. Finally, 71 * it allows the user to avoid tracing freemsg()/freemsgchain() thus 72 * keeping the hot path running as efficiently as possible. 73 * 74 * NOTE: At this time not all MAC drops are aggregated on these 75 * functions; but that is the plan. This comment should be erased once 76 * completed. 77 */ 78 79 /*PRINTFLIKE2*/ 80 void 81 mac_drop_pkt(mblk_t *mp, const char *fmt, ...) 82 { 83 va_list adx; 84 char msg[128]; 85 char *msgp = msg; 86 87 ASSERT3P(mp->b_next, ==, NULL); 88 89 va_start(adx, fmt); 90 (void) vsnprintf(msgp, sizeof (msg), fmt, adx); 91 va_end(adx); 92 93 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); 94 freemsg(mp); 95 } 96 97 /*PRINTFLIKE2*/ 98 void 99 mac_drop_chain(mblk_t *chain, const char *fmt, ...) 100 { 101 va_list adx; 102 char msg[128]; 103 char *msgp = msg; 104 105 va_start(adx, fmt); 106 (void) vsnprintf(msgp, sizeof (msg), fmt, adx); 107 va_end(adx); 108 109 /* 110 * We could use freemsgchain() for the actual freeing but 111 * since we are already walking the chain to fire the dtrace 112 * probe we might as well free the msg here too. 
113 */ 114 for (mblk_t *mp = chain, *next; mp != NULL; ) { 115 next = mp->b_next; 116 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); 117 mp->b_next = NULL; 118 freemsg(mp); 119 mp = next; 120 } 121 } 122 123 /* 124 * Copy an mblk, preserving its hardware checksum flags. 125 */ 126 static mblk_t * 127 mac_copymsg_cksum(mblk_t *mp) 128 { 129 mblk_t *mp1; 130 131 mp1 = copymsg(mp); 132 if (mp1 == NULL) 133 return (NULL); 134 135 mac_hcksum_clone(mp, mp1); 136 137 return (mp1); 138 } 139 140 /* 141 * Copy an mblk chain, presenting the hardware checksum flags of the 142 * individual mblks. 143 */ 144 mblk_t * 145 mac_copymsgchain_cksum(mblk_t *mp) 146 { 147 mblk_t *nmp = NULL; 148 mblk_t **nmpp = &nmp; 149 150 for (; mp != NULL; mp = mp->b_next) { 151 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) { 152 freemsgchain(nmp); 153 return (NULL); 154 } 155 156 nmpp = &((*nmpp)->b_next); 157 } 158 159 return (nmp); 160 } 161 162 /* 163 * Calculate the ULP checksum for IPv4. Return true if the calculation 164 * was successful, or false if an error occurred. If the later, place 165 * an error message into '*err'. 166 */ 167 static boolean_t 168 mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha, 169 const char **err) 170 { 171 const uint8_t proto = ipha->ipha_protocol; 172 size_t len; 173 const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha); 174 /* ULP offset from start of L2. */ 175 const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz; 176 ipaddr_t src, dst; 177 uint32_t cksum; 178 uint16_t *up; 179 180 /* 181 * We need a pointer to the ULP checksum. We're assuming the 182 * ULP checksum pointer resides in the first mblk. Our native 183 * TCP stack should always put the headers in the first mblk, 184 * but currently we have no way to guarantee that other 185 * clients don't spread headers (or even header fields) across 186 * mblks. 
187 */ 188 switch (proto) { 189 case IPPROTO_TCP: 190 ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t))); 191 if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) { 192 *err = "mblk doesn't contain TCP header"; 193 goto bail; 194 } 195 196 up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz); 197 cksum = IP_TCP_CSUM_COMP; 198 break; 199 200 case IPPROTO_UDP: 201 ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t))); 202 if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) { 203 *err = "mblk doesn't contain UDP header"; 204 goto bail; 205 } 206 207 up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz); 208 cksum = IP_UDP_CSUM_COMP; 209 break; 210 211 case IPPROTO_SCTP: { 212 sctp_hdr_t *sctph; 213 214 ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t))); 215 if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) { 216 *err = "mblk doesn't contain SCTP header"; 217 goto bail; 218 } 219 220 sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset); 221 sctph->sh_chksum = 0; 222 sctph->sh_chksum = sctp_cksum(mp, ulp_offset); 223 return (B_TRUE); 224 } 225 226 default: 227 *err = "unexpected protocol"; 228 goto bail; 229 230 } 231 232 /* Pseudo-header checksum. */ 233 src = ipha->ipha_src; 234 dst = ipha->ipha_dst; 235 len = ntohs(ipha->ipha_length) - ip_hdr_sz; 236 237 cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 238 cksum += htons(len); 239 240 /* 241 * We have already accounted for the pseudo checksum above. 242 * Make sure the ULP checksum field is zero before computing 243 * the rest. 244 */ 245 *up = 0; 246 cksum = IP_CSUM(mp, ulp_offset, cksum); 247 *up = (uint16_t)(cksum ? cksum : ~cksum); 248 249 return (B_TRUE); 250 251 bail: 252 return (B_FALSE); 253 } 254 255 /* 256 * Calculate the ULP checksum for IPv6. Return true if the calculation 257 * was successful, or false if an error occurred. If the later, place 258 * an error message into '*err'. 
259 */ 260 static boolean_t 261 mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err) 262 { 263 ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset); 264 const uint8_t proto = ip6h->ip6_nxt; 265 const uint16_t *iphs = (uint16_t *)ip6h; 266 /* ULP offset from start of L2. */ 267 uint32_t ulp_offset; 268 size_t len; 269 uint32_t cksum; 270 uint16_t *up; 271 uint16_t ip_hdr_sz; 272 273 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) { 274 *err = "malformed IPv6 header"; 275 goto bail; 276 } 277 278 ulp_offset = ip_hdr_offset + ip_hdr_sz; 279 280 /* 281 * We need a pointer to the ULP checksum. We're assuming the 282 * ULP checksum pointer resides in the first mblk. Our native 283 * TCP stack should always put the headers in the first mblk, 284 * but currently we have no way to guarantee that other 285 * clients don't spread headers (or even header fields) across 286 * mblks. 287 */ 288 switch (proto) { 289 case IPPROTO_TCP: 290 ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t))); 291 if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) { 292 *err = "mblk doesn't contain TCP header"; 293 goto bail; 294 } 295 296 up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz); 297 cksum = IP_TCP_CSUM_COMP; 298 break; 299 300 case IPPROTO_UDP: 301 ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t))); 302 if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) { 303 *err = "mblk doesn't contain UDP header"; 304 goto bail; 305 } 306 307 up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz); 308 cksum = IP_UDP_CSUM_COMP; 309 break; 310 311 case IPPROTO_SCTP: { 312 sctp_hdr_t *sctph; 313 314 ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t))); 315 if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) { 316 *err = "mblk doesn't contain SCTP header"; 317 goto bail; 318 } 319 320 sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset); 321 /* 322 * Zero out the checksum field to ensure proper 323 * checksum calculation. 
324 */ 325 sctph->sh_chksum = 0; 326 sctph->sh_chksum = sctp_cksum(mp, ulp_offset); 327 return (B_TRUE); 328 } 329 330 default: 331 *err = "unexpected protocol"; 332 goto bail; 333 } 334 335 /* 336 * The payload length includes the payload and the IPv6 337 * extension headers; the idea is to subtract the extension 338 * header length to get the real payload length. 339 */ 340 len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN); 341 cksum += len; 342 343 /* 344 * We accumulate the pseudo header checksum in cksum; then we 345 * call IP_CSUM to compute the checksum over the payload. 346 */ 347 cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] + 348 iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] + 349 iphs[16] + iphs[17] + iphs[18] + iphs[19]; 350 cksum = IP_CSUM(mp, ulp_offset, cksum); 351 352 /* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */ 353 if (proto == IPPROTO_UDP && cksum == 0) 354 cksum = ~cksum; 355 356 *up = (uint16_t)cksum; 357 358 return (B_TRUE); 359 360 bail: 361 return (B_FALSE); 362 } 363 364 /* 365 * Perform software checksum on a single message, if needed. The 366 * emulation performed is determined by an intersection of the mblk's 367 * flags and the emul flags requested. The emul flags are documented 368 * in mac.h. 369 */ 370 static mblk_t * 371 mac_sw_cksum(mblk_t *mp, mac_emul_t emul) 372 { 373 mblk_t *skipped_hdr = NULL; 374 uint32_t flags, start, stuff, end, value; 375 uint32_t ip_hdr_offset; 376 uint16_t etype; 377 size_t ip_hdr_sz; 378 struct ether_header *ehp; 379 const char *err = ""; 380 381 /* 382 * This function should only be called from mac_hw_emul() 383 * which handles mblk chains and the shared ref case. 384 */ 385 ASSERT3P(mp->b_next, ==, NULL); 386 387 mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL); 388 389 flags = DB_CKSUMFLAGS(mp); 390 391 /* Why call this if checksum emulation isn't needed? 
*/ 392 ASSERT3U(flags & (HCK_FLAGS), !=, 0); 393 394 /* 395 * Ethernet, and optionally VLAN header. mac_hw_emul() has 396 * already verified we have enough data to read the L2 header. 397 */ 398 ehp = (struct ether_header *)mp->b_rptr; 399 if (ntohs(ehp->ether_type) == VLAN_TPID) { 400 struct ether_vlan_header *evhp; 401 402 evhp = (struct ether_vlan_header *)mp->b_rptr; 403 etype = ntohs(evhp->ether_type); 404 ip_hdr_offset = sizeof (struct ether_vlan_header); 405 } else { 406 etype = ntohs(ehp->ether_type); 407 ip_hdr_offset = sizeof (struct ether_header); 408 } 409 410 /* 411 * If this packet isn't IP, then leave it alone. We don't want 412 * to affect non-IP traffic like ARP. Assume the IP header 413 * doesn't include any options, for now. We will use the 414 * correct size later after we know there are enough bytes to 415 * at least fill out the basic header. 416 */ 417 switch (etype) { 418 case ETHERTYPE_IP: 419 ip_hdr_sz = sizeof (ipha_t); 420 break; 421 case ETHERTYPE_IPV6: 422 ip_hdr_sz = sizeof (ip6_t); 423 break; 424 default: 425 return (mp); 426 } 427 428 ASSERT3U(MBLKL(mp), >=, ip_hdr_offset); 429 430 /* 431 * If the first mblk of this packet contains only the ethernet 432 * header, skip past it for now. Packets with their data 433 * contained in only a single mblk can then use the fastpaths 434 * tuned to that possibility. 435 */ 436 if (MBLKL(mp) == ip_hdr_offset) { 437 ip_hdr_offset -= MBLKL(mp); 438 /* This is guaranteed by mac_hw_emul(). */ 439 ASSERT3P(mp->b_cont, !=, NULL); 440 skipped_hdr = mp; 441 mp = mp->b_cont; 442 } 443 444 /* 445 * Both full and partial checksum rely on finding the IP 446 * header in the current mblk. Our native TCP stack honors 447 * this assumption but it's prudent to guard our future 448 * clients that might not honor this contract. 
449 */ 450 ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz); 451 if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) { 452 err = "mblk doesn't contain IP header"; 453 goto bail; 454 } 455 456 /* 457 * We are about to modify the header mblk; make sure we are 458 * modifying our own copy. The code that follows assumes that 459 * the IP/ULP headers exist in this mblk (and drops the 460 * message if they don't). 461 */ 462 if (DB_REF(mp) > 1) { 463 mblk_t *tmp = copyb(mp); 464 465 if (tmp == NULL) { 466 err = "copyb failed"; 467 goto bail; 468 } 469 470 if (skipped_hdr != NULL) { 471 ASSERT3P(skipped_hdr->b_cont, ==, mp); 472 skipped_hdr->b_cont = tmp; 473 } 474 475 tmp->b_cont = mp->b_cont; 476 freeb(mp); 477 mp = tmp; 478 } 479 480 if (etype == ETHERTYPE_IP) { 481 ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset); 482 483 if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { 484 if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err)) 485 goto bail; 486 } 487 488 /* We always update the ULP checksum flags. */ 489 if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { 490 flags &= ~HCK_FULLCKSUM; 491 flags |= HCK_FULLCKSUM_OK; 492 value = 0; 493 } 494 495 /* 496 * While unlikely, it's possible to write code that 497 * might end up calling mac_sw_cksum() twice on the 498 * same mblk (performing both LSO and checksum 499 * emualtion in a single mblk chain loop -- the LSO 500 * emulation inserts a new chain into the existing 501 * chain and then the loop iterates back over the new 502 * segments and emulates the checksum a second time). 503 * Normally this wouldn't be a problem, because the 504 * HCK_*_OK flags are supposed to indicate that we 505 * don't need to do peform the work. But 506 * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the 507 * same value; so we cannot use these flags to 508 * determine if the IP header checksum has already 509 * been calculated or not. For this reason, we zero 510 * out the the checksum first. 
In the future, we 511 * should fix the HCK_* flags. 512 */ 513 if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { 514 ipha->ipha_hdr_checksum = 0; 515 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); 516 flags &= ~HCK_IPV4_HDRCKSUM; 517 flags |= HCK_IPV4_HDRCKSUM_OK; 518 } 519 } else if (etype == ETHERTYPE_IPV6) { 520 /* There is no IP header checksum for IPv6. */ 521 if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { 522 if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err)) 523 goto bail; 524 flags &= ~HCK_FULLCKSUM; 525 flags |= HCK_FULLCKSUM_OK; 526 value = 0; 527 } 528 } 529 530 /* 531 * Partial checksum is the same for both IPv4 and IPv6. 532 */ 533 if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { 534 uint16_t *up, partial, cksum; 535 uchar_t *ipp; /* ptr to beginning of IP header */ 536 537 ipp = mp->b_rptr + ip_hdr_offset; 538 up = (uint16_t *)((uchar_t *)ipp + stuff); 539 partial = *up; 540 *up = 0; 541 542 ASSERT3S(end, >, start); 543 cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial); 544 *up = cksum != 0 ? cksum : ~cksum; 545 } 546 547 /* We always update the ULP checksum flags. */ 548 if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { 549 flags &= ~HCK_PARTIALCKSUM; 550 flags |= HCK_FULLCKSUM_OK; 551 value = 0; 552 } 553 554 mac_hcksum_set(mp, start, stuff, end, value, flags); 555 556 /* Don't forget to reattach the header. */ 557 if (skipped_hdr != NULL) { 558 ASSERT3P(skipped_hdr->b_cont, ==, mp); 559 560 /* 561 * Duplicate the HCKSUM data into the header mblk. 562 * This mimics mac_add_vlan_tag which ensures that 563 * both the first mblk _and_ the first data bearing 564 * mblk possess the HCKSUM information. Consumers like 565 * IP will end up discarding the ether_header mblk, so 566 * for now, it is important that the data be available 567 * in both places. 
568 */ 569 mac_hcksum_clone(mp, skipped_hdr); 570 mp = skipped_hdr; 571 } 572 573 return (mp); 574 575 bail: 576 if (skipped_hdr != NULL) { 577 ASSERT3P(skipped_hdr->b_cont, ==, mp); 578 mp = skipped_hdr; 579 } 580 581 mac_drop_pkt(mp, err); 582 return (NULL); 583 } 584 585 /* 586 * Build a single data segment from an LSO packet. The mblk chain 587 * returned, seg_head, represents the data segment and is always 588 * exactly seg_len bytes long. The lso_mp and offset input/output 589 * parameters track our position in the LSO packet. This function 590 * exists solely as a helper to mac_sw_lso(). 591 * 592 * Case A 593 * 594 * The current lso_mp is larger than the requested seg_len. The 595 * beginning of seg_head may start at the beginning of lso_mp or 596 * offset into it. In either case, a single mblk is returned, and 597 * *offset is updated to reflect our new position in the current 598 * lso_mp. 599 * 600 * +----------------------------+ 601 * | in *lso_mp / out *lso_mp | 602 * +----------------------------+ 603 * ^ ^ 604 * | | 605 * | | 606 * | | 607 * +------------------------+ 608 * | seg_head | 609 * +------------------------+ 610 * ^ ^ 611 * | | 612 * in *offset = 0 out *offset = seg_len 613 * 614 * |------ seg_len ----| 615 * 616 * 617 * +------------------------------+ 618 * | in *lso_mp / out *lso_mp | 619 * +------------------------------+ 620 * ^ ^ 621 * | | 622 * | | 623 * | | 624 * +------------------------+ 625 * | seg_head | 626 * +------------------------+ 627 * ^ ^ 628 * | | 629 * in *offset = N out *offset = N + seg_len 630 * 631 * |------ seg_len ----| 632 * 633 * 634 * 635 * Case B 636 * 637 * The requested seg_len consumes exactly the rest of the lso_mp. 638 * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr. 639 * The seg_head may start at the beginning of the lso_mp or at some 640 * offset into it. In either case we return a single mblk, reset 641 * *offset to zero, and walk to the next lso_mp. 
 *
 *          +------------------------+           +------------------------+
 *          |       in *lso_mp       |---------->|      out *lso_mp       |
 *          +------------------------+           +------------------------+
 *          ^                        ^           ^
 *          |                        |           |
 *          |                        |    out *offset = 0
 *          |                        |
 *          +------------------------+
 *          |        seg_head        |
 *          +------------------------+
 *          ^
 *          |
 *   in *offset = 0
 *
 *          |------   seg_len    ----|
 *
 *
 *
 *      +----------------------------+           +------------------------+
 *      |         in *lso_mp         |---------->|      out *lso_mp       |
 *      +----------------------------+           +------------------------+
 *          ^                        ^           ^
 *          |                        |           |
 *          |                        |    out *offset = 0
 *          |                        |
 *          +------------------------+
 *          |        seg_head        |
 *          +------------------------+
 *          ^
 *          |
 *   in *offset = N
 *
 *          |------   seg_len    ----|
 *
 *
 *   Case C
 *
 *     The requested seg_len is greater than the current lso_mp. In
 *     this case we must consume LSO mblks until we have enough data to
 *     satisfy either case (A) or (B) above. We will return multiple
 *     mblks linked via b_cont, offset will be set based on the cases
 *     above, and lso_mp will walk forward at least one mblk, but maybe
 *     more.
 *
 *     N.B. This diagram is not exhaustive. The seg_head may start on
 *     the beginning of an lso_mp. The seg_tail may end exactly on the
 *     boundary of an lso_mp. And there may be two (in this case the
 *     middle block wouldn't exist), three, or more mblks in the
 *     seg_head chain. This is meant as one example of what might
 *     happen. The main thing to remember is that the seg_tail mblk
 *     must be one of case (A) or (B) above.
694 * 695 * +------------------+ +----------------+ +------------------+ 696 * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp | 697 * +------------------+ +----------------+ +------------------+ 698 * ^ ^ ^ ^ ^ ^ 699 * | | | | | | 700 * | | | | | | 701 * | | | | | | 702 * | | | | | | 703 * +------------+ +----------------+ +------------+ 704 * | seg_head |--->| |--->| seg_tail | 705 * +------------+ +----------------+ +------------+ 706 * ^ ^ 707 * | | 708 * in *offset = N out *offset = MBLKL(seg_tail) 709 * 710 * |------------------- seg_len -------------------| 711 * 712 */ 713 static mblk_t * 714 build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len) 715 { 716 mblk_t *seg_head, *seg_tail, *seg_mp; 717 718 ASSERT3P(*lso_mp, !=, NULL); 719 ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr); 720 721 seg_mp = dupb(*lso_mp); 722 if (seg_mp == NULL) 723 return (NULL); 724 725 seg_head = seg_mp; 726 seg_tail = seg_mp; 727 728 /* Continue where we left off from in the lso_mp. */ 729 seg_mp->b_rptr += *offset; 730 731 last_mblk: 732 /* Case (A) */ 733 if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) { 734 *offset += seg_len; 735 seg_mp->b_wptr = seg_mp->b_rptr + seg_len; 736 return (seg_head); 737 } 738 739 /* Case (B) */ 740 if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) { 741 *offset = 0; 742 *lso_mp = (*lso_mp)->b_cont; 743 return (seg_head); 744 } 745 746 /* Case (C) */ 747 ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr); 748 749 /* 750 * The current LSO mblk doesn't have enough data to satisfy 751 * seg_len -- continue peeling off LSO mblks to build the new 752 * segment message. If allocation fails we free the previously 753 * allocated segment mblks and return NULL. 
754 */ 755 while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) { 756 ASSERT3U(MBLKL(seg_mp), <=, seg_len); 757 seg_len -= MBLKL(seg_mp); 758 *offset = 0; 759 *lso_mp = (*lso_mp)->b_cont; 760 seg_mp = dupb(*lso_mp); 761 762 if (seg_mp == NULL) { 763 freemsgchain(seg_head); 764 return (NULL); 765 } 766 767 seg_tail->b_cont = seg_mp; 768 seg_tail = seg_mp; 769 } 770 771 /* 772 * We've walked enough LSO mblks that we can now satisfy the 773 * remaining seg_len. At this point we need to jump back to 774 * determine if we have arrived at case (A) or (B). 775 */ 776 777 /* Just to be paranoid that we didn't underflow. */ 778 ASSERT3U(seg_len, <, IP_MAXPACKET); 779 ASSERT3U(seg_len, >, 0); 780 goto last_mblk; 781 } 782 783 /* 784 * Perform software segmentation of a single LSO message. Take an LSO 785 * message as input and return head/tail pointers as output. This 786 * function should not be invoked directly but instead through 787 * mac_hw_emul(). 788 * 789 * The resulting chain is comprised of multiple (nsegs) MSS sized 790 * segments. Each segment will consist of two or more mblks joined by 791 * b_cont: a header and one or more data mblks. The header mblk is 792 * allocated anew for each message. The first segment's header is used 793 * as a template for the rest with adjustments made for things such as 794 * ID, sequence, length, TCP flags, etc. The data mblks reference into 795 * the existing LSO mblk (passed in as omp) by way of dupb(). Their 796 * b_rptr/b_wptr values are adjusted to reference only the fraction of 797 * the LSO message they are responsible for. At the successful 798 * completion of this function the original mblk (omp) is freed, 799 * leaving the newely created segment chain as the only remaining 800 * reference to the data. 
801 */ 802 static void 803 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, 804 uint_t *count) 805 { 806 uint32_t ocsum_flags, ocsum_start, ocsum_stuff; 807 uint32_t mss; 808 uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; 809 uint32_t oleft; 810 uint_t nsegs, seg; 811 int len; 812 813 struct ether_vlan_header *oevh; 814 const ipha_t *oiph; 815 const tcph_t *otcph; 816 ipha_t *niph; 817 tcph_t *ntcph; 818 uint16_t ip_id; 819 uint32_t tcp_seq, tcp_sum, otcp_sum; 820 821 uint32_t offset; 822 mblk_t *odatamp; 823 mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; 824 mblk_t *tmptail; 825 826 ASSERT3P(head, !=, NULL); 827 ASSERT3P(tail, !=, NULL); 828 ASSERT3P(count, !=, NULL); 829 ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); 830 831 /* Assume we are dealing with a single LSO message. */ 832 ASSERT3P(omp->b_next, ==, NULL); 833 834 /* 835 * XXX: This is a hack to deal with mac_add_vlan_tag(). 836 * 837 * When VLANs are in play, mac_add_vlan_tag() creates a new 838 * mblk with just the ether_vlan_header and tacks it onto the 839 * front of 'omp'. This breaks the assumptions made below; 840 * namely that the TCP/IP headers are in the first mblk. In 841 * this case, since we already have to pay the cost of LSO 842 * emulation, we simply pull up everything. While this might 843 * seem irksome, keep in mind this will only apply in a couple 844 * of scenarios: a) an LSO-capable VLAN client sending to a 845 * non-LSO-capable client over the "MAC/bridge loopback" 846 * datapath or b) an LSO-capable VLAN client is sending to a 847 * client that, for whatever reason, doesn't have DLS-bypass 848 * enabled. Finally, we have to check for both a tagged and 849 * untagged sized mblk depending on if the mblk came via 850 * mac_promisc_dispatch() or mac_rx_deliver(). 851 * 852 * In the future, two things should be done: 853 * 854 * 1. This function should make use of some yet to be 855 * implemented "mblk helpers". 
These helper functions would 856 * perform all the b_cont walking for us and guarantee safe 857 * access to the mblk data. 858 * 859 * 2. We should add some slop to the mblks so that 860 * mac_add_vlan_tag() can just edit the first mblk instead 861 * of allocating on the hot path. 862 */ 863 if (MBLKL(omp) == sizeof (struct ether_vlan_header) || 864 MBLKL(omp) == sizeof (struct ether_header)) { 865 mblk_t *tmp = msgpullup(omp, -1); 866 867 if (tmp == NULL) { 868 mac_drop_pkt(omp, "failed to pull up"); 869 goto fail; 870 } 871 872 mac_hcksum_clone(omp, tmp); 873 freemsg(omp); 874 omp = tmp; 875 } 876 877 mss = DB_LSOMSS(omp); 878 ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + 879 sizeof (struct ether_vlan_header)); 880 opktlen = msgsize(omp); 881 882 /* 883 * First, get references to the IP and TCP headers and 884 * determine the total TCP length (header + data). 885 * 886 * Thanks to mac_hw_emul() we know that the first mblk must 887 * contain (at minimum) the full L2 header. However, this 888 * function assumes more than that. It assumes the L2/L3/L4 889 * headers are all contained in the first mblk of a message 890 * (i.e., no b_cont walking for headers). While this is a 891 * current reality (our native TCP stack and viona both 892 * enforce this) things may become more nuanced in the future 893 * (e.g. when introducing encap support or adding new 894 * clients). For now we guard against this case by dropping 895 * the packet. 
896 */ 897 oevh = (struct ether_vlan_header *)omp->b_rptr; 898 if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) 899 oehlen = sizeof (struct ether_vlan_header); 900 else 901 oehlen = sizeof (struct ether_header); 902 903 ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); 904 if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { 905 mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); 906 goto fail; 907 } 908 909 oiph = (ipha_t *)(omp->b_rptr + oehlen); 910 oiphlen = IPH_HDR_LENGTH(oiph); 911 otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); 912 otcphlen = TCP_HDR_LENGTH(otcph); 913 914 /* 915 * Currently we only support LSO for TCP/IPv4. 916 */ 917 if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { 918 mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", 919 IPH_HDR_VERSION(oiph)); 920 goto fail; 921 } 922 923 if (oiph->ipha_protocol != IPPROTO_TCP) { 924 mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", 925 oiph->ipha_protocol); 926 goto fail; 927 } 928 929 if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { 930 mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); 931 goto fail; 932 } 933 934 ohdrslen = oehlen + oiphlen + otcphlen; 935 if ((len = MBLKL(omp)) < ohdrslen) { 936 mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, 937 ohdrslen); 938 goto fail; 939 } 940 941 /* 942 * Either we have data in the first mblk or it's just the 943 * header. In either case, we need to set rptr to the start of 944 * the TCP data. 945 */ 946 if (len > ohdrslen) { 947 odatamp = omp; 948 offset = ohdrslen; 949 } else { 950 ASSERT3U(len, ==, ohdrslen); 951 odatamp = omp->b_cont; 952 offset = 0; 953 } 954 955 /* Make sure we still have enough data. 
*/ 956 ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); 957 958 /* 959 * If a MAC negotiated LSO then it must negotioate both 960 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or 961 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers 962 * change during LSO segmentation (only the 3 fields of the 963 * pseudo header checksum don't change: src, dst, proto). Thus 964 * we would expect these flags (HCK_IPV4_HDRCKSUM | 965 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this 966 * function to emulate those checksums in software. However, 967 * that assumes a world where we only expose LSO if the 968 * underlying hardware exposes LSO. Moving forward the plan is 969 * to assume LSO in the upper layers and have MAC perform 970 * software LSO when the underlying provider doesn't support 971 * it. In such a world, if the provider doesn't support LSO 972 * but does support hardware checksum offload, then we could 973 * simply perform the segmentation and allow the hardware to 974 * calculate the checksums. To the hardware it's just another 975 * chain of non-LSO packets. 976 */ 977 ASSERT3S(DB_TYPE(omp), ==, M_DATA); 978 ocsum_flags = DB_CKSUMFLAGS(omp); 979 ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); 980 ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); 981 982 /* 983 * If hardware only provides partial checksum then software 984 * must supply the pseudo-header checksum. In the case of LSO 985 * we leave the TCP length at zero to be filled in by 986 * hardware. This function must handle two scenarios. 987 * 988 * 1. Being called by a MAC client on the Rx path to segment 989 * an LSO packet and calculate the checksum. 990 * 991 * 2. Being called by a MAC provider to segment an LSO packet. 992 * In this case the LSO segmentation is performed in 993 * software (by this routine) but the MAC provider should 994 * still calculate the TCP/IP checksums in hardware. 
995 * 996 * To elaborate on the second case: we cannot have the 997 * scenario where IP sends LSO packets but the underlying HW 998 * doesn't support checksum offload -- because in that case 999 * TCP/IP would calculate the checksum in software (for the 1000 * LSO packet) but then MAC would segment the packet and have 1001 * to redo all the checksum work. So IP should never do LSO 1002 * if HW doesn't support both IP and TCP checksum. 1003 */ 1004 if (ocsum_flags & HCK_PARTIALCKSUM) { 1005 ocsum_start = (uint32_t)DB_CKSUMSTART(omp); 1006 ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); 1007 } 1008 1009 odatalen = opktlen - ohdrslen; 1010 1011 /* 1012 * Subtract one to account for the case where the data length 1013 * is evenly divisble by the MSS. Add one to account for the 1014 * fact that the division will always result in one less 1015 * segment than needed. 1016 */ 1017 nsegs = ((odatalen - 1) / mss) + 1; 1018 if (nsegs < 2) { 1019 mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); 1020 goto fail; 1021 } 1022 1023 DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, 1024 __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, 1025 nsegs); 1026 1027 seg_chain = NULL; 1028 tmptail = seg_chain; 1029 oleft = odatalen; 1030 1031 for (uint_t i = 0; i < nsegs; i++) { 1032 boolean_t last_seg = ((i + 1) == nsegs); 1033 uint32_t seg_len; 1034 1035 /* 1036 * If we fail to allocate, then drop the partially 1037 * allocated chain as well as the LSO packet. Let the 1038 * sender deal with the fallout. 1039 */ 1040 if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { 1041 freemsgchain(seg_chain); 1042 mac_drop_pkt(omp, "failed to alloc segment header"); 1043 goto fail; 1044 } 1045 ASSERT3P(nhdrmp->b_cont, ==, NULL); 1046 1047 if (seg_chain == NULL) { 1048 seg_chain = nhdrmp; 1049 } else { 1050 ASSERT3P(tmptail, !=, NULL); 1051 tmptail->b_next = nhdrmp; 1052 } 1053 1054 tmptail = nhdrmp; 1055 1056 /* 1057 * Calculate this segment's lengh. 
It's either the MSS 1058 * or whatever remains for the last segment. 1059 */ 1060 seg_len = last_seg ? oleft : mss; 1061 ASSERT3U(seg_len, <=, mss); 1062 ndatamp = build_data_seg(&odatamp, &offset, seg_len); 1063 1064 if (ndatamp == NULL) { 1065 freemsgchain(seg_chain); 1066 mac_drop_pkt(omp, "LSO failed to segment data"); 1067 goto fail; 1068 } 1069 1070 /* Attach data mblk to header mblk. */ 1071 nhdrmp->b_cont = ndatamp; 1072 DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; 1073 ASSERT3U(seg_len, <=, oleft); 1074 oleft -= seg_len; 1075 } 1076 1077 /* We should have consumed entire LSO msg. */ 1078 ASSERT3S(oleft, ==, 0); 1079 ASSERT3P(odatamp, ==, NULL); 1080 1081 /* 1082 * All seg data mblks are referenced by the header mblks, null 1083 * out this pointer to catch any bad derefs. 1084 */ 1085 ndatamp = NULL; 1086 1087 /* 1088 * Set headers and checksum for first segment. 1089 */ 1090 nhdrmp = seg_chain; 1091 bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); 1092 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1093 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1094 ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); 1095 niph->ipha_length = htons(oiphlen + otcphlen + mss); 1096 niph->ipha_hdr_checksum = 0; 1097 ip_id = ntohs(niph->ipha_ident); 1098 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1099 tcp_seq = BE32_TO_U32(ntcph->th_seq); 1100 tcp_seq += mss; 1101 1102 /* 1103 * The first segment shouldn't: 1104 * 1105 * o indicate end of data transmission (FIN), 1106 * o indicate immediate handling of the data (PUSH). 1107 */ 1108 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); 1109 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1110 1111 /* 1112 * If the underlying HW provides partial checksum, then make 1113 * sure to correct the pseudo header checksum before calling 1114 * mac_sw_cksum(). The native TCP stack doesn't include the 1115 * length field in the pseudo header when LSO is in play -- so 1116 * we need to calculate it here. 
1117 */ 1118 if (ocsum_flags & HCK_PARTIALCKSUM) { 1119 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1120 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1121 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1122 tcp_sum = BE16_TO_U16(ntcph->th_sum); 1123 otcp_sum = tcp_sum; 1124 tcp_sum += mss + otcphlen; 1125 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); 1126 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1127 } 1128 1129 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1130 (emul & MAC_HWCKSUM_EMULS)) { 1131 next_nhdrmp = nhdrmp->b_next; 1132 nhdrmp->b_next = NULL; 1133 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1134 nhdrmp->b_next = next_nhdrmp; 1135 next_nhdrmp = NULL; 1136 1137 /* 1138 * We may have freed the nhdrmp argument during 1139 * checksum emulation, make sure that seg_chain 1140 * references a valid mblk. 1141 */ 1142 seg_chain = nhdrmp; 1143 } 1144 1145 ASSERT3P(nhdrmp, !=, NULL); 1146 1147 seg = 1; 1148 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1149 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1150 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, 1151 uint_t, seg); 1152 seg++; 1153 1154 /* There better be at least 2 segs. */ 1155 ASSERT3P(nhdrmp->b_next, !=, NULL); 1156 prev_nhdrmp = nhdrmp; 1157 nhdrmp = nhdrmp->b_next; 1158 1159 /* 1160 * Now adjust the headers of the middle segments. For each 1161 * header we need to adjust the following. 1162 * 1163 * o IP ID 1164 * o IP length 1165 * o TCP sequence 1166 * o TCP flags 1167 * o cksum flags 1168 * o cksum values (if MAC_HWCKSUM_EMUL is set) 1169 */ 1170 for (; seg < nsegs; seg++) { 1171 /* 1172 * We use seg_chain as a reference to the first seg 1173 * header mblk -- this first header is a template for 1174 * the rest of the segments. This copy will include 1175 * the now updated checksum values from the first 1176 * header. We must reset these checksum values to 1177 * their original to make sure we produce the correct 1178 * value. 
1179 */ 1180 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); 1181 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1182 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1183 niph->ipha_ident = htons(++ip_id); 1184 ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); 1185 niph->ipha_length = htons(oiphlen + otcphlen + mss); 1186 niph->ipha_hdr_checksum = 0; 1187 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1188 U32_TO_BE32(tcp_seq, ntcph->th_seq); 1189 tcp_seq += mss; 1190 /* 1191 * Just like the first segment, the middle segments 1192 * shouldn't have these flags set. 1193 */ 1194 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); 1195 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1196 1197 if (ocsum_flags & HCK_PARTIALCKSUM) { 1198 /* 1199 * First and middle segs have same 1200 * pseudo-header checksum. 1201 */ 1202 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1203 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1204 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1205 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1206 } 1207 1208 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1209 (emul & MAC_HWCKSUM_EMULS)) { 1210 next_nhdrmp = nhdrmp->b_next; 1211 nhdrmp->b_next = NULL; 1212 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1213 nhdrmp->b_next = next_nhdrmp; 1214 next_nhdrmp = NULL; 1215 /* We may have freed the original nhdrmp. */ 1216 prev_nhdrmp->b_next = nhdrmp; 1217 } 1218 1219 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1220 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1221 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), 1222 uint_t, mss, uint_t, seg); 1223 1224 ASSERT3P(nhdrmp->b_next, !=, NULL); 1225 prev_nhdrmp = nhdrmp; 1226 nhdrmp = nhdrmp->b_next; 1227 } 1228 1229 /* Make sure we are on the last segment. */ 1230 ASSERT3U(seg, ==, nsegs); 1231 ASSERT3P(nhdrmp->b_next, ==, NULL); 1232 1233 /* 1234 * Now we set the last segment header. The difference being 1235 * that FIN/PSH/RST flags are allowed. 
1236 */ 1237 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); 1238 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1239 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1240 niph->ipha_ident = htons(++ip_id); 1241 len = msgsize(nhdrmp->b_cont); 1242 ASSERT3S(len, >, 0); 1243 niph->ipha_length = htons(oiphlen + otcphlen + len); 1244 niph->ipha_hdr_checksum = 0; 1245 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1246 U32_TO_BE32(tcp_seq, ntcph->th_seq); 1247 1248 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1249 if (ocsum_flags & HCK_PARTIALCKSUM) { 1250 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1251 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1252 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1253 tcp_sum = otcp_sum; 1254 tcp_sum += len + otcphlen; 1255 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); 1256 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1257 } 1258 1259 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1260 (emul & MAC_HWCKSUM_EMULS)) { 1261 /* This should be the last mblk. */ 1262 ASSERT3P(nhdrmp->b_next, ==, NULL); 1263 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1264 prev_nhdrmp->b_next = nhdrmp; 1265 } 1266 1267 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1268 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1269 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, 1270 uint_t, seg); 1271 1272 /* 1273 * Free the reference to the original LSO message as it is 1274 * being replaced by seg_cahin. 1275 */ 1276 freemsg(omp); 1277 *head = seg_chain; 1278 *tail = nhdrmp; 1279 *count = nsegs; 1280 return; 1281 1282 fail: 1283 *head = NULL; 1284 *tail = NULL; 1285 *count = 0; 1286 } 1287 1288 #define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) 1289 1290 /* 1291 * Emulate various hardware offload features in software. Take a chain 1292 * of packets as input and emulate the hardware features specified in 1293 * 'emul'. 
The resulting chain's head pointer replaces the 'mp_chain' 1294 * pointer given as input, and its tail pointer is written to 1295 * '*otail'. The number of packets in the new chain is written to 1296 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus 1297 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in 1298 * which case 'mp_chain' will simply stay a NULL chain. 1299 * 1300 * While unlikely, it is technically possible that this function could 1301 * receive a non-NULL chain as input and return a NULL chain as output 1302 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be 1303 * zero). This could happen if all the packets in the chain are 1304 * dropped or if we fail to allocate new mblks. In this case, there is 1305 * nothing for the caller to free. In any event, the caller shouldn't 1306 * assume that '*mp_chain' is non-NULL on return. 1307 * 1308 * This function was written with three main use cases in mind. 1309 * 1310 * 1. To emulate hardware offloads when traveling mac-loopback (two 1311 * clients on the same mac). This is wired up in mac_tx_send(). 1312 * 1313 * 2. To provide hardware offloads to the client when the underlying 1314 * provider cannot. This is currently wired up in mac_tx() but we 1315 * still only negotiate offloads when the underlying provider 1316 * supports them. 1317 * 1318 * 3. To emulate real hardware in simnet. 1319 */ 1320 void 1321 mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) 1322 { 1323 mblk_t *head = NULL, *tail = NULL; 1324 uint_t count = 0; 1325 1326 ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); 1327 ASSERT3P(mp_chain, !=, NULL); 1328 1329 for (mblk_t *mp = *mp_chain; mp != NULL; ) { 1330 mblk_t *tmp, *next, *tmphead, *tmptail; 1331 struct ether_header *ehp; 1332 uint32_t flags; 1333 uint_t len = MBLKL(mp), l2len; 1334 1335 /* Perform LSO/cksum one message at a time. 
*/ 1336 next = mp->b_next; 1337 mp->b_next = NULL; 1338 1339 /* 1340 * For our sanity the first mblk should contain at 1341 * least the full L2 header. 1342 */ 1343 if (len < sizeof (struct ether_header)) { 1344 mac_drop_pkt(mp, "packet too short (A): %u", len); 1345 mp = next; 1346 continue; 1347 } 1348 1349 ehp = (struct ether_header *)mp->b_rptr; 1350 if (ntohs(ehp->ether_type) == VLAN_TPID) 1351 l2len = sizeof (struct ether_vlan_header); 1352 else 1353 l2len = sizeof (struct ether_header); 1354 1355 /* 1356 * If the first mblk is solely the L2 header, then 1357 * there better be more data. 1358 */ 1359 if (len < l2len || (len == l2len && mp->b_cont == NULL)) { 1360 mac_drop_pkt(mp, "packet too short (C): %u", len); 1361 mp = next; 1362 continue; 1363 } 1364 1365 DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); 1366 1367 /* 1368 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) 1369 * because we don't want to mask-out the LSO flag. 1370 */ 1371 flags = DB_CKSUMFLAGS(mp); 1372 1373 if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { 1374 uint_t tmpcount = 0; 1375 1376 /* 1377 * LSO fix-up handles checksum emulation 1378 * inline (if requested). It also frees mp. 1379 */ 1380 mac_sw_lso(mp, emul, &tmphead, &tmptail, 1381 &tmpcount); 1382 if (tmphead == NULL) { 1383 /* mac_sw_lso() freed the mp. */ 1384 mp = next; 1385 continue; 1386 } 1387 count += tmpcount; 1388 } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { 1389 tmp = mac_sw_cksum(mp, emul); 1390 if (tmp == NULL) { 1391 /* mac_sw_cksum() freed the mp. */ 1392 mp = next; 1393 continue; 1394 } 1395 tmphead = tmp; 1396 tmptail = tmp; 1397 count++; 1398 } else { 1399 /* There is nothing to emulate. */ 1400 tmp = mp; 1401 tmphead = tmp; 1402 tmptail = tmp; 1403 count++; 1404 } 1405 1406 /* 1407 * The tmp mblk chain is either the start of the new 1408 * chain or added to the tail of the new chain. 
1409 */ 1410 if (head == NULL) { 1411 head = tmphead; 1412 tail = tmptail; 1413 } else { 1414 /* Attach the new mblk to the end of the new chain. */ 1415 tail->b_next = tmphead; 1416 tail = tmptail; 1417 } 1418 1419 mp = next; 1420 } 1421 1422 *mp_chain = head; 1423 1424 if (otail != NULL) 1425 *otail = tail; 1426 1427 if (ocount != NULL) 1428 *ocount = count; 1429 } 1430 1431 /* 1432 * Add VLAN tag to the specified mblk. 1433 */ 1434 mblk_t * 1435 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) 1436 { 1437 mblk_t *hmp; 1438 struct ether_vlan_header *evhp; 1439 struct ether_header *ehp; 1440 1441 ASSERT(pri != 0 || vid != 0); 1442 1443 /* 1444 * Allocate an mblk for the new tagged ethernet header, 1445 * and copy the MAC addresses and ethertype from the 1446 * original header. 1447 */ 1448 1449 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); 1450 if (hmp == NULL) { 1451 freemsg(mp); 1452 return (NULL); 1453 } 1454 1455 evhp = (struct ether_vlan_header *)hmp->b_rptr; 1456 ehp = (struct ether_header *)mp->b_rptr; 1457 1458 bcopy(ehp, evhp, (ETHERADDRL * 2)); 1459 evhp->ether_type = ehp->ether_type; 1460 evhp->ether_tpid = htons(ETHERTYPE_VLAN); 1461 1462 hmp->b_wptr += sizeof (struct ether_vlan_header); 1463 mp->b_rptr += sizeof (struct ether_header); 1464 1465 /* 1466 * Free the original message if it's now empty. Link the 1467 * rest of messages to the header message. 1468 */ 1469 mac_hcksum_clone(mp, hmp); 1470 if (MBLKL(mp) == 0) { 1471 hmp->b_cont = mp->b_cont; 1472 freeb(mp); 1473 } else { 1474 hmp->b_cont = mp; 1475 } 1476 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); 1477 1478 /* 1479 * Initialize the new TCI (Tag Control Information). 1480 */ 1481 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); 1482 1483 return (hmp); 1484 } 1485 1486 /* 1487 * Adds a VLAN tag with the specified VID and priority to each mblk of 1488 * the specified chain. 
1489 */ 1490 mblk_t * 1491 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) 1492 { 1493 mblk_t *next_mp, **prev, *mp; 1494 1495 mp = mp_chain; 1496 prev = &mp_chain; 1497 1498 while (mp != NULL) { 1499 next_mp = mp->b_next; 1500 mp->b_next = NULL; 1501 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { 1502 freemsgchain(next_mp); 1503 break; 1504 } 1505 *prev = mp; 1506 prev = &mp->b_next; 1507 mp = mp->b_next = next_mp; 1508 } 1509 1510 return (mp_chain); 1511 } 1512 1513 /* 1514 * Strip VLAN tag 1515 */ 1516 mblk_t * 1517 mac_strip_vlan_tag(mblk_t *mp) 1518 { 1519 mblk_t *newmp; 1520 struct ether_vlan_header *evhp; 1521 1522 evhp = (struct ether_vlan_header *)mp->b_rptr; 1523 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { 1524 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); 1525 1526 if (DB_REF(mp) > 1) { 1527 newmp = copymsg(mp); 1528 if (newmp == NULL) 1529 return (NULL); 1530 freemsg(mp); 1531 mp = newmp; 1532 } 1533 1534 evhp = (struct ether_vlan_header *)mp->b_rptr; 1535 1536 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); 1537 mp->b_rptr += VLAN_TAGSZ; 1538 } 1539 return (mp); 1540 } 1541 1542 /* 1543 * Strip VLAN tag from each mblk of the chain. 1544 */ 1545 mblk_t * 1546 mac_strip_vlan_tag_chain(mblk_t *mp_chain) 1547 { 1548 mblk_t *mp, *next_mp, **prev; 1549 1550 mp = mp_chain; 1551 prev = &mp_chain; 1552 1553 while (mp != NULL) { 1554 next_mp = mp->b_next; 1555 mp->b_next = NULL; 1556 if ((mp = mac_strip_vlan_tag(mp)) == NULL) { 1557 freemsgchain(next_mp); 1558 break; 1559 } 1560 *prev = mp; 1561 prev = &mp->b_next; 1562 mp = mp->b_next = next_mp; 1563 } 1564 1565 return (mp_chain); 1566 } 1567 1568 /* 1569 * Default callback function. Used when the datapath is not yet initialized. 
1570 */ 1571 /* ARGSUSED */ 1572 void 1573 mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain, 1574 boolean_t loopback) 1575 { 1576 freemsgchain(mp_chain); 1577 } 1578 1579 /* 1580 * Determines the IPv6 header length accounting for all the optional IPv6 1581 * headers (hop-by-hop, destination, routing and fragment). The header length 1582 * and next header value (a transport header) is captured. 1583 * 1584 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise 1585 * returns B_TRUE. 1586 */ 1587 boolean_t 1588 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length, 1589 uint8_t *next_hdr, ip6_frag_t **fragp) 1590 { 1591 uint16_t length; 1592 uint_t ehdrlen; 1593 uint8_t *whereptr; 1594 uint8_t *nexthdrp; 1595 ip6_dest_t *desthdr; 1596 ip6_rthdr_t *rthdr; 1597 ip6_frag_t *fraghdr; 1598 1599 if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) 1600 return (B_FALSE); 1601 ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); 1602 length = IPV6_HDR_LEN; 1603 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ 1604 1605 if (fragp != NULL) 1606 *fragp = NULL; 1607 1608 nexthdrp = &ip6h->ip6_nxt; 1609 while (whereptr < endptr) { 1610 /* Is there enough left for len + nexthdr? 
*/ 1611 if (whereptr + MIN_EHDR_LEN > endptr) 1612 break; 1613 1614 switch (*nexthdrp) { 1615 case IPPROTO_HOPOPTS: 1616 case IPPROTO_DSTOPTS: 1617 /* Assumes the headers are identical for hbh and dst */ 1618 desthdr = (ip6_dest_t *)whereptr; 1619 ehdrlen = 8 * (desthdr->ip6d_len + 1); 1620 if ((uchar_t *)desthdr + ehdrlen > endptr) 1621 return (B_FALSE); 1622 nexthdrp = &desthdr->ip6d_nxt; 1623 break; 1624 case IPPROTO_ROUTING: 1625 rthdr = (ip6_rthdr_t *)whereptr; 1626 ehdrlen = 8 * (rthdr->ip6r_len + 1); 1627 if ((uchar_t *)rthdr + ehdrlen > endptr) 1628 return (B_FALSE); 1629 nexthdrp = &rthdr->ip6r_nxt; 1630 break; 1631 case IPPROTO_FRAGMENT: 1632 fraghdr = (ip6_frag_t *)whereptr; 1633 ehdrlen = sizeof (ip6_frag_t); 1634 if ((uchar_t *)&fraghdr[1] > endptr) 1635 return (B_FALSE); 1636 nexthdrp = &fraghdr->ip6f_nxt; 1637 if (fragp != NULL) 1638 *fragp = fraghdr; 1639 break; 1640 case IPPROTO_NONE: 1641 /* No next header means we're finished */ 1642 default: 1643 *hdr_length = length; 1644 *next_hdr = *nexthdrp; 1645 return (B_TRUE); 1646 } 1647 length += ehdrlen; 1648 whereptr += ehdrlen; 1649 *hdr_length = length; 1650 *next_hdr = *nexthdrp; 1651 } 1652 switch (*nexthdrp) { 1653 case IPPROTO_HOPOPTS: 1654 case IPPROTO_DSTOPTS: 1655 case IPPROTO_ROUTING: 1656 case IPPROTO_FRAGMENT: 1657 /* 1658 * If any know extension headers are still to be processed, 1659 * the packet's malformed (or at least all the IP header(s) are 1660 * not in the same mblk - and that should never happen. 1661 */ 1662 return (B_FALSE); 1663 1664 default: 1665 /* 1666 * If we get here, we know that all of the IP headers were in 1667 * the same mblk, even if the ULP header is in the next mblk. 1668 */ 1669 *hdr_length = length; 1670 *next_hdr = *nexthdrp; 1671 return (B_TRUE); 1672 } 1673 } 1674 1675 /* 1676 * The following set of routines are there to take care of interrupt 1677 * re-targeting for legacy (fixed) interrupts. 
Some older versions 1678 * of the popular NICs like e1000g do not support MSI-X interrupts 1679 * and they reserve fixed interrupts for RX/TX rings. To re-target 1680 * these interrupts, PCITOOL ioctls need to be used. 1681 */ 1682 typedef struct mac_dladm_intr { 1683 int ino; 1684 int cpu_id; 1685 char driver_path[MAXPATHLEN]; 1686 char nexus_path[MAXPATHLEN]; 1687 } mac_dladm_intr_t; 1688 1689 /* Bind the interrupt to cpu_num */ 1690 static int 1691 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino) 1692 { 1693 pcitool_intr_set_t iset; 1694 int err; 1695 1696 iset.old_cpu = oldcpuid; 1697 iset.ino = ino; 1698 iset.cpu_id = cpu_num; 1699 iset.user_version = PCITOOL_VERSION; 1700 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL, 1701 kcred, NULL); 1702 1703 return (err); 1704 } 1705 1706 /* 1707 * Search interrupt information. iget is filled in with the info to search 1708 */ 1709 static boolean_t 1710 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) 1711 { 1712 int i; 1713 char driver_path[2 * MAXPATHLEN]; 1714 1715 for (i = 0; i < iget_p->num_devs; i++) { 1716 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); 1717 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, 1718 ":%s%d", iget_p->dev[i].driver_name, 1719 iget_p->dev[i].dev_inst); 1720 /* Match the device path for the device path */ 1721 if (strcmp(driver_path, dln->driver_path) == 0) { 1722 dln->ino = iget_p->ino; 1723 dln->cpu_id = iget_p->cpu_id; 1724 return (B_TRUE); 1725 } 1726 } 1727 return (B_FALSE); 1728 } 1729 1730 /* 1731 * Get information about ino, i.e. if this is the interrupt for our 1732 * device and where it is bound etc. 
1733 */ 1734 static boolean_t 1735 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino, 1736 mac_dladm_intr_t *dln) 1737 { 1738 pcitool_intr_get_t *iget_p; 1739 int ipsz; 1740 int nipsz; 1741 int err; 1742 uint8_t inum; 1743 1744 /* 1745 * Check if SLEEP is OK, i.e if could come here in response to 1746 * changing the fanout due to some callback from the driver, say 1747 * link speed changes. 1748 */ 1749 ipsz = PCITOOL_IGET_SIZE(0); 1750 iget_p = kmem_zalloc(ipsz, KM_SLEEP); 1751 1752 iget_p->num_devs_ret = 0; 1753 iget_p->user_version = PCITOOL_VERSION; 1754 iget_p->cpu_id = oldcpuid; 1755 iget_p->ino = ino; 1756 1757 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, 1758 FKIOCTL, kcred, NULL); 1759 if (err != 0) { 1760 kmem_free(iget_p, ipsz); 1761 return (B_FALSE); 1762 } 1763 if (iget_p->num_devs == 0) { 1764 kmem_free(iget_p, ipsz); 1765 return (B_FALSE); 1766 } 1767 inum = iget_p->num_devs; 1768 if (iget_p->num_devs_ret < iget_p->num_devs) { 1769 /* Reallocate */ 1770 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs); 1771 1772 kmem_free(iget_p, ipsz); 1773 ipsz = nipsz; 1774 iget_p = kmem_zalloc(ipsz, KM_SLEEP); 1775 1776 iget_p->num_devs_ret = inum; 1777 iget_p->cpu_id = oldcpuid; 1778 iget_p->ino = ino; 1779 iget_p->user_version = PCITOOL_VERSION; 1780 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, 1781 FKIOCTL, kcred, NULL); 1782 if (err != 0) { 1783 kmem_free(iget_p, ipsz); 1784 return (B_FALSE); 1785 } 1786 /* defensive */ 1787 if (iget_p->num_devs != iget_p->num_devs_ret) { 1788 kmem_free(iget_p, ipsz); 1789 return (B_FALSE); 1790 } 1791 } 1792 1793 if (mac_search_intrinfo(iget_p, dln)) { 1794 kmem_free(iget_p, ipsz); 1795 return (B_TRUE); 1796 } 1797 kmem_free(iget_p, ipsz); 1798 return (B_FALSE); 1799 } 1800 1801 /* 1802 * Get the interrupts and check each one to see if it is for our device. 
1803 */ 1804 static int 1805 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) 1806 { 1807 pcitool_intr_info_t intr_info; 1808 int err; 1809 int ino; 1810 int oldcpuid; 1811 1812 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, 1813 FKIOCTL, kcred, NULL); 1814 if (err != 0) 1815 return (-1); 1816 1817 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) { 1818 for (ino = 0; ino < intr_info.num_intr; ino++) { 1819 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) { 1820 if (dln->cpu_id == cpuid) 1821 return (0); 1822 return (1); 1823 } 1824 } 1825 } 1826 return (-1); 1827 } 1828 1829 /* 1830 * Obtain the nexus parent node info. for mdip. 1831 */ 1832 static dev_info_t * 1833 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln) 1834 { 1835 struct dev_info *tdip = (struct dev_info *)mdip; 1836 struct ddi_minor_data *minordata; 1837 int circ; 1838 dev_info_t *pdip; 1839 char pathname[MAXPATHLEN]; 1840 1841 while (tdip != NULL) { 1842 /* 1843 * The netboot code could call this function while walking the 1844 * device tree so we need to use ndi_devi_tryenter() here to 1845 * avoid deadlock. 
1846 */ 1847 if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0) 1848 break; 1849 1850 for (minordata = tdip->devi_minor; minordata != NULL; 1851 minordata = minordata->next) { 1852 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL, 1853 strlen(DDI_NT_INTRCTL)) == 0) { 1854 pdip = minordata->dip; 1855 (void) ddi_pathname(pdip, pathname); 1856 (void) snprintf(dln->nexus_path, MAXPATHLEN, 1857 "/devices%s:intr", pathname); 1858 (void) ddi_pathname_minor(minordata, pathname); 1859 ndi_devi_exit((dev_info_t *)tdip, circ); 1860 return (pdip); 1861 } 1862 } 1863 ndi_devi_exit((dev_info_t *)tdip, circ); 1864 tdip = tdip->devi_parent; 1865 } 1866 return (NULL); 1867 } 1868 1869 /* 1870 * For a primary MAC client, if the user has set a list or CPUs or 1871 * we have obtained it implicitly, we try to retarget the interrupt 1872 * for that device on one of the CPUs in the list. 1873 * We assign the interrupt to the same CPU as the poll thread. 1874 */ 1875 static boolean_t 1876 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) 1877 { 1878 ldi_handle_t lh = NULL; 1879 ldi_ident_t li = NULL; 1880 int err; 1881 int ret; 1882 mac_dladm_intr_t dln; 1883 dev_info_t *dip; 1884 struct ddi_minor_data *minordata; 1885 1886 dln.nexus_path[0] = '\0'; 1887 dln.driver_path[0] = '\0'; 1888 1889 minordata = ((struct dev_info *)mdip)->devi_minor; 1890 while (minordata != NULL) { 1891 if (minordata->type == DDM_MINOR) 1892 break; 1893 minordata = minordata->next; 1894 } 1895 if (minordata == NULL) 1896 return (B_FALSE); 1897 1898 (void) ddi_pathname_minor(minordata, dln.driver_path); 1899 1900 dip = mac_get_nexus_node(mdip, &dln); 1901 /* defensive */ 1902 if (dip == NULL) 1903 return (B_FALSE); 1904 1905 err = ldi_ident_from_major(ddi_driver_major(dip), &li); 1906 if (err != 0) 1907 return (B_FALSE); 1908 1909 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); 1910 if (err != 0) 1911 return (B_FALSE); 1912 1913 ret = mac_validate_intr(lh, &dln, cpuid); 1914 
if (ret < 0) { 1915 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1916 return (B_FALSE); 1917 } 1918 /* cmn_note? */ 1919 if (ret != 0) 1920 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino))) 1921 != 0) { 1922 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1923 return (B_FALSE); 1924 } 1925 (void) ldi_close(lh, FREAD|FWRITE, kcred); 1926 return (B_TRUE); 1927 } 1928 1929 void 1930 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) 1931 { 1932 dev_info_t *mdip = (dev_info_t *)arg; 1933 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1934 mac_resource_props_t *mrp; 1935 mac_perim_handle_t mph; 1936 flow_entry_t *flent = mcip->mci_flent; 1937 mac_soft_ring_set_t *rx_srs; 1938 mac_cpus_t *srs_cpu; 1939 1940 if (!mac_check_interrupt_binding(mdip, cpuid)) 1941 cpuid = -1; 1942 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); 1943 mrp = MCIP_RESOURCE_PROPS(mcip); 1944 mrp->mrp_rx_intr_cpu = cpuid; 1945 if (flent != NULL && flent->fe_rx_srs_cnt == 2) { 1946 rx_srs = flent->fe_rx_srs[1]; 1947 srs_cpu = &rx_srs->srs_cpu; 1948 srs_cpu->mc_rx_intr_cpu = cpuid; 1949 } 1950 mac_perim_exit(mph); 1951 } 1952 1953 int32_t 1954 mac_client_intr_cpu(mac_client_handle_t mch) 1955 { 1956 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1957 mac_cpus_t *srs_cpu; 1958 mac_soft_ring_set_t *rx_srs; 1959 flow_entry_t *flent = mcip->mci_flent; 1960 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); 1961 mac_ring_t *ring; 1962 mac_intr_t *mintr; 1963 1964 /* 1965 * Check if we need to retarget the interrupt. We do this only 1966 * for the primary MAC client. We do this if we have the only 1967 * exclusive ring in the group. 1968 */ 1969 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { 1970 rx_srs = flent->fe_rx_srs[1]; 1971 srs_cpu = &rx_srs->srs_cpu; 1972 ring = rx_srs->srs_ring; 1973 mintr = &ring->mr_info.mri_intr; 1974 /* 1975 * If ddi_handle is present or the poll CPU is 1976 * already bound to the interrupt CPU, return -1. 
1977 */ 1978 if (mintr->mi_ddi_handle != NULL || 1979 ((mrp->mrp_ncpus != 0) && 1980 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) { 1981 return (-1); 1982 } 1983 return (srs_cpu->mc_rx_pollid); 1984 } 1985 return (-1); 1986 } 1987 1988 void * 1989 mac_get_devinfo(mac_handle_t mh) 1990 { 1991 mac_impl_t *mip = (mac_impl_t *)mh; 1992 1993 return ((void *)mip->mi_dip); 1994 } 1995 1996 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1]) 1997 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) 1998 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) 1999 2000 uint64_t 2001 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound) 2002 { 2003 struct ether_header *ehp; 2004 uint64_t hash = 0; 2005 uint16_t sap; 2006 uint_t skip_len; 2007 uint8_t proto; 2008 boolean_t ip_fragmented; 2009 2010 /* 2011 * We may want to have one of these per MAC type plugin in the 2012 * future. For now supports only ethernet. 2013 */ 2014 if (media != DL_ETHER) 2015 return (0L); 2016 2017 /* for now we support only outbound packets */ 2018 ASSERT(is_outbound); 2019 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); 2020 ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); 2021 2022 /* compute L2 hash */ 2023 2024 ehp = (struct ether_header *)mp->b_rptr; 2025 2026 if ((policy & MAC_PKT_HASH_L2) != 0) { 2027 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; 2028 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; 2029 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); 2030 policy &= ~MAC_PKT_HASH_L2; 2031 } 2032 2033 if (policy == 0) 2034 goto done; 2035 2036 /* skip ethernet header */ 2037 2038 sap = ntohs(ehp->ether_type); 2039 if (sap == ETHERTYPE_VLAN) { 2040 struct ether_vlan_header *evhp; 2041 mblk_t *newmp = NULL; 2042 2043 skip_len = sizeof (struct ether_vlan_header); 2044 if (MBLKL(mp) < skip_len) { 2045 /* the vlan tag is the payload, pull up first */ 2046 newmp = msgpullup(mp, -1); 2047 if ((newmp == NULL) || 
(MBLKL(newmp) < skip_len)) { 2048 goto done; 2049 } 2050 evhp = (struct ether_vlan_header *)newmp->b_rptr; 2051 } else { 2052 evhp = (struct ether_vlan_header *)mp->b_rptr; 2053 } 2054 2055 sap = ntohs(evhp->ether_type); 2056 freemsg(newmp); 2057 } else { 2058 skip_len = sizeof (struct ether_header); 2059 } 2060 2061 /* if ethernet header is in its own mblk, skip it */ 2062 if (MBLKL(mp) <= skip_len) { 2063 skip_len -= MBLKL(mp); 2064 mp = mp->b_cont; 2065 if (mp == NULL) 2066 goto done; 2067 } 2068 2069 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; 2070 2071 /* compute IP src/dst addresses hash and skip IPv{4,6} header */ 2072 2073 switch (sap) { 2074 case ETHERTYPE_IP: { 2075 ipha_t *iphp; 2076 2077 /* 2078 * If the header is not aligned or the header doesn't fit 2079 * in the mblk, bail now. Note that this may cause packets 2080 * reordering. 2081 */ 2082 iphp = (ipha_t *)(mp->b_rptr + skip_len); 2083 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || 2084 !OK_32PTR((char *)iphp)) 2085 goto done; 2086 2087 proto = iphp->ipha_protocol; 2088 skip_len += IPH_HDR_LENGTH(iphp); 2089 2090 /* Check if the packet is fragmented. */ 2091 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) & 2092 IPH_OFFSET; 2093 2094 /* 2095 * For fragmented packets, use addresses in addition to 2096 * the frag_id to generate the hash inorder to get 2097 * better distribution. 
2098 */ 2099 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) { 2100 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); 2101 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); 2102 2103 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 2104 PKT_HASH_4BYTES(ip_dst)); 2105 policy &= ~MAC_PKT_HASH_L3; 2106 } 2107 2108 if (ip_fragmented) { 2109 uint8_t *identp = (uint8_t *)&iphp->ipha_ident; 2110 hash ^= PKT_HASH_2BYTES(identp); 2111 goto done; 2112 } 2113 break; 2114 } 2115 case ETHERTYPE_IPV6: { 2116 ip6_t *ip6hp; 2117 ip6_frag_t *frag = NULL; 2118 uint16_t hdr_length; 2119 2120 /* 2121 * If the header is not aligned or the header doesn't fit 2122 * in the mblk, bail now. Note that this may cause packets 2123 * reordering. 2124 */ 2125 2126 ip6hp = (ip6_t *)(mp->b_rptr + skip_len); 2127 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || 2128 !OK_32PTR((char *)ip6hp)) 2129 goto done; 2130 2131 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length, 2132 &proto, &frag)) 2133 goto done; 2134 skip_len += hdr_length; 2135 2136 /* 2137 * For fragmented packets, use addresses in addition to 2138 * the frag_id to generate the hash inorder to get 2139 * better distribution. 
2140 */ 2141 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) { 2142 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); 2143 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); 2144 2145 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 2146 PKT_HASH_4BYTES(ip_dst)); 2147 policy &= ~MAC_PKT_HASH_L3; 2148 } 2149 2150 if (frag != NULL) { 2151 uint8_t *identp = (uint8_t *)&frag->ip6f_ident; 2152 hash ^= PKT_HASH_4BYTES(identp); 2153 goto done; 2154 } 2155 break; 2156 } 2157 default: 2158 goto done; 2159 } 2160 2161 if (policy == 0) 2162 goto done; 2163 2164 /* if ip header is in its own mblk, skip it */ 2165 if (MBLKL(mp) <= skip_len) { 2166 skip_len -= MBLKL(mp); 2167 mp = mp->b_cont; 2168 if (mp == NULL) 2169 goto done; 2170 } 2171 2172 /* parse ULP header */ 2173 again: 2174 switch (proto) { 2175 case IPPROTO_TCP: 2176 case IPPROTO_UDP: 2177 case IPPROTO_ESP: 2178 case IPPROTO_SCTP: 2179 /* 2180 * These Internet Protocols are intentionally designed 2181 * for hashing from the git-go. Port numbers are in the first 2182 * word for transports, SPI is first for ESP. 2183 */ 2184 if (mp->b_rptr + skip_len + 4 > mp->b_wptr) 2185 goto done; 2186 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); 2187 break; 2188 2189 case IPPROTO_AH: { 2190 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); 2191 uint_t ah_length = AH_TOTAL_LEN(ah); 2192 2193 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) 2194 goto done; 2195 2196 proto = ah->ah_nexthdr; 2197 skip_len += ah_length; 2198 2199 /* if AH header is in its own mblk, skip it */ 2200 if (MBLKL(mp) <= skip_len) { 2201 skip_len -= MBLKL(mp); 2202 mp = mp->b_cont; 2203 if (mp == NULL) 2204 goto done; 2205 } 2206 2207 goto again; 2208 } 2209 } 2210 2211 done: 2212 return (hash); 2213 } 2214