1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2019 Joyent, Inc. 24 */ 25 26 /* 27 * MAC Services Module - misc utilities 28 */ 29 30 #include <sys/types.h> 31 #include <sys/mac.h> 32 #include <sys/mac_impl.h> 33 #include <sys/mac_client_priv.h> 34 #include <sys/mac_client_impl.h> 35 #include <sys/mac_soft_ring.h> 36 #include <sys/strsubr.h> 37 #include <sys/strsun.h> 38 #include <sys/vlan.h> 39 #include <sys/pattr.h> 40 #include <sys/pci_tools.h> 41 #include <inet/ip.h> 42 #include <inet/ip_impl.h> 43 #include <inet/ip6.h> 44 #include <sys/vtrace.h> 45 #include <sys/dlpi.h> 46 #include <sys/sunndi.h> 47 #include <inet/ipsec_impl.h> 48 #include <inet/sadb.h> 49 #include <inet/ipsecesp.h> 50 #include <inet/ipsecah.h> 51 #include <inet/tcp.h> 52 #include <inet/udp_impl.h> 53 #include <inet/sctp_ip.h> 54 55 /* 56 * The next two functions are used for dropping packets or chains of 57 * packets, respectively. We could use one function for both but 58 * separating the use cases allows us to specify intent and prevent 59 * dropping more data than intended. 
60 * 61 * The purpose of these functions is to aid the debugging effort, 62 * especially in production. Rather than use freemsg()/freemsgchain(), 63 * it's preferable to use these functions when dropping a packet in 64 * the MAC layer. These functions should only be used during 65 * unexpected conditions. That is, any time a packet is dropped 66 * outside of the regular, successful datapath. Consolidating all 67 * drops on these functions allows the user to trace one location and 68 * determine why the packet was dropped based on the msg. It also 69 * allows the user to inspect the packet before it is freed. Finally, 70 * it allows the user to avoid tracing freemsg()/freemsgchain() thus 71 * keeping the hot path running as efficiently as possible. 72 * 73 * NOTE: At this time not all MAC drops are aggregated on these 74 * functions; but that is the plan. This comment should be erased once 75 * completed. 76 */ 77 78 /*PRINTFLIKE2*/ 79 void 80 mac_drop_pkt(mblk_t *mp, const char *fmt, ...) 81 { 82 va_list adx; 83 char msg[128]; 84 char *msgp = msg; 85 86 ASSERT3P(mp->b_next, ==, NULL); 87 88 va_start(adx, fmt); 89 (void) vsnprintf(msgp, sizeof (msg), fmt, adx); 90 va_end(adx); 91 92 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); 93 freemsg(mp); 94 } 95 96 /*PRINTFLIKE2*/ 97 void 98 mac_drop_chain(mblk_t *chain, const char *fmt, ...) 99 { 100 va_list adx; 101 char msg[128]; 102 char *msgp = msg; 103 104 va_start(adx, fmt); 105 (void) vsnprintf(msgp, sizeof (msg), fmt, adx); 106 va_end(adx); 107 108 /* 109 * We could use freemsgchain() for the actual freeing but 110 * since we are already walking the chain to fire the dtrace 111 * probe we might as well free the msg here too. 112 */ 113 for (mblk_t *mp = chain, *next; mp != NULL; ) { 114 next = mp->b_next; 115 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); 116 freemsg(mp); 117 mp = next; 118 } 119 } 120 121 /* 122 * Copy an mblk, preserving its hardware checksum flags. 
 */
static mblk_t *
mac_copymsg_cksum(mblk_t *mp)
{
	mblk_t *mp1;

	mp1 = copymsg(mp);
	if (mp1 == NULL)
		return (NULL);

	/* Propagate the original mblk's checksum offload state. */
	mac_hcksum_clone(mp, mp1);

	return (mp1);
}

/*
 * Copy an mblk chain, preserving the hardware checksum flags of the
 * individual mblks. Returns the new chain, or NULL (freeing any
 * partial copy) if any single copy fails.
 */
mblk_t *
mac_copymsgchain_cksum(mblk_t *mp)
{
	mblk_t *nmp = NULL;
	mblk_t **nmpp = &nmp;

	for (; mp != NULL; mp = mp->b_next) {
		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
			freemsgchain(nmp);
			return (NULL);
		}

		nmpp = &((*nmpp)->b_next);
	}

	return (nmp);
}

/*
 * Calculate the ULP checksum for IPv4. Return true if the calculation
 * was successful, or false if an error occurred. If the latter, place
 * an error message into '*err'.
 */
static boolean_t
mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha,
    const char **err)
{
	const uint8_t proto = ipha->ipha_protocol;
	size_t len;
	const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha);
	/* ULP offset from start of L2. */
	const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz;
	ipaddr_t src, dst;
	uint32_t cksum;
	uint16_t *up;

	/*
	 * We need a pointer to the ULP checksum. We're assuming the
	 * ULP checksum pointer resides in the first mblk. Our native
	 * TCP stack should always put the headers in the first mblk,
	 * but currently we have no way to guarantee that other
	 * clients don't spread headers (or even header fields) across
	 * mblks.
	 *
	 * Each case asserts that precondition on DEBUG bits and still
	 * guards it at runtime so non-DEBUG bits fail gracefully.
	 */
	switch (proto) {
	case IPPROTO_TCP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
			*err = "mblk doesn't contain TCP header";
			goto bail;
		}

		up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz);
		cksum = IP_TCP_CSUM_COMP;
		break;

	case IPPROTO_UDP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
			*err = "mblk doesn't contain UDP header";
			goto bail;
		}

		up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz);
		cksum = IP_UDP_CSUM_COMP;
		break;

	case IPPROTO_SCTP: {
		sctp_hdr_t *sctph;

		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
			*err = "mblk doesn't contain SCTP header";
			goto bail;
		}

		/*
		 * SCTP's checksum is computed over the packet by
		 * sctp_cksum(); no pseudo-header accumulation is
		 * needed here. Zero the field first so it doesn't
		 * contribute to its own calculation.
		 */
		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
		return (B_TRUE);
	}

	default:
		*err = "unexpected protocol";
		goto bail;

	}

	/* Pseudo-header checksum. */
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	len = ntohs(ipha->ipha_length) - ip_hdr_sz;

	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
	cksum += htons(len);

	/*
	 * We have already accounted for the pseudo checksum above.
	 * Make sure the ULP checksum field is zero before computing
	 * the rest.
	 */
	*up = 0;
	cksum = IP_CSUM(mp, ulp_offset, cksum);
	/* A computed value of zero is stored as its complement (0xffff). */
	*up = (uint16_t)(cksum ? cksum : ~cksum);

	return (B_TRUE);

bail:
	return (B_FALSE);
}

/*
 * Calculate the ULP checksum for IPv6. Return true if the calculation
 * was successful, or false if an error occurred. If the latter, place
 * an error message into '*err'.
 */
static boolean_t
mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err)
{
	ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset);
	const uint8_t proto = ip6h->ip6_nxt;
	const uint16_t *iphs = (uint16_t *)ip6h;
	/* ULP offset from start of L2. */
	uint32_t ulp_offset;
	size_t len;
	uint32_t cksum;
	uint16_t *up;
	uint16_t ip_hdr_sz;

	/*
	 * Walk any extension headers to find the total IPv6 header
	 * length; fails if the header chain is malformed.
	 */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) {
		*err = "malformed IPv6 header";
		goto bail;
	}

	ulp_offset = ip_hdr_offset + ip_hdr_sz;

	/*
	 * We need a pointer to the ULP checksum. We're assuming the
	 * ULP checksum pointer resides in the first mblk. Our native
	 * TCP stack should always put the headers in the first mblk,
	 * but currently we have no way to guarantee that other
	 * clients don't spread headers (or even header fields) across
	 * mblks.
	 */
	switch (proto) {
	case IPPROTO_TCP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
			*err = "mblk doesn't contain TCP header";
			goto bail;
		}

		up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz);
		cksum = IP_TCP_CSUM_COMP;
		break;

	case IPPROTO_UDP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
			*err = "mblk doesn't contain UDP header";
			goto bail;
		}

		up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz);
		cksum = IP_UDP_CSUM_COMP;
		break;

	case IPPROTO_SCTP: {
		sctp_hdr_t *sctph;

		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
			*err = "mblk doesn't contain SCTP header";
			goto bail;
		}

		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
		return (B_TRUE);
	}

	default:
		*err = "unexpected protocol";
		goto bail;
	}

	/*
	 * The payload length includes the payload and the IPv6
	 * extension headers; the idea is to subtract the extension
	 * header length to get the real payload length.
	 */
	len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN);
	cksum += len;

	/*
	 * We accumulate the pseudo header checksum in cksum; then we
	 * call IP_CSUM to compute the checksum over the payload.
	 * 16-bit words 4-19 of the fixed IPv6 header are the source
	 * and destination addresses.
	 */
	cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] +
	    iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] +
	    iphs[16] + iphs[17] + iphs[18] + iphs[19];
	cksum = IP_CSUM(mp, ulp_offset, cksum);

	/* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */
	if (proto == IPPROTO_UDP && cksum == 0)
		cksum = ~cksum;

	*up = (uint16_t)cksum;

	return (B_TRUE);

bail:
	return (B_FALSE);
}

/*
 * Perform software checksum on a single message, if needed. The
 * emulation performed is determined by an intersection of the mblk's
 * flags and the emul flags requested. The emul flags are documented
 * in mac.h. Returns the (possibly replaced) mblk on success, or NULL
 * after dropping the packet on failure.
 */
static mblk_t *
mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
	mblk_t *skipped_hdr = NULL;
	uint32_t flags, start, stuff, end, value;
	uint32_t ip_hdr_offset;
	uint16_t etype;
	size_t ip_hdr_sz;
	struct ether_header *ehp;
	const char *err = "";

	/*
	 * This function should only be called from mac_hw_emul()
	 * which handles mblk chains and the shared ref case.
	 */
	ASSERT3P(mp->b_next, ==, NULL);

	mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);

	flags = DB_CKSUMFLAGS(mp);

	/* Why call this if checksum emulation isn't needed? */
	ASSERT3U(flags & (HCK_FLAGS), !=, 0);

	/*
	 * Ethernet, and optionally VLAN header. mac_hw_emul() has
	 * already verified we have enough data to read the L2 header.
	 */
	ehp = (struct ether_header *)mp->b_rptr;
	if (ntohs(ehp->ether_type) == VLAN_TPID) {
		struct ether_vlan_header *evhp;

		evhp = (struct ether_vlan_header *)mp->b_rptr;
		etype = ntohs(evhp->ether_type);
		ip_hdr_offset = sizeof (struct ether_vlan_header);
	} else {
		etype = ntohs(ehp->ether_type);
		ip_hdr_offset = sizeof (struct ether_header);
	}

	/*
	 * If this packet isn't IP, then leave it alone. We don't want
	 * to affect non-IP traffic like ARP. Assume the IP header
	 * doesn't include any options, for now. We will use the
	 * correct size later after we know there are enough bytes to
	 * at least fill out the basic header.
	 */
	switch (etype) {
	case ETHERTYPE_IP:
		ip_hdr_sz = sizeof (ipha_t);
		break;
	case ETHERTYPE_IPV6:
		ip_hdr_sz = sizeof (ip6_t);
		break;
	default:
		return (mp);
	}

	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset);

	/*
	 * If the first mblk of this packet contains only the ethernet
	 * header, skip past it for now. Packets with their data
	 * contained in only a single mblk can then use the fastpaths
	 * tuned to that possibility.
	 */
	if (MBLKL(mp) == ip_hdr_offset) {
		/* L3 now starts at offset 0 of the next mblk. */
		ip_hdr_offset -= MBLKL(mp);
		/* This is guaranteed by mac_hw_emul(). */
		ASSERT3P(mp->b_cont, !=, NULL);
		skipped_hdr = mp;
		mp = mp->b_cont;
	}

	/*
	 * Both full and partial checksum rely on finding the IP
	 * header in the current mblk. Our native TCP stack honors
	 * this assumption but it's prudent to guard our future
	 * clients that might not honor this contract.
	 */
	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz);
	if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) {
		err = "mblk doesn't contain IP header";
		goto bail;
	}

	/*
	 * We are about to modify the header mblk; make sure we are
	 * modifying our own copy. The code that follows assumes that
	 * the IP/ULP headers exist in this mblk (and drops the
	 * message if they don't).
	 */
	if (DB_REF(mp) > 1) {
		mblk_t *tmp = copyb(mp);

		if (tmp == NULL) {
			err = "copyb failed";
			goto bail;
		}

		/* Splice the private copy in place of the shared mblk. */
		if (skipped_hdr != NULL) {
			ASSERT3P(skipped_hdr->b_cont, ==, mp);
			skipped_hdr->b_cont = tmp;
		}

		tmp->b_cont = mp->b_cont;
		freeb(mp);
		mp = tmp;
	}

	if (etype == ETHERTYPE_IP) {
		ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset);

		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
			if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err))
				goto bail;
		}

		/* We always update the ULP checksum flags. */
		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
			flags &= ~HCK_FULLCKSUM;
			flags |= HCK_FULLCKSUM_OK;
			value = 0;
		}

		/*
		 * While unlikely, it's possible to write code that
		 * might end up calling mac_sw_cksum() twice on the
		 * same mblk (performing both LSO and checksum
		 * emulation in a single mblk chain loop -- the LSO
		 * emulation inserts a new chain into the existing
		 * chain and then the loop iterates back over the new
		 * segments and emulates the checksum a second time).
		 * Normally this wouldn't be a problem, because the
		 * HCK_*_OK flags are supposed to indicate that we
		 * don't need to perform the work. But
		 * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
		 * same value; so we cannot use these flags to
		 * determine if the IP header checksum has already
		 * been calculated or not. For this reason, we zero
		 * out the checksum first. In the future, we should
		 * fix the HCK_* flags.
		 */
		if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
			flags &= ~HCK_IPV4_HDRCKSUM;
			flags |= HCK_IPV4_HDRCKSUM_OK;
		}
	} else if (etype == ETHERTYPE_IPV6) {
		/* There is no IP header checksum for IPv6. */
		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
			if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err))
				goto bail;
			flags &= ~HCK_FULLCKSUM;
			flags |= HCK_FULLCKSUM_OK;
			value = 0;
		}
	}

	/*
	 * Partial checksum is the same for both IPv4 and IPv6.
	 */
	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
		uint16_t *up, partial, cksum;
		uchar_t *ipp; /* ptr to beginning of IP header */

		ipp = mp->b_rptr + ip_hdr_offset;
		up = (uint16_t *)((uchar_t *)ipp + stuff);
		/* The client's partial sum seeds the calculation. */
		partial = *up;
		*up = 0;

		ASSERT3S(end, >, start);
		cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial);
		/* A computed value of zero is stored as 0xffff. */
		*up = cksum != 0 ? cksum : ~cksum;
	}

	/* We always update the ULP checksum flags. */
	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
		flags &= ~HCK_PARTIALCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		value = 0;
	}

	mac_hcksum_set(mp, start, stuff, end, value, flags);

	/* Don't forget to reattach the header. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);

		/*
		 * Duplicate the HCKSUM data into the header mblk.
		 * This mimics mac_add_vlan_tag which ensures that
		 * both the first mblk _and_ the first data bearing
		 * mblk possess the HCKSUM information. Consumers like
		 * IP will end up discarding the ether_header mblk, so
		 * for now, it is important that the data be available
		 * in both places.
		 */
		mac_hcksum_clone(mp, skipped_hdr);
		mp = skipped_hdr;
	}

	return (mp);

bail:
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);
		mp = skipped_hdr;
	}

	mac_drop_pkt(mp, err);
	return (NULL);
}

/*
 * Build a single data segment from an LSO packet. The mblk chain
 * returned, seg_head, represents the data segment and is always
 * exactly seg_len bytes long. The lso_mp and offset input/output
 * parameters track our position in the LSO packet. This function
 * exists solely as a helper to mac_sw_lso().
 *
 * There are three cases, based on how the requested seg_len relates
 * to the data remaining in the current *lso_mp starting at *offset:
 *
 * Case A
 *
 *    The current lso_mp holds more than seg_len bytes past *offset.
 *    A single dup'd mblk is returned covering exactly seg_len bytes,
 *    and *offset is advanced by seg_len within the same lso_mp.
 *
 * Case B
 *
 *    The requested seg_len consumes exactly the rest of the lso_mp
 *    (the segment's b_wptr coincides with lso_mp's b_wptr). A single
 *    dup'd mblk is returned, *offset is reset to zero, and *lso_mp
 *    walks forward to the next mblk in the chain.
 *
 * Case C
 *
 *    The requested seg_len is greater than the data remaining in the
 *    current lso_mp. LSO mblks are consumed (each dup'd and linked
 *    onto the segment via b_cont) until enough data remains to finish
 *    as case (A) or (B) above. Multiple mblks are returned, *offset
 *    is set per the terminating case, and *lso_mp walks forward at
 *    least one mblk, possibly more. The final mblk of the segment
 *    always terminates via case (A) or (B).
 */
static mblk_t *
build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
{
	mblk_t *seg_head, *seg_tail, *seg_mp;

	ASSERT3P(*lso_mp, !=, NULL);
	/* There must be data remaining past the current offset. */
	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);

	seg_mp = dupb(*lso_mp);
	if (seg_mp == NULL)
		return (NULL);

	seg_head = seg_mp;
	seg_tail = seg_mp;

	/* Continue where we left off from in the lso_mp. */
	seg_mp->b_rptr += *offset;

last_mblk:
	/* Case (A) */
	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
		*offset += seg_len;
		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
		return (seg_head);
	}

	/* Case (B) */
	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		return (seg_head);
	}

	/* Case (C) */
	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);

	/*
	 * The current LSO mblk doesn't have enough data to satisfy
	 * seg_len -- continue peeling off LSO mblks to build the new
	 * segment message. If allocation fails we free the previously
	 * allocated segment mblks and return NULL.
	 */
	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
		seg_len -= MBLKL(seg_mp);
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		seg_mp = dupb(*lso_mp);

		if (seg_mp == NULL) {
			freemsgchain(seg_head);
			return (NULL);
		}

		seg_tail->b_cont = seg_mp;
		seg_tail = seg_mp;
	}

	/*
	 * We've walked enough LSO mblks that we can now satisfy the
	 * remaining seg_len. At this point we need to jump back to
	 * determine if we have arrived at case (A) or (B).
	 */

	/* Just to be paranoid that we didn't underflow. */
	ASSERT3U(seg_len, <, IP_MAXPACKET);
	ASSERT3U(seg_len, >, 0);
	goto last_mblk;
}

/*
 * Perform software segmentation of a single LSO message. Take an LSO
 * message as input and return head/tail pointers as output. This
 * function should not be invoked directly but instead through
 * mac_hw_emul().
 *
 * The resulting chain is comprised of multiple (nsegs) MSS sized
 * segments. Each segment will consist of two or more mblks joined by
 * b_cont: a header and one or more data mblks. The header mblk is
 * allocated anew for each message. The first segment's header is used
 * as a template for the rest with adjustments made for things such as
 * ID, sequence, length, TCP flags, etc. The data mblks reference into
 * the existing LSO mblk (passed in as omp) by way of dupb(). Their
 * b_rptr/b_wptr values are adjusted to reference only the fraction of
 * the LSO message they are responsible for. At the successful
 * completion of this function the original mblk (omp) is freed,
 * leaving the newly created segment chain as the only remaining
 * reference to the data.
799 */ 800 static void 801 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, 802 uint_t *count) 803 { 804 uint32_t ocsum_flags, ocsum_start, ocsum_stuff; 805 uint32_t mss; 806 uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; 807 uint32_t oleft; 808 uint_t nsegs, seg; 809 int len; 810 811 struct ether_vlan_header *oevh; 812 const ipha_t *oiph; 813 const tcph_t *otcph; 814 ipha_t *niph; 815 tcph_t *ntcph; 816 uint16_t ip_id; 817 uint32_t tcp_seq, tcp_sum, otcp_sum; 818 819 uint32_t offset; 820 mblk_t *odatamp; 821 mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; 822 mblk_t *tmptail; 823 824 ASSERT3P(head, !=, NULL); 825 ASSERT3P(tail, !=, NULL); 826 ASSERT3P(count, !=, NULL); 827 ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); 828 829 /* Assume we are dealing with a single LSO message. */ 830 ASSERT3P(omp->b_next, ==, NULL); 831 832 /* 833 * XXX: This is a hack to deal with mac_add_vlan_tag(). 834 * 835 * When VLANs are in play, mac_add_vlan_tag() creates a new 836 * mblk with just the ether_vlan_header and tacks it onto the 837 * front of 'omp'. This breaks the assumptions made below; 838 * namely that the TCP/IP headers are in the first mblk. In 839 * this case, since we already have to pay the cost of LSO 840 * emulation, we simply pull up everything. While this might 841 * seem irksome, keep in mind this will only apply in a couple 842 * of scenarios: a) an LSO-capable VLAN client sending to a 843 * non-LSO-capable client over the "MAC/bridge loopback" 844 * datapath or b) an LSO-capable VLAN client is sending to a 845 * client that, for whatever reason, doesn't have DLS-bypass 846 * enabled. Finally, we have to check for both a tagged and 847 * untagged sized mblk depending on if the mblk came via 848 * mac_promisc_dispatch() or mac_rx_deliver(). 849 * 850 * In the future, two things should be done: 851 * 852 * 1. This function should make use of some yet to be 853 * implemented "mblk helpers". 
These helper functions would 854 * perform all the b_cont walking for us and guarantee safe 855 * access to the mblk data. 856 * 857 * 2. We should add some slop to the mblks so that 858 * mac_add_vlan_tag() can just edit the first mblk instead 859 * of allocating on the hot path. 860 */ 861 if (MBLKL(omp) == sizeof (struct ether_vlan_header) || 862 MBLKL(omp) == sizeof (struct ether_header)) { 863 mblk_t *tmp = msgpullup(omp, -1); 864 865 if (tmp == NULL) { 866 mac_drop_pkt(omp, "failed to pull up"); 867 goto fail; 868 } 869 870 mac_hcksum_clone(omp, tmp); 871 freemsg(omp); 872 omp = tmp; 873 } 874 875 mss = DB_LSOMSS(omp); 876 ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + 877 sizeof (struct ether_vlan_header)); 878 opktlen = msgsize(omp); 879 880 /* 881 * First, get references to the IP and TCP headers and 882 * determine the total TCP length (header + data). 883 * 884 * Thanks to mac_hw_emul() we know that the first mblk must 885 * contain (at minimum) the full L2 header. However, this 886 * function assumes more than that. It assumes the L2/L3/L4 887 * headers are all contained in the first mblk of a message 888 * (i.e., no b_cont walking for headers). While this is a 889 * current reality (our native TCP stack and viona both 890 * enforce this) things may become more nuanced in the future 891 * (e.g. when introducing encap support or adding new 892 * clients). For now we guard against this case by dropping 893 * the packet. 
894 */ 895 oevh = (struct ether_vlan_header *)omp->b_rptr; 896 if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) 897 oehlen = sizeof (struct ether_vlan_header); 898 else 899 oehlen = sizeof (struct ether_header); 900 901 ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); 902 if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { 903 mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); 904 goto fail; 905 } 906 907 oiph = (ipha_t *)(omp->b_rptr + oehlen); 908 oiphlen = IPH_HDR_LENGTH(oiph); 909 otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); 910 otcphlen = TCP_HDR_LENGTH(otcph); 911 912 /* 913 * Currently we only support LSO for TCP/IPv4. 914 */ 915 if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { 916 mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", 917 IPH_HDR_VERSION(oiph)); 918 goto fail; 919 } 920 921 if (oiph->ipha_protocol != IPPROTO_TCP) { 922 mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", 923 oiph->ipha_protocol); 924 goto fail; 925 } 926 927 if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { 928 mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); 929 goto fail; 930 } 931 932 ohdrslen = oehlen + oiphlen + otcphlen; 933 if ((len = MBLKL(omp)) < ohdrslen) { 934 mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, 935 ohdrslen); 936 goto fail; 937 } 938 939 /* 940 * Either we have data in the first mblk or it's just the 941 * header. In either case, we need to set rptr to the start of 942 * the TCP data. 943 */ 944 if (len > ohdrslen) { 945 odatamp = omp; 946 offset = ohdrslen; 947 } else { 948 ASSERT3U(len, ==, ohdrslen); 949 odatamp = omp->b_cont; 950 offset = 0; 951 } 952 953 /* Make sure we still have enough data. 
*/ 954 ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); 955 956 /* 957 * If a MAC negotiated LSO then it must negotioate both 958 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or 959 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers 960 * change during LSO segmentation (only the 3 fields of the 961 * pseudo header checksum don't change: src, dst, proto). Thus 962 * we would expect these flags (HCK_IPV4_HDRCKSUM | 963 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this 964 * function to emulate those checksums in software. However, 965 * that assumes a world where we only expose LSO if the 966 * underlying hardware exposes LSO. Moving forward the plan is 967 * to assume LSO in the upper layers and have MAC perform 968 * software LSO when the underlying provider doesn't support 969 * it. In such a world, if the provider doesn't support LSO 970 * but does support hardware checksum offload, then we could 971 * simply perform the segmentation and allow the hardware to 972 * calculate the checksums. To the hardware it's just another 973 * chain of non-LSO packets. 974 */ 975 ASSERT3S(DB_TYPE(omp), ==, M_DATA); 976 ocsum_flags = DB_CKSUMFLAGS(omp); 977 ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); 978 ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); 979 980 /* 981 * If hardware only provides partial checksum then software 982 * must supply the pseudo-header checksum. In the case of LSO 983 * we leave the TCP length at zero to be filled in by 984 * hardware. This function must handle two scenarios. 985 * 986 * 1. Being called by a MAC client on the Rx path to segment 987 * an LSO packet and calculate the checksum. 988 * 989 * 2. Being called by a MAC provider to segment an LSO packet. 990 * In this case the LSO segmentation is performed in 991 * software (by this routine) but the MAC provider should 992 * still calculate the TCP/IP checksums in hardware. 
993 * 994 * To elaborate on the second case: we cannot have the 995 * scenario where IP sends LSO packets but the underlying HW 996 * doesn't support checksum offload -- because in that case 997 * TCP/IP would calculate the checksum in software (for the 998 * LSO packet) but then MAC would segment the packet and have 999 * to redo all the checksum work. So IP should never do LSO 1000 * if HW doesn't support both IP and TCP checksum. 1001 */ 1002 if (ocsum_flags & HCK_PARTIALCKSUM) { 1003 ocsum_start = (uint32_t)DB_CKSUMSTART(omp); 1004 ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); 1005 } 1006 1007 odatalen = opktlen - ohdrslen; 1008 1009 /* 1010 * Subtract one to account for the case where the data length 1011 * is evenly divisble by the MSS. Add one to account for the 1012 * fact that the division will always result in one less 1013 * segment than needed. 1014 */ 1015 nsegs = ((odatalen - 1) / mss) + 1; 1016 if (nsegs < 2) { 1017 mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); 1018 goto fail; 1019 } 1020 1021 DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, 1022 __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, 1023 nsegs); 1024 1025 seg_chain = NULL; 1026 tmptail = seg_chain; 1027 oleft = odatalen; 1028 1029 for (uint_t i = 0; i < nsegs; i++) { 1030 boolean_t last_seg = ((i + 1) == nsegs); 1031 uint32_t seg_len; 1032 1033 /* 1034 * If we fail to allocate, then drop the partially 1035 * allocated chain as well as the LSO packet. Let the 1036 * sender deal with the fallout. 1037 */ 1038 if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { 1039 freemsgchain(seg_chain); 1040 mac_drop_pkt(omp, "failed to alloc segment header"); 1041 goto fail; 1042 } 1043 ASSERT3P(nhdrmp->b_cont, ==, NULL); 1044 1045 if (seg_chain == NULL) { 1046 seg_chain = nhdrmp; 1047 } else { 1048 ASSERT3P(tmptail, !=, NULL); 1049 tmptail->b_next = nhdrmp; 1050 } 1051 1052 tmptail = nhdrmp; 1053 1054 /* 1055 * Calculate this segment's lengh. 
It's either the MSS 1056 * or whatever remains for the last segment. 1057 */ 1058 seg_len = last_seg ? oleft : mss; 1059 ASSERT3U(seg_len, <=, mss); 1060 ndatamp = build_data_seg(&odatamp, &offset, seg_len); 1061 1062 if (ndatamp == NULL) { 1063 freemsgchain(seg_chain); 1064 mac_drop_pkt(omp, "LSO failed to segment data"); 1065 goto fail; 1066 } 1067 1068 /* Attach data mblk to header mblk. */ 1069 nhdrmp->b_cont = ndatamp; 1070 DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; 1071 ASSERT3U(seg_len, <=, oleft); 1072 oleft -= seg_len; 1073 } 1074 1075 /* We should have consumed entire LSO msg. */ 1076 ASSERT3S(oleft, ==, 0); 1077 ASSERT3P(odatamp, ==, NULL); 1078 1079 /* 1080 * All seg data mblks are referenced by the header mblks, null 1081 * out this pointer to catch any bad derefs. 1082 */ 1083 ndatamp = NULL; 1084 1085 /* 1086 * Set headers and checksum for first segment. 1087 */ 1088 nhdrmp = seg_chain; 1089 bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); 1090 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1091 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1092 ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); 1093 niph->ipha_length = htons(oiphlen + otcphlen + mss); 1094 niph->ipha_hdr_checksum = 0; 1095 ip_id = ntohs(niph->ipha_ident); 1096 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1097 tcp_seq = BE32_TO_U32(ntcph->th_seq); 1098 tcp_seq += mss; 1099 1100 /* 1101 * The first segment shouldn't: 1102 * 1103 * o indicate end of data transmission (FIN), 1104 * o indicate immediate handling of the data (PUSH). 1105 */ 1106 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); 1107 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1108 1109 /* 1110 * If the underlying HW provides partial checksum, then make 1111 * sure to correct the pseudo header checksum before calling 1112 * mac_sw_cksum(). The native TCP stack doesn't include the 1113 * length field in the pseudo header when LSO is in play -- so 1114 * we need to calculate it here. 
1115 */ 1116 if (ocsum_flags & HCK_PARTIALCKSUM) { 1117 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1118 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1119 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1120 tcp_sum = BE16_TO_U16(ntcph->th_sum); 1121 otcp_sum = tcp_sum; 1122 tcp_sum += mss + otcphlen; 1123 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); 1124 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1125 } 1126 1127 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1128 (emul & MAC_HWCKSUM_EMULS)) { 1129 next_nhdrmp = nhdrmp->b_next; 1130 nhdrmp->b_next = NULL; 1131 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1132 nhdrmp->b_next = next_nhdrmp; 1133 next_nhdrmp = NULL; 1134 1135 /* 1136 * We may have freed the nhdrmp argument during 1137 * checksum emulation, make sure that seg_chain 1138 * references a valid mblk. 1139 */ 1140 seg_chain = nhdrmp; 1141 } 1142 1143 ASSERT3P(nhdrmp, !=, NULL); 1144 1145 seg = 1; 1146 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1147 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1148 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, 1149 uint_t, seg); 1150 seg++; 1151 1152 /* There better be at least 2 segs. */ 1153 ASSERT3P(nhdrmp->b_next, !=, NULL); 1154 prev_nhdrmp = nhdrmp; 1155 nhdrmp = nhdrmp->b_next; 1156 1157 /* 1158 * Now adjust the headers of the middle segments. For each 1159 * header we need to adjust the following. 1160 * 1161 * o IP ID 1162 * o IP length 1163 * o TCP sequence 1164 * o TCP flags 1165 * o cksum flags 1166 * o cksum values (if MAC_HWCKSUM_EMUL is set) 1167 */ 1168 for (; seg < nsegs; seg++) { 1169 /* 1170 * We use seg_chain as a reference to the first seg 1171 * header mblk -- this first header is a template for 1172 * the rest of the segments. This copy will include 1173 * the now updated checksum values from the first 1174 * header. We must reset these checksum values to 1175 * their original to make sure we produce the correct 1176 * value. 
1177 */ 1178 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); 1179 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1180 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1181 niph->ipha_ident = htons(++ip_id); 1182 ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); 1183 niph->ipha_length = htons(oiphlen + otcphlen + mss); 1184 niph->ipha_hdr_checksum = 0; 1185 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1186 U32_TO_BE32(tcp_seq, ntcph->th_seq); 1187 tcp_seq += mss; 1188 /* 1189 * Just like the first segment, the middle segments 1190 * shouldn't have these flags set. 1191 */ 1192 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); 1193 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1194 1195 if (ocsum_flags & HCK_PARTIALCKSUM) { 1196 /* 1197 * First and middle segs have same 1198 * pseudo-header checksum. 1199 */ 1200 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1201 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1202 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1203 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1204 } 1205 1206 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1207 (emul & MAC_HWCKSUM_EMULS)) { 1208 next_nhdrmp = nhdrmp->b_next; 1209 nhdrmp->b_next = NULL; 1210 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1211 nhdrmp->b_next = next_nhdrmp; 1212 next_nhdrmp = NULL; 1213 /* We may have freed the original nhdrmp. */ 1214 prev_nhdrmp->b_next = nhdrmp; 1215 } 1216 1217 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1218 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1219 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), 1220 uint_t, mss, uint_t, seg); 1221 1222 ASSERT3P(nhdrmp->b_next, !=, NULL); 1223 prev_nhdrmp = nhdrmp; 1224 nhdrmp = nhdrmp->b_next; 1225 } 1226 1227 /* Make sure we are on the last segment. */ 1228 ASSERT3U(seg, ==, nsegs); 1229 ASSERT3P(nhdrmp->b_next, ==, NULL); 1230 1231 /* 1232 * Now we set the last segment header. The difference being 1233 * that FIN/PSH/RST flags are allowed. 
1234 */ 1235 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); 1236 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; 1237 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); 1238 niph->ipha_ident = htons(++ip_id); 1239 len = msgsize(nhdrmp->b_cont); 1240 ASSERT3S(len, >, 0); 1241 niph->ipha_length = htons(oiphlen + otcphlen + len); 1242 niph->ipha_hdr_checksum = 0; 1243 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); 1244 U32_TO_BE32(tcp_seq, ntcph->th_seq); 1245 1246 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); 1247 if (ocsum_flags & HCK_PARTIALCKSUM) { 1248 DB_CKSUMSTART(nhdrmp) = ocsum_start; 1249 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); 1250 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; 1251 tcp_sum = otcp_sum; 1252 tcp_sum += len + otcphlen; 1253 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); 1254 U16_TO_BE16(tcp_sum, ntcph->th_sum); 1255 } 1256 1257 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && 1258 (emul & MAC_HWCKSUM_EMULS)) { 1259 /* This should be the last mblk. */ 1260 ASSERT3P(nhdrmp->b_next, ==, NULL); 1261 nhdrmp = mac_sw_cksum(nhdrmp, emul); 1262 prev_nhdrmp->b_next = nhdrmp; 1263 } 1264 1265 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, 1266 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, 1267 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, 1268 uint_t, seg); 1269 1270 /* 1271 * Free the reference to the original LSO message as it is 1272 * being replaced by seg_cahin. 1273 */ 1274 freemsg(omp); 1275 *head = seg_chain; 1276 *tail = nhdrmp; 1277 *count = nsegs; 1278 return; 1279 1280 fail: 1281 *head = NULL; 1282 *tail = NULL; 1283 *count = 0; 1284 } 1285 1286 #define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) 1287 1288 /* 1289 * Emulate various hardware offload features in software. Take a chain 1290 * of packets as input and emulate the hardware features specified in 1291 * 'emul'. 
The resulting chain's head pointer replaces the 'mp_chain' 1292 * pointer given as input, and its tail pointer is written to 1293 * '*otail'. The number of packets in the new chain is written to 1294 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus 1295 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in 1296 * which case 'mp_chain' will simply stay a NULL chain. 1297 * 1298 * While unlikely, it is technically possible that this function could 1299 * receive a non-NULL chain as input and return a NULL chain as output 1300 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be 1301 * zero). This could happen if all the packets in the chain are 1302 * dropped or if we fail to allocate new mblks. In this case, there is 1303 * nothing for the caller to free. In any event, the caller shouldn't 1304 * assume that '*mp_chain' is non-NULL on return. 1305 * 1306 * This function was written with three main use cases in mind. 1307 * 1308 * 1. To emulate hardware offloads when traveling mac-loopback (two 1309 * clients on the same mac). This is wired up in mac_tx_send(). 1310 * 1311 * 2. To provide hardware offloads to the client when the underlying 1312 * provider cannot. This is currently wired up in mac_tx() but we 1313 * still only negotiate offloads when the underlying provider 1314 * supports them. 1315 * 1316 * 3. To emulate real hardware in simnet. 1317 */ 1318 void 1319 mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) 1320 { 1321 mblk_t *head = NULL, *tail = NULL; 1322 uint_t count = 0; 1323 1324 ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); 1325 ASSERT3P(mp_chain, !=, NULL); 1326 1327 for (mblk_t *mp = *mp_chain; mp != NULL; ) { 1328 mblk_t *tmp, *next, *tmphead, *tmptail; 1329 struct ether_header *ehp; 1330 uint32_t flags; 1331 uint_t len = MBLKL(mp), l2len; 1332 1333 /* Perform LSO/cksum one message at a time. 
*/ 1334 next = mp->b_next; 1335 mp->b_next = NULL; 1336 1337 /* 1338 * For our sanity the first mblk should contain at 1339 * least the full L2 header. 1340 */ 1341 if (len < sizeof (struct ether_header)) { 1342 mac_drop_pkt(mp, "packet too short (A): %u", len); 1343 mp = next; 1344 continue; 1345 } 1346 1347 ehp = (struct ether_header *)mp->b_rptr; 1348 if (ntohs(ehp->ether_type) == VLAN_TPID) 1349 l2len = sizeof (struct ether_vlan_header); 1350 else 1351 l2len = sizeof (struct ether_header); 1352 1353 /* 1354 * If the first mblk is solely the L2 header, then 1355 * there better be more data. 1356 */ 1357 if (len < l2len || (len == l2len && mp->b_cont == NULL)) { 1358 mac_drop_pkt(mp, "packet too short (C): %u", len); 1359 mp = next; 1360 continue; 1361 } 1362 1363 DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); 1364 1365 /* 1366 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) 1367 * because we don't want to mask-out the LSO flag. 1368 */ 1369 flags = DB_CKSUMFLAGS(mp); 1370 1371 if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { 1372 uint_t tmpcount = 0; 1373 1374 /* 1375 * LSO fix-up handles checksum emulation 1376 * inline (if requested). It also frees mp. 1377 */ 1378 mac_sw_lso(mp, emul, &tmphead, &tmptail, 1379 &tmpcount); 1380 if (tmphead == NULL) { 1381 /* mac_sw_lso() freed the mp. */ 1382 mp = next; 1383 continue; 1384 } 1385 count += tmpcount; 1386 } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { 1387 tmp = mac_sw_cksum(mp, emul); 1388 if (tmp == NULL) { 1389 /* mac_sw_cksum() freed the mp. */ 1390 mp = next; 1391 continue; 1392 } 1393 tmphead = tmp; 1394 tmptail = tmp; 1395 count++; 1396 } else { 1397 /* There is nothing to emulate. */ 1398 tmp = mp; 1399 tmphead = tmp; 1400 tmptail = tmp; 1401 count++; 1402 } 1403 1404 /* 1405 * The tmp mblk chain is either the start of the new 1406 * chain or added to the tail of the new chain. 
1407 */ 1408 if (head == NULL) { 1409 head = tmphead; 1410 tail = tmptail; 1411 } else { 1412 /* Attach the new mblk to the end of the new chain. */ 1413 tail->b_next = tmphead; 1414 tail = tmptail; 1415 } 1416 1417 mp = next; 1418 } 1419 1420 *mp_chain = head; 1421 1422 if (otail != NULL) 1423 *otail = tail; 1424 1425 if (ocount != NULL) 1426 *ocount = count; 1427 } 1428 1429 /* 1430 * Add VLAN tag to the specified mblk. 1431 */ 1432 mblk_t * 1433 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) 1434 { 1435 mblk_t *hmp; 1436 struct ether_vlan_header *evhp; 1437 struct ether_header *ehp; 1438 1439 ASSERT(pri != 0 || vid != 0); 1440 1441 /* 1442 * Allocate an mblk for the new tagged ethernet header, 1443 * and copy the MAC addresses and ethertype from the 1444 * original header. 1445 */ 1446 1447 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); 1448 if (hmp == NULL) { 1449 freemsg(mp); 1450 return (NULL); 1451 } 1452 1453 evhp = (struct ether_vlan_header *)hmp->b_rptr; 1454 ehp = (struct ether_header *)mp->b_rptr; 1455 1456 bcopy(ehp, evhp, (ETHERADDRL * 2)); 1457 evhp->ether_type = ehp->ether_type; 1458 evhp->ether_tpid = htons(ETHERTYPE_VLAN); 1459 1460 hmp->b_wptr += sizeof (struct ether_vlan_header); 1461 mp->b_rptr += sizeof (struct ether_header); 1462 1463 /* 1464 * Free the original message if it's now empty. Link the 1465 * rest of messages to the header message. 1466 */ 1467 mac_hcksum_clone(mp, hmp); 1468 if (MBLKL(mp) == 0) { 1469 hmp->b_cont = mp->b_cont; 1470 freeb(mp); 1471 } else { 1472 hmp->b_cont = mp; 1473 } 1474 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); 1475 1476 /* 1477 * Initialize the new TCI (Tag Control Information). 1478 */ 1479 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); 1480 1481 return (hmp); 1482 } 1483 1484 /* 1485 * Adds a VLAN tag with the specified VID and priority to each mblk of 1486 * the specified chain. 
1487 */ 1488 mblk_t * 1489 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) 1490 { 1491 mblk_t *next_mp, **prev, *mp; 1492 1493 mp = mp_chain; 1494 prev = &mp_chain; 1495 1496 while (mp != NULL) { 1497 next_mp = mp->b_next; 1498 mp->b_next = NULL; 1499 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { 1500 freemsgchain(next_mp); 1501 break; 1502 } 1503 *prev = mp; 1504 prev = &mp->b_next; 1505 mp = mp->b_next = next_mp; 1506 } 1507 1508 return (mp_chain); 1509 } 1510 1511 /* 1512 * Strip VLAN tag 1513 */ 1514 mblk_t * 1515 mac_strip_vlan_tag(mblk_t *mp) 1516 { 1517 mblk_t *newmp; 1518 struct ether_vlan_header *evhp; 1519 1520 evhp = (struct ether_vlan_header *)mp->b_rptr; 1521 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { 1522 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); 1523 1524 if (DB_REF(mp) > 1) { 1525 newmp = copymsg(mp); 1526 if (newmp == NULL) 1527 return (NULL); 1528 freemsg(mp); 1529 mp = newmp; 1530 } 1531 1532 evhp = (struct ether_vlan_header *)mp->b_rptr; 1533 1534 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); 1535 mp->b_rptr += VLAN_TAGSZ; 1536 } 1537 return (mp); 1538 } 1539 1540 /* 1541 * Strip VLAN tag from each mblk of the chain. 1542 */ 1543 mblk_t * 1544 mac_strip_vlan_tag_chain(mblk_t *mp_chain) 1545 { 1546 mblk_t *mp, *next_mp, **prev; 1547 1548 mp = mp_chain; 1549 prev = &mp_chain; 1550 1551 while (mp != NULL) { 1552 next_mp = mp->b_next; 1553 mp->b_next = NULL; 1554 if ((mp = mac_strip_vlan_tag(mp)) == NULL) { 1555 freemsgchain(next_mp); 1556 break; 1557 } 1558 *prev = mp; 1559 prev = &mp->b_next; 1560 mp = mp->b_next = next_mp; 1561 } 1562 1563 return (mp_chain); 1564 } 1565 1566 /* 1567 * Default callback function. Used when the datapath is not yet initialized. 
1568 */ 1569 /* ARGSUSED */ 1570 void 1571 mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain, 1572 boolean_t loopback) 1573 { 1574 freemsgchain(mp_chain); 1575 } 1576 1577 /* 1578 * Determines the IPv6 header length accounting for all the optional IPv6 1579 * headers (hop-by-hop, destination, routing and fragment). The header length 1580 * and next header value (a transport header) is captured. 1581 * 1582 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise 1583 * returns B_TRUE. 1584 */ 1585 boolean_t 1586 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length, 1587 uint8_t *next_hdr, ip6_frag_t **fragp) 1588 { 1589 uint16_t length; 1590 uint_t ehdrlen; 1591 uint8_t *whereptr; 1592 uint8_t *nexthdrp; 1593 ip6_dest_t *desthdr; 1594 ip6_rthdr_t *rthdr; 1595 ip6_frag_t *fraghdr; 1596 1597 if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) 1598 return (B_FALSE); 1599 ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); 1600 length = IPV6_HDR_LEN; 1601 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ 1602 1603 if (fragp != NULL) 1604 *fragp = NULL; 1605 1606 nexthdrp = &ip6h->ip6_nxt; 1607 while (whereptr < endptr) { 1608 /* Is there enough left for len + nexthdr? 
*/ 1609 if (whereptr + MIN_EHDR_LEN > endptr) 1610 break; 1611 1612 switch (*nexthdrp) { 1613 case IPPROTO_HOPOPTS: 1614 case IPPROTO_DSTOPTS: 1615 /* Assumes the headers are identical for hbh and dst */ 1616 desthdr = (ip6_dest_t *)whereptr; 1617 ehdrlen = 8 * (desthdr->ip6d_len + 1); 1618 if ((uchar_t *)desthdr + ehdrlen > endptr) 1619 return (B_FALSE); 1620 nexthdrp = &desthdr->ip6d_nxt; 1621 break; 1622 case IPPROTO_ROUTING: 1623 rthdr = (ip6_rthdr_t *)whereptr; 1624 ehdrlen = 8 * (rthdr->ip6r_len + 1); 1625 if ((uchar_t *)rthdr + ehdrlen > endptr) 1626 return (B_FALSE); 1627 nexthdrp = &rthdr->ip6r_nxt; 1628 break; 1629 case IPPROTO_FRAGMENT: 1630 fraghdr = (ip6_frag_t *)whereptr; 1631 ehdrlen = sizeof (ip6_frag_t); 1632 if ((uchar_t *)&fraghdr[1] > endptr) 1633 return (B_FALSE); 1634 nexthdrp = &fraghdr->ip6f_nxt; 1635 if (fragp != NULL) 1636 *fragp = fraghdr; 1637 break; 1638 case IPPROTO_NONE: 1639 /* No next header means we're finished */ 1640 default: 1641 *hdr_length = length; 1642 *next_hdr = *nexthdrp; 1643 return (B_TRUE); 1644 } 1645 length += ehdrlen; 1646 whereptr += ehdrlen; 1647 *hdr_length = length; 1648 *next_hdr = *nexthdrp; 1649 } 1650 switch (*nexthdrp) { 1651 case IPPROTO_HOPOPTS: 1652 case IPPROTO_DSTOPTS: 1653 case IPPROTO_ROUTING: 1654 case IPPROTO_FRAGMENT: 1655 /* 1656 * If any know extension headers are still to be processed, 1657 * the packet's malformed (or at least all the IP header(s) are 1658 * not in the same mblk - and that should never happen. 1659 */ 1660 return (B_FALSE); 1661 1662 default: 1663 /* 1664 * If we get here, we know that all of the IP headers were in 1665 * the same mblk, even if the ULP header is in the next mblk. 1666 */ 1667 *hdr_length = length; 1668 *next_hdr = *nexthdrp; 1669 return (B_TRUE); 1670 } 1671 } 1672 1673 /* 1674 * The following set of routines are there to take care of interrupt 1675 * re-targeting for legacy (fixed) interrupts. 
 * Some older versions
 * of the popular NICs like e1000g do not support MSI-X interrupts
 * and they reserve fixed interrupts for RX/TX rings. To re-target
 * these interrupts, PCITOOL ioctls need to be used.
 */
typedef struct mac_dladm_intr {
	int	ino;			/* interrupt number */
	int	cpu_id;			/* CPU the interrupt is bound to */
	char	driver_path[MAXPATHLEN];	/* minor path of the NIC */
	char	nexus_path[MAXPATHLEN];	/* /devices path of intr nexus */
} mac_dladm_intr_t;

/* Bind the interrupt to cpu_num */
static int
mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
{
	pcitool_intr_set_t	iset;
	int			err;

	/* Re-target interrupt 'ino' from 'oldcpuid' to 'cpu_num'. */
	iset.old_cpu = oldcpuid;
	iset.ino = ino;
	iset.cpu_id = cpu_num;
	iset.user_version = PCITOOL_VERSION;
	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
	    kcred, NULL);

	/* Returns 0 on success, otherwise the ldi_ioctl() error. */
	return (err);
}

/*
 * Search interrupt information. iget is filled in with the info to search
 */
static boolean_t
mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
{
	int	i;
	char	driver_path[2 * MAXPATHLEN];

	for (i = 0; i < iget_p->num_devs; i++) {
		/* Build "<path>:<driver><instance>" for this device. */
		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
		    ":%s%d", iget_p->dev[i].driver_name,
		    iget_p->dev[i].dev_inst);
		/* Match the device path for the device path */
		if (strcmp(driver_path, dln->driver_path) == 0) {
			/* Found our device; record its ino and CPU. */
			dln->ino = iget_p->ino;
			dln->cpu_id = iget_p->cpu_id;
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

/*
 * Get information about ino, i.e. if this is the interrupt for our
 * device and where it is bound etc.
 */
static boolean_t
mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
    mac_dladm_intr_t *dln)
{
	pcitool_intr_get_t	*iget_p;
	int			ipsz;
	int			nipsz;
	int			err;
	uint8_t			inum;

	/*
	 * Check if SLEEP is OK, i.e if could come here in response to
	 * changing the fanout due to some callback from the driver, say
	 * link speed changes.
	 */
	ipsz = PCITOOL_IGET_SIZE(0);
	iget_p = kmem_zalloc(ipsz, KM_SLEEP);

	iget_p->num_devs_ret = 0;
	iget_p->user_version = PCITOOL_VERSION;
	iget_p->cpu_id = oldcpuid;
	iget_p->ino = ino;

	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
	    FKIOCTL, kcred, NULL);
	if (err != 0) {
		kmem_free(iget_p, ipsz);
		return (B_FALSE);
	}
	if (iget_p->num_devs == 0) {
		/* No device shares this ino; nothing to match. */
		kmem_free(iget_p, ipsz);
		return (B_FALSE);
	}
	/*
	 * NOTE(review): 'inum' is uint8_t, so a num_devs value > 255
	 * would be truncated here -- presumably never the case for
	 * devices sharing a single fixed interrupt; confirm.
	 */
	inum = iget_p->num_devs;
	if (iget_p->num_devs_ret < iget_p->num_devs) {
		/* Reallocate */
		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);

		kmem_free(iget_p, ipsz);
		ipsz = nipsz;
		iget_p = kmem_zalloc(ipsz, KM_SLEEP);

		/* Re-issue the ioctl with room for all 'inum' devices. */
		iget_p->num_devs_ret = inum;
		iget_p->cpu_id = oldcpuid;
		iget_p->ino = ino;
		iget_p->user_version = PCITOOL_VERSION;
		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
		    FKIOCTL, kcred, NULL);
		if (err != 0) {
			kmem_free(iget_p, ipsz);
			return (B_FALSE);
		}
		/* defensive */
		if (iget_p->num_devs != iget_p->num_devs_ret) {
			kmem_free(iget_p, ipsz);
			return (B_FALSE);
		}
	}

	if (mac_search_intrinfo(iget_p, dln)) {
		kmem_free(iget_p, ipsz);
		return (B_TRUE);
	}
	kmem_free(iget_p, ipsz);
	return (B_FALSE);
}

/*
 * Get the interrupts and check each one to see if it is for our device.
 *
 * Returns 0 if the device's interrupt is found and already bound to
 * 'cpuid', 1 if found but bound elsewhere (dln filled in), and -1 if
 * the interrupt could not be located at all.
 */
static int
mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
{
	pcitool_intr_info_t	intr_info;
	int			err;
	int			ino;
	int			oldcpuid;

	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
	    FKIOCTL, kcred, NULL);
	if (err != 0)
		return (-1);

	/* Scan every (cpu, ino) pair looking for our device's interrupt. */
	for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
		for (ino = 0; ino < intr_info.num_intr; ino++) {
			if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
				if (dln->cpu_id == cpuid)
					return (0);
				return (1);
			}
		}
	}
	return (-1);
}

/*
 * Obtain the nexus parent node info. for mdip.
 *
 * Walks up the devinfo tree from 'mdip' looking for an ancestor with
 * an interrupt-control (DDI_NT_INTRCTL) minor node; fills in
 * dln->nexus_path and returns that node, or NULL if none is found.
 */
static dev_info_t *
mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
{
	struct dev_info		*tdip = (struct dev_info *)mdip;
	struct ddi_minor_data	*minordata;
	int			circ;
	dev_info_t		*pdip;
	char			pathname[MAXPATHLEN];

	while (tdip != NULL) {
		/*
		 * The netboot code could call this function while walking the
		 * device tree so we need to use ndi_devi_tryenter() here to
		 * avoid deadlock.
		 */
		if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
			break;

		for (minordata = tdip->devi_minor; minordata != NULL;
		    minordata = minordata->next) {
			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
			    strlen(DDI_NT_INTRCTL)) == 0) {
				/* Found the intrctl minor; record its path. */
				pdip = minordata->dip;
				(void) ddi_pathname(pdip, pathname);
				(void) snprintf(dln->nexus_path, MAXPATHLEN,
				    "/devices%s:intr", pathname);
				(void) ddi_pathname_minor(minordata, pathname);
				ndi_devi_exit((dev_info_t *)tdip, circ);
				return (pdip);
			}
		}
		ndi_devi_exit((dev_info_t *)tdip, circ);
		tdip = tdip->devi_parent;
	}
	return (NULL);
}

/*
 * For a primary MAC client, if the user has set a list or CPUs or
 * we have obtained it implicitly, we try to retarget the interrupt
 * for that device on one of the CPUs in the list.
 * We assign the interrupt to the same CPU as the poll thread.
 *
 * Returns B_TRUE if the interrupt is (now) bound to 'cpuid'.
 */
static boolean_t
mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
{
	ldi_handle_t		lh = NULL;
	ldi_ident_t		li = NULL;
	int			err;
	int			ret;
	mac_dladm_intr_t	dln;
	dev_info_t		*dip;
	struct ddi_minor_data	*minordata;

	dln.nexus_path[0] = '\0';
	dln.driver_path[0] = '\0';

	/* Locate the device's first DDM_MINOR node to name it. */
	minordata = ((struct dev_info *)mdip)->devi_minor;
	while (minordata != NULL) {
		if (minordata->type == DDM_MINOR)
			break;
		minordata = minordata->next;
	}
	if (minordata == NULL)
		return (B_FALSE);

	(void) ddi_pathname_minor(minordata, dln.driver_path);

	dip = mac_get_nexus_node(mdip, &dln);
	/* defensive */
	if (dip == NULL)
		return (B_FALSE);

	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
	if (err != 0)
		return (B_FALSE);

	/* Open the nexus intr node so we can issue PCITOOL ioctls. */
	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
	if (err != 0)
		return (B_FALSE);

	ret = mac_validate_intr(lh, &dln, cpuid);
	if (ret < 0) {
		(void) ldi_close(lh, FREAD|FWRITE, kcred);
		return (B_FALSE);
	}
	/* cmn_note? */
	/* ret != 0: interrupt found but bound elsewhere -- re-target it. */
	if (ret != 0)
		if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
		    != 0) {
			(void) ldi_close(lh, FREAD|FWRITE, kcred);
			return (B_FALSE);
		}
	(void) ldi_close(lh, FREAD|FWRITE, kcred);
	return (B_TRUE);
}

/*
 * Record the Rx interrupt CPU for the client and propagate it to the
 * SRS poll state. If the hardware binding fails, -1 is recorded.
 */
void
mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
{
	dev_info_t		*mdip = (dev_info_t *)arg;
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_resource_props_t	*mrp;
	mac_perim_handle_t	mph;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_soft_ring_set_t	*rx_srs;
	mac_cpus_t		*srs_cpu;

	if (!mac_check_interrupt_binding(mdip, cpuid))
		cpuid = -1;
	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
	mrp = MCIP_RESOURCE_PROPS(mcip);
	mrp->mrp_rx_intr_cpu = cpuid;
	if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
		rx_srs = flent->fe_rx_srs[1];
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_rx_intr_cpu = cpuid;
	}
	mac_perim_exit(mph);
}

/*
 * Return the poll-thread CPU the client's Rx interrupt should be
 * re-targeted to, or -1 when no re-targeting is needed/possible.
 */
int32_t
mac_client_intr_cpu(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_cpus_t		*srs_cpu;
	mac_soft_ring_set_t	*rx_srs;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
	mac_ring_t		*ring;
	mac_intr_t		*mintr;

	/*
	 * Check if we need to retarget the interrupt. We do this only
	 * for the primary MAC client. We do this if we have the only
	 * exclusive ring in the group.
	 */
	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
		rx_srs = flent->fe_rx_srs[1];
		srs_cpu = &rx_srs->srs_cpu;
		ring = rx_srs->srs_ring;
		mintr = &ring->mr_info.mri_intr;
		/*
		 * If ddi_handle is present or the poll CPU is
		 * already bound to the interrupt CPU, return -1.
		 */
		if (mintr->mi_ddi_handle != NULL ||
		    ((mrp->mrp_ncpus != 0) &&
		    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
			return (-1);
		}
		return (srs_cpu->mc_rx_pollid);
	}
	return (-1);
}

/* Return the dev_info node backing this MAC handle. */
void *
mac_get_devinfo(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	return ((void *)mip->mi_dip);
}

/* XOR-fold helpers used to mix header bytes into the packet hash. */
#define	PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
#define	PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
#define	PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])

/*
 * Compute a hash over the L2/L3/L4 headers of 'mp' according to
 * 'policy' (a mask of MAC_PKT_HASH_* bits). Used to fan packets out
 * across rings/threads. Returns 0 for non-Ethernet media or whenever
 * the requested headers cannot be safely read from the message.
 */
uint64_t
mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
{
	struct ether_header *ehp;
	uint64_t hash = 0;
	uint16_t sap;
	uint_t	skip_len;
	uint8_t	proto;
	boolean_t ip_fragmented;

	/*
	 * We may want to have one of these per MAC type plugin in the
	 * future. For now supports only ethernet.
	 */
	if (media != DL_ETHER)
		return (0L);

	/* for now we support only outbound packets */
	ASSERT(is_outbound);
	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));

	/* compute L2 hash */

	ehp = (struct ether_header *)mp->b_rptr;

	if ((policy & MAC_PKT_HASH_L2) != 0) {
		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
		policy &= ~MAC_PKT_HASH_L2;
	}

	if (policy == 0)
		goto done;

	/* skip ethernet header */

	sap = ntohs(ehp->ether_type);
	if (sap == ETHERTYPE_VLAN) {
		struct ether_vlan_header *evhp;
		mblk_t *newmp = NULL;

		skip_len = sizeof (struct ether_vlan_header);
		if (MBLKL(mp) < skip_len) {
			/* the vlan tag is the payload, pull up first */
			newmp = msgpullup(mp, -1);
			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
				goto done;
			}
			evhp = (struct ether_vlan_header *)newmp->b_rptr;
		} else {
			evhp = (struct ether_vlan_header *)mp->b_rptr;
		}

		/* Inner ethertype; the pulled-up copy is no longer needed. */
		sap = ntohs(evhp->ether_type);
		freemsg(newmp);
	} else {
		skip_len = sizeof (struct ether_header);
	}

	/* if ethernet header is in its own mblk, skip it */
	if (MBLKL(mp) <= skip_len) {
		skip_len -= MBLKL(mp);
		mp = mp->b_cont;
		if (mp == NULL)
			goto done;
	}

	/* Values below ETHERTYPE_802_MIN are 802.3 lengths, not SAPs. */
	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;

	/* compute IP src/dst addresses hash and skip IPv{4,6} header */

	switch (sap) {
	case ETHERTYPE_IP: {
		ipha_t *iphp;

		/*
		 * If the header is not aligned or the header doesn't fit
		 * in the mblk, bail now. Note that this may cause packets
		 * reordering.
		 */
		iphp = (ipha_t *)(mp->b_rptr + skip_len);
		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
		    !OK_32PTR((char *)iphp))
			goto done;

		proto = iphp->ipha_protocol;
		skip_len += IPH_HDR_LENGTH(iphp);

		/* Check if the packet is fragmented. */
		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
		    IPH_OFFSET;

		/*
		 * For fragmented packets, use addresses in addition to
		 * the frag_id to generate the hash in order to get
		 * better distribution.
		 */
		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);

			hash ^= (PKT_HASH_4BYTES(ip_src) ^
			    PKT_HASH_4BYTES(ip_dst));
			policy &= ~MAC_PKT_HASH_L3;
		}

		if (ip_fragmented) {
			/* Non-first fragments carry no ULP header. */
			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
			hash ^= PKT_HASH_2BYTES(identp);
			goto done;
		}
		break;
	}
	case ETHERTYPE_IPV6: {
		ip6_t *ip6hp;
		ip6_frag_t *frag = NULL;
		uint16_t hdr_length;

		/*
		 * If the header is not aligned or the header doesn't fit
		 * in the mblk, bail now. Note that this may cause packets
		 * reordering.
		 */

		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
		    !OK_32PTR((char *)ip6hp))
			goto done;

		if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
		    &proto, &frag))
			goto done;
		skip_len += hdr_length;

		/*
		 * For fragmented packets, use addresses in addition to
		 * the frag_id to generate the hash in order to get
		 * better distribution.
		 */
		if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
			/* Hash the low 4 bytes of each IPv6 address. */
			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);

			hash ^= (PKT_HASH_4BYTES(ip_src) ^
			    PKT_HASH_4BYTES(ip_dst));
			policy &= ~MAC_PKT_HASH_L3;
		}

		if (frag != NULL) {
			uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
			hash ^= PKT_HASH_4BYTES(identp);
			goto done;
		}
		break;
	}
	default:
		goto done;
	}

	if (policy == 0)
		goto done;

	/* if ip header is in its own mblk, skip it */
	if (MBLKL(mp) <= skip_len) {
		skip_len -= MBLKL(mp);
		mp = mp->b_cont;
		if (mp == NULL)
			goto done;
	}

	/* parse ULP header */
again:
	switch (proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_ESP:
	case IPPROTO_SCTP:
		/*
		 * These Internet Protocols are intentionally designed
		 * for hashing from the git-go. Port numbers are in the first
		 * word for transports, SPI is first for ESP.
		 */
		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
			goto done;
		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
		break;

	case IPPROTO_AH: {
		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
		/*
		 * NOTE(review): AH_TOTAL_LEN dereferences 'ah' before
		 * the bounds check below -- looks like it relies on the
		 * earlier header validation; confirm.
		 */
		uint_t ah_length = AH_TOTAL_LEN(ah);

		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
			goto done;

		/* AH wraps the real ULP; skip it and re-dispatch. */
		proto = ah->ah_nexthdr;
		skip_len += ah_length;

		/* if AH header is in its own mblk, skip it */
		if (MBLKL(mp) <= skip_len) {
			skip_len -= MBLKL(mp);
			mp = mp->b_cont;
			if (mp == NULL)
				goto done;
		}

		goto again;
	}
	}

done:
	return (hash);
}