1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * MAC Services Module - misc utilities 27 */ 28 29 #include <sys/types.h> 30 #include <sys/mac.h> 31 #include <sys/mac_impl.h> 32 #include <sys/mac_client_priv.h> 33 #include <sys/mac_client_impl.h> 34 #include <sys/mac_soft_ring.h> 35 #include <sys/strsubr.h> 36 #include <sys/strsun.h> 37 #include <sys/vlan.h> 38 #include <sys/pattr.h> 39 #include <sys/pci_tools.h> 40 #include <inet/ip.h> 41 #include <inet/ip_impl.h> 42 #include <inet/ip6.h> 43 #include <sys/vtrace.h> 44 #include <sys/dlpi.h> 45 #include <sys/sunndi.h> 46 #include <inet/ipsec_impl.h> 47 #include <inet/sadb.h> 48 #include <inet/ipsecesp.h> 49 #include <inet/ipsecah.h> 50 51 /* 52 * Copy an mblk, preserving its hardware checksum flags. 53 */ 54 static mblk_t * 55 mac_copymsg_cksum(mblk_t *mp) 56 { 57 mblk_t *mp1; 58 uint32_t start, stuff, end, value, flags; 59 60 mp1 = copymsg(mp); 61 if (mp1 == NULL) 62 return (NULL); 63 64 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); 65 (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value, 66 flags, KM_NOSLEEP); 67 68 return (mp1); 69 } 70 71 /* 72 * Copy an mblk chain, presenting the hardware checksum flags of the 73 * individual mblks. 74 */ 75 mblk_t * 76 mac_copymsgchain_cksum(mblk_t *mp) 77 { 78 mblk_t *nmp = NULL; 79 mblk_t **nmpp = &nmp; 80 81 for (; mp != NULL; mp = mp->b_next) { 82 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) { 83 freemsgchain(nmp); 84 return (NULL); 85 } 86 87 nmpp = &((*nmpp)->b_next); 88 } 89 90 return (nmp); 91 } 92 93 /* 94 * Process the specified mblk chain for proper handling of hardware 95 * checksum offload. This routine is invoked for loopback traffic 96 * between MAC clients. 97 * The function handles a NULL mblk chain passed as argument. 98 */ 99 mblk_t * 100 mac_fix_cksum(mblk_t *mp_chain) 101 { 102 mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; 103 uint32_t flags, start, stuff, end, value; 104 105 for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { 106 uint16_t len; 107 uint32_t offset; 108 struct ether_header *ehp; 109 uint16_t sap; 110 111 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, 112 &flags); 113 if (flags == 0) 114 continue; 115 116 /* 117 * Since the processing of checksum offload for loopback 118 * traffic requires modification of the packet contents, 119 * ensure sure that we are always modifying our own copy. 120 */ 121 if (DB_REF(mp) > 1) { 122 mp1 = copymsg(mp); 123 if (mp1 == NULL) 124 continue; 125 mp1->b_next = mp->b_next; 126 mp->b_next = NULL; 127 freemsg(mp); 128 if (prev != NULL) 129 prev->b_next = mp1; 130 else 131 new_chain = mp1; 132 mp = mp1; 133 } 134 135 /* 136 * Ethernet, and optionally VLAN header. 137 */ 138 /* LINTED: improper alignment cast */ 139 ehp = (struct ether_header *)mp->b_rptr; 140 if (ntohs(ehp->ether_type) == VLAN_TPID) { 141 struct ether_vlan_header *evhp; 142 143 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); 144 /* LINTED: improper alignment cast */ 145 evhp = (struct ether_vlan_header *)mp->b_rptr; 146 sap = ntohs(evhp->ether_type); 147 offset = sizeof (struct ether_vlan_header); 148 } else { 149 sap = ntohs(ehp->ether_type); 150 offset = sizeof (struct ether_header); 151 } 152 153 if (MBLKL(mp) <= offset) { 154 offset -= MBLKL(mp); 155 if (mp->b_cont == NULL) { 156 /* corrupted packet, skip it */ 157 if (prev != NULL) 158 prev->b_next = mp->b_next; 159 else 160 new_chain = mp->b_next; 161 mp1 = mp->b_next; 162 mp->b_next = NULL; 163 freemsg(mp); 164 mp = mp1; 165 continue; 166 } 167 mp = mp->b_cont; 168 } 169 170 if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { 171 ipha_t *ipha = NULL; 172 173 /* 174 * In order to compute the full and header 175 * checksums, we need to find and parse 176 * the IP and/or ULP headers. 177 */ 178 179 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; 180 181 /* 182 * IP header. 183 */ 184 if (sap != ETHERTYPE_IP) 185 continue; 186 187 ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); 188 /* LINTED: improper alignment cast */ 189 ipha = (ipha_t *)(mp->b_rptr + offset); 190 191 if (flags & HCK_FULLCKSUM) { 192 ipaddr_t src, dst; 193 uint32_t cksum; 194 uint16_t *up; 195 uint8_t proto; 196 197 /* 198 * Pointer to checksum field in ULP header. 199 */ 200 proto = ipha->ipha_protocol; 201 ASSERT(ipha->ipha_version_and_hdr_length == 202 IP_SIMPLE_HDR_VERSION); 203 204 switch (proto) { 205 case IPPROTO_TCP: 206 /* LINTED: improper alignment cast */ 207 up = IPH_TCPH_CHECKSUMP(ipha, 208 IP_SIMPLE_HDR_LENGTH); 209 break; 210 211 case IPPROTO_UDP: 212 /* LINTED: improper alignment cast */ 213 up = IPH_UDPH_CHECKSUMP(ipha, 214 IP_SIMPLE_HDR_LENGTH); 215 break; 216 217 default: 218 cmn_err(CE_WARN, "mac_fix_cksum: " 219 "unexpected protocol: %d", proto); 220 continue; 221 } 222 223 /* 224 * Pseudo-header checksum. 225 */ 226 src = ipha->ipha_src; 227 dst = ipha->ipha_dst; 228 len = ntohs(ipha->ipha_length) - 229 IP_SIMPLE_HDR_LENGTH; 230 231 cksum = (dst >> 16) + (dst & 0xFFFF) + 232 (src >> 16) + (src & 0xFFFF); 233 cksum += htons(len); 234 235 /* 236 * The checksum value stored in the packet needs 237 * to be correct. Compute it here. 238 */ 239 *up = 0; 240 cksum += (((proto) == IPPROTO_UDP) ? 241 IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); 242 cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + 243 offset, cksum); 244 *(up) = (uint16_t)(cksum ? cksum : ~cksum); 245 246 /* 247 * Flag the packet so that it appears 248 * that the checksum has already been 249 * verified by the hardware. 250 */ 251 flags &= ~HCK_FULLCKSUM; 252 flags |= HCK_FULLCKSUM_OK; 253 value = 0; 254 } 255 256 if (flags & HCK_IPV4_HDRCKSUM) { 257 ASSERT(ipha != NULL); 258 ipha->ipha_hdr_checksum = 259 (uint16_t)ip_csum_hdr(ipha); 260 flags &= ~HCK_IPV4_HDRCKSUM; 261 flags |= HCK_IPV4_HDRCKSUM_OK; 262 263 } 264 } 265 266 if (flags & HCK_PARTIALCKSUM) { 267 uint16_t *up, partial, cksum; 268 uchar_t *ipp; /* ptr to beginning of IP header */ 269 270 if (mp->b_cont != NULL) { 271 mblk_t *mp1; 272 273 mp1 = msgpullup(mp, offset + end); 274 if (mp1 == NULL) 275 continue; 276 mp1->b_next = mp->b_next; 277 mp->b_next = NULL; 278 freemsg(mp); 279 if (prev != NULL) 280 prev->b_next = mp1; 281 else 282 new_chain = mp1; 283 mp = mp1; 284 } 285 286 ipp = mp->b_rptr + offset; 287 /* LINTED: cast may result in improper alignment */ 288 up = (uint16_t *)((uchar_t *)ipp + stuff); 289 partial = *up; 290 *up = 0; 291 292 cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, 293 end - start, partial); 294 cksum = ~cksum; 295 *up = cksum ? cksum : ~cksum; 296 297 /* 298 * Since we already computed the whole checksum, 299 * indicate to the stack that it has already 300 * been verified by the hardware. 301 */ 302 flags &= ~HCK_PARTIALCKSUM; 303 flags |= HCK_FULLCKSUM_OK; 304 value = 0; 305 } 306 307 (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, 308 value, flags, KM_NOSLEEP); 309 } 310 311 return (new_chain); 312 } 313 314 /* 315 * Add VLAN tag to the specified mblk. 316 */ 317 mblk_t * 318 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) 319 { 320 mblk_t *hmp; 321 struct ether_vlan_header *evhp; 322 struct ether_header *ehp; 323 uint32_t start, stuff, end, value, flags; 324 325 ASSERT(pri != 0 || vid != 0); 326 327 /* 328 * Allocate an mblk for the new tagged ethernet header, 329 * and copy the MAC addresses and ethertype from the 330 * original header. 331 */ 332 333 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); 334 if (hmp == NULL) { 335 freemsg(mp); 336 return (NULL); 337 } 338 339 evhp = (struct ether_vlan_header *)hmp->b_rptr; 340 ehp = (struct ether_header *)mp->b_rptr; 341 342 bcopy(ehp, evhp, (ETHERADDRL * 2)); 343 evhp->ether_type = ehp->ether_type; 344 evhp->ether_tpid = htons(ETHERTYPE_VLAN); 345 346 hmp->b_wptr += sizeof (struct ether_vlan_header); 347 mp->b_rptr += sizeof (struct ether_header); 348 349 /* 350 * Free the original message if it's now empty. Link the 351 * rest of messages to the header message. 352 */ 353 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); 354 (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags, 355 KM_NOSLEEP); 356 if (MBLKL(mp) == 0) { 357 hmp->b_cont = mp->b_cont; 358 freeb(mp); 359 } else { 360 hmp->b_cont = mp; 361 } 362 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); 363 364 /* 365 * Initialize the new TCI (Tag Control Information). 366 */ 367 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); 368 369 return (hmp); 370 } 371 372 /* 373 * Adds a VLAN tag with the specified VID and priority to each mblk of 374 * the specified chain. 375 */ 376 mblk_t * 377 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) 378 { 379 mblk_t *next_mp, **prev, *mp; 380 381 mp = mp_chain; 382 prev = &mp_chain; 383 384 while (mp != NULL) { 385 next_mp = mp->b_next; 386 mp->b_next = NULL; 387 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { 388 freemsgchain(next_mp); 389 break; 390 } 391 *prev = mp; 392 prev = &mp->b_next; 393 mp = mp->b_next = next_mp; 394 } 395 396 return (mp_chain); 397 } 398 399 /* 400 * Strip VLAN tag 401 */ 402 mblk_t * 403 mac_strip_vlan_tag(mblk_t *mp) 404 { 405 mblk_t *newmp; 406 struct ether_vlan_header *evhp; 407 408 evhp = (struct ether_vlan_header *)mp->b_rptr; 409 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { 410 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); 411 412 if (DB_REF(mp) > 1) { 413 newmp = copymsg(mp); 414 if (newmp == NULL) 415 return (NULL); 416 freemsg(mp); 417 mp = newmp; 418 } 419 420 evhp = (struct ether_vlan_header *)mp->b_rptr; 421 422 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); 423 mp->b_rptr += VLAN_TAGSZ; 424 } 425 return (mp); 426 } 427 428 /* 429 * Strip VLAN tag from each mblk of the chain. 430 */ 431 mblk_t * 432 mac_strip_vlan_tag_chain(mblk_t *mp_chain) 433 { 434 mblk_t *mp, *next_mp, **prev; 435 436 mp = mp_chain; 437 prev = &mp_chain; 438 439 while (mp != NULL) { 440 next_mp = mp->b_next; 441 mp->b_next = NULL; 442 if ((mp = mac_strip_vlan_tag(mp)) == NULL) { 443 freemsgchain(next_mp); 444 break; 445 } 446 *prev = mp; 447 prev = &mp->b_next; 448 mp = mp->b_next = next_mp; 449 } 450 451 return (mp_chain); 452 } 453 454 /* 455 * Default callback function. Used when the datapath is not yet initialized. 456 */ 457 /* ARGSUSED */ 458 void 459 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, 460 boolean_t loopback) 461 { 462 mblk_t *mp1 = mp; 463 464 while (mp1 != NULL) { 465 mp1->b_prev = NULL; 466 mp1->b_queue = NULL; 467 mp1 = mp1->b_next; 468 } 469 freemsgchain(mp); 470 } 471 472 /* 473 * Determines the IPv6 header length accounting for all the optional IPv6 474 * headers (hop-by-hop, destination, routing and fragment). The header length 475 * and next header value (a transport header) is captured. 476 * 477 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise 478 * returns B_TRUE. 479 */ 480 boolean_t 481 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length, 482 uint8_t *next_hdr, ip6_frag_t **fragp) 483 { 484 uint16_t length; 485 uint_t ehdrlen; 486 uint8_t *whereptr; 487 uint8_t *nexthdrp; 488 ip6_dest_t *desthdr; 489 ip6_rthdr_t *rthdr; 490 ip6_frag_t *fraghdr; 491 492 if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) 493 return (B_FALSE); 494 ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); 495 length = IPV6_HDR_LEN; 496 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ 497 498 if (fragp != NULL) 499 *fragp = NULL; 500 501 nexthdrp = &ip6h->ip6_nxt; 502 while (whereptr < endptr) { 503 /* Is there enough left for len + nexthdr? */ 504 if (whereptr + MIN_EHDR_LEN > endptr) 505 break; 506 507 switch (*nexthdrp) { 508 case IPPROTO_HOPOPTS: 509 case IPPROTO_DSTOPTS: 510 /* Assumes the headers are identical for hbh and dst */ 511 desthdr = (ip6_dest_t *)whereptr; 512 ehdrlen = 8 * (desthdr->ip6d_len + 1); 513 if ((uchar_t *)desthdr + ehdrlen > endptr) 514 return (B_FALSE); 515 nexthdrp = &desthdr->ip6d_nxt; 516 break; 517 case IPPROTO_ROUTING: 518 rthdr = (ip6_rthdr_t *)whereptr; 519 ehdrlen = 8 * (rthdr->ip6r_len + 1); 520 if ((uchar_t *)rthdr + ehdrlen > endptr) 521 return (B_FALSE); 522 nexthdrp = &rthdr->ip6r_nxt; 523 break; 524 case IPPROTO_FRAGMENT: 525 fraghdr = (ip6_frag_t *)whereptr; 526 ehdrlen = sizeof (ip6_frag_t); 527 if ((uchar_t *)&fraghdr[1] > endptr) 528 return (B_FALSE); 529 nexthdrp = &fraghdr->ip6f_nxt; 530 if (fragp != NULL) 531 *fragp = fraghdr; 532 break; 533 case IPPROTO_NONE: 534 /* No next header means we're finished */ 535 default: 536 *hdr_length = length; 537 *next_hdr = *nexthdrp; 538 return (B_TRUE); 539 } 540 length += ehdrlen; 541 whereptr += ehdrlen; 542 *hdr_length = length; 543 *next_hdr = *nexthdrp; 544 } 545 switch (*nexthdrp) { 546 case IPPROTO_HOPOPTS: 547 case IPPROTO_DSTOPTS: 548 case IPPROTO_ROUTING: 549 case IPPROTO_FRAGMENT: 550 /* 551 * If any know extension headers are still to be processed, 552 * the packet's malformed (or at least all the IP header(s) are 553 * not in the same mblk - and that should never happen. 554 */ 555 return (B_FALSE); 556 557 default: 558 /* 559 * If we get here, we know that all of the IP headers were in 560 * the same mblk, even if the ULP header is in the next mblk. 561 */ 562 *hdr_length = length; 563 *next_hdr = *nexthdrp; 564 return (B_TRUE); 565 } 566 } 567 568 /* 569 * The following set of routines are there to take care of interrupt 570 * re-targeting for legacy (fixed) interrupts. Some older versions 571 * of the popular NICs like e1000g do not support MSI-X interrupts 572 * and they reserve fixed interrupts for RX/TX rings. To re-target 573 * these interrupts, PCITOOL ioctls need to be used. 574 */ 575 typedef struct mac_dladm_intr { 576 int ino; 577 int cpu_id; 578 char driver_path[MAXPATHLEN]; 579 char nexus_path[MAXPATHLEN]; 580 } mac_dladm_intr_t; 581 582 /* Bind the interrupt to cpu_num */ 583 static int 584 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino) 585 { 586 pcitool_intr_set_t iset; 587 int err; 588 589 iset.old_cpu = oldcpuid; 590 iset.ino = ino; 591 iset.cpu_id = cpu_num; 592 iset.user_version = PCITOOL_VERSION; 593 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL, 594 kcred, NULL); 595 596 return (err); 597 } 598 599 /* 600 * Search interrupt information. iget is filled in with the info to search 601 */ 602 static boolean_t 603 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) 604 { 605 int i; 606 char driver_path[2 * MAXPATHLEN]; 607 608 for (i = 0; i < iget_p->num_devs; i++) { 609 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); 610 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, 611 ":%s%d", iget_p->dev[i].driver_name, 612 iget_p->dev[i].dev_inst); 613 /* Match the device path for the device path */ 614 if (strcmp(driver_path, dln->driver_path) == 0) { 615 dln->ino = iget_p->ino; 616 dln->cpu_id = iget_p->cpu_id; 617 return (B_TRUE); 618 } 619 } 620 return (B_FALSE); 621 } 622 623 /* 624 * Get information about ino, i.e. if this is the interrupt for our 625 * device and where it is bound etc. 626 */ 627 static boolean_t 628 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino, 629 mac_dladm_intr_t *dln) 630 { 631 pcitool_intr_get_t *iget_p; 632 int ipsz; 633 int nipsz; 634 int err; 635 uint8_t inum; 636 637 /* 638 * Check if SLEEP is OK, i.e if could come here in response to 639 * changing the fanout due to some callback from the driver, say 640 * link speed changes. 641 */ 642 ipsz = PCITOOL_IGET_SIZE(0); 643 iget_p = kmem_zalloc(ipsz, KM_SLEEP); 644 645 iget_p->num_devs_ret = 0; 646 iget_p->user_version = PCITOOL_VERSION; 647 iget_p->cpu_id = oldcpuid; 648 iget_p->ino = ino; 649 650 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, 651 FKIOCTL, kcred, NULL); 652 if (err != 0) { 653 kmem_free(iget_p, ipsz); 654 return (B_FALSE); 655 } 656 if (iget_p->num_devs == 0) { 657 kmem_free(iget_p, ipsz); 658 return (B_FALSE); 659 } 660 inum = iget_p->num_devs; 661 if (iget_p->num_devs_ret < iget_p->num_devs) { 662 /* Reallocate */ 663 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs); 664 665 kmem_free(iget_p, ipsz); 666 ipsz = nipsz; 667 iget_p = kmem_zalloc(ipsz, KM_SLEEP); 668 669 iget_p->num_devs_ret = inum; 670 iget_p->cpu_id = oldcpuid; 671 iget_p->ino = ino; 672 iget_p->user_version = PCITOOL_VERSION; 673 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, 674 FKIOCTL, kcred, NULL); 675 if (err != 0) { 676 kmem_free(iget_p, ipsz); 677 return (B_FALSE); 678 } 679 /* defensive */ 680 if (iget_p->num_devs != iget_p->num_devs_ret) { 681 kmem_free(iget_p, ipsz); 682 return (B_FALSE); 683 } 684 } 685 686 if (mac_search_intrinfo(iget_p, dln)) { 687 kmem_free(iget_p, ipsz); 688 return (B_TRUE); 689 } 690 kmem_free(iget_p, ipsz); 691 return (B_FALSE); 692 } 693 694 /* 695 * Get the interrupts and check each one to see if it is for our device. 696 */ 697 static int 698 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) 699 { 700 pcitool_intr_info_t intr_info; 701 int err; 702 int ino; 703 int oldcpuid; 704 705 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, 706 FKIOCTL, kcred, NULL); 707 if (err != 0) 708 return (-1); 709 710 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) { 711 for (ino = 0; ino < intr_info.num_intr; ino++) { 712 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) { 713 if (dln->cpu_id == cpuid) 714 return (0); 715 return (1); 716 } 717 } 718 } 719 return (-1); 720 } 721 722 /* 723 * Obtain the nexus parent node info. for mdip. 724 */ 725 static dev_info_t * 726 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln) 727 { 728 struct dev_info *tdip = (struct dev_info *)mdip; 729 struct ddi_minor_data *minordata; 730 int circ; 731 dev_info_t *pdip; 732 char pathname[MAXPATHLEN]; 733 734 while (tdip != NULL) { 735 /* 736 * The netboot code could call this function while walking the 737 * device tree so we need to use ndi_devi_tryenter() here to 738 * avoid deadlock. 739 */ 740 if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0) 741 break; 742 743 for (minordata = tdip->devi_minor; minordata != NULL; 744 minordata = minordata->next) { 745 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL, 746 strlen(DDI_NT_INTRCTL)) == 0) { 747 pdip = minordata->dip; 748 (void) ddi_pathname(pdip, pathname); 749 (void) snprintf(dln->nexus_path, MAXPATHLEN, 750 "/devices%s:intr", pathname); 751 (void) ddi_pathname_minor(minordata, pathname); 752 ndi_devi_exit((dev_info_t *)tdip, circ); 753 return (pdip); 754 } 755 } 756 ndi_devi_exit((dev_info_t *)tdip, circ); 757 tdip = tdip->devi_parent; 758 } 759 return (NULL); 760 } 761 762 /* 763 * For a primary MAC client, if the user has set a list or CPUs or 764 * we have obtained it implicitly, we try to retarget the interrupt 765 * for that device on one of the CPUs in the list. 766 * We assign the interrupt to the same CPU as the poll thread. 767 */ 768 static boolean_t 769 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) 770 { 771 ldi_handle_t lh = NULL; 772 ldi_ident_t li = NULL; 773 int err; 774 int ret; 775 mac_dladm_intr_t dln; 776 dev_info_t *dip; 777 struct ddi_minor_data *minordata; 778 779 dln.nexus_path[0] = '\0'; 780 dln.driver_path[0] = '\0'; 781 782 minordata = ((struct dev_info *)mdip)->devi_minor; 783 while (minordata != NULL) { 784 if (minordata->type == DDM_MINOR) 785 break; 786 minordata = minordata->next; 787 } 788 if (minordata == NULL) 789 return (B_FALSE); 790 791 (void) ddi_pathname_minor(minordata, dln.driver_path); 792 793 dip = mac_get_nexus_node(mdip, &dln); 794 /* defensive */ 795 if (dip == NULL) 796 return (B_FALSE); 797 798 err = ldi_ident_from_major(ddi_driver_major(dip), &li); 799 if (err != 0) 800 return (B_FALSE); 801 802 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); 803 if (err != 0) 804 return (B_FALSE); 805 806 ret = mac_validate_intr(lh, &dln, cpuid); 807 if (ret < 0) { 808 (void) ldi_close(lh, FREAD|FWRITE, kcred); 809 return (B_FALSE); 810 } 811 /* cmn_note? */ 812 if (ret != 0) 813 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino))) 814 != 0) { 815 (void) ldi_close(lh, FREAD|FWRITE, kcred); 816 return (B_FALSE); 817 } 818 (void) ldi_close(lh, FREAD|FWRITE, kcred); 819 return (B_TRUE); 820 } 821 822 void 823 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) 824 { 825 dev_info_t *mdip = (dev_info_t *)arg; 826 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 827 mac_resource_props_t *mrp; 828 mac_perim_handle_t mph; 829 flow_entry_t *flent = mcip->mci_flent; 830 mac_soft_ring_set_t *rx_srs; 831 mac_cpus_t *srs_cpu; 832 833 if (!mac_check_interrupt_binding(mdip, cpuid)) 834 cpuid = -1; 835 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); 836 mrp = MCIP_RESOURCE_PROPS(mcip); 837 mrp->mrp_rx_intr_cpu = cpuid; 838 if (flent != NULL && flent->fe_rx_srs_cnt == 2) { 839 rx_srs = flent->fe_rx_srs[1]; 840 srs_cpu = &rx_srs->srs_cpu; 841 srs_cpu->mc_rx_intr_cpu = cpuid; 842 } 843 mac_perim_exit(mph); 844 } 845 846 int32_t 847 mac_client_intr_cpu(mac_client_handle_t mch) 848 { 849 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 850 mac_cpus_t *srs_cpu; 851 mac_soft_ring_set_t *rx_srs; 852 flow_entry_t *flent = mcip->mci_flent; 853 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); 854 mac_ring_t *ring; 855 mac_intr_t *mintr; 856 857 /* 858 * Check if we need to retarget the interrupt. We do this only 859 * for the primary MAC client. We do this if we have the only 860 * exclusive ring in the group. 861 */ 862 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { 863 rx_srs = flent->fe_rx_srs[1]; 864 srs_cpu = &rx_srs->srs_cpu; 865 ring = rx_srs->srs_ring; 866 mintr = &ring->mr_info.mri_intr; 867 /* 868 * If ddi_handle is present or the poll CPU is 869 * already bound to the interrupt CPU, return -1. 870 */ 871 if (mintr->mi_ddi_handle != NULL || 872 ((mrp->mrp_ncpus != 0) && 873 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) { 874 return (-1); 875 } 876 return (srs_cpu->mc_rx_pollid); 877 } 878 return (-1); 879 } 880 881 void * 882 mac_get_devinfo(mac_handle_t mh) 883 { 884 mac_impl_t *mip = (mac_impl_t *)mh; 885 886 return ((void *)mip->mi_dip); 887 } 888 889 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1]) 890 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) 891 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) 892 893 uint64_t 894 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound) 895 { 896 struct ether_header *ehp; 897 uint64_t hash = 0; 898 uint16_t sap; 899 uint_t skip_len; 900 uint8_t proto; 901 boolean_t ip_fragmented; 902 903 /* 904 * We may want to have one of these per MAC type plugin in the 905 * future. For now supports only ethernet. 906 */ 907 if (media != DL_ETHER) 908 return (0L); 909 910 /* for now we support only outbound packets */ 911 ASSERT(is_outbound); 912 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); 913 ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); 914 915 /* compute L2 hash */ 916 917 ehp = (struct ether_header *)mp->b_rptr; 918 919 if ((policy & MAC_PKT_HASH_L2) != 0) { 920 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; 921 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; 922 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); 923 policy &= ~MAC_PKT_HASH_L2; 924 } 925 926 if (policy == 0) 927 goto done; 928 929 /* skip ethernet header */ 930 931 sap = ntohs(ehp->ether_type); 932 if (sap == ETHERTYPE_VLAN) { 933 struct ether_vlan_header *evhp; 934 mblk_t *newmp = NULL; 935 936 skip_len = sizeof (struct ether_vlan_header); 937 if (MBLKL(mp) < skip_len) { 938 /* the vlan tag is the payload, pull up first */ 939 newmp = msgpullup(mp, -1); 940 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) { 941 goto done; 942 } 943 evhp = (struct ether_vlan_header *)newmp->b_rptr; 944 } else { 945 evhp = (struct ether_vlan_header *)mp->b_rptr; 946 } 947 948 sap = ntohs(evhp->ether_type); 949 freemsg(newmp); 950 } else { 951 skip_len = sizeof (struct ether_header); 952 } 953 954 /* if ethernet header is in its own mblk, skip it */ 955 if (MBLKL(mp) <= skip_len) { 956 skip_len -= MBLKL(mp); 957 mp = mp->b_cont; 958 if (mp == NULL) 959 goto done; 960 } 961 962 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; 963 964 /* compute IP src/dst addresses hash and skip IPv{4,6} header */ 965 966 switch (sap) { 967 case ETHERTYPE_IP: { 968 ipha_t *iphp; 969 970 /* 971 * If the header is not aligned or the header doesn't fit 972 * in the mblk, bail now. Note that this may cause packets 973 * reordering. 974 */ 975 iphp = (ipha_t *)(mp->b_rptr + skip_len); 976 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || 977 !OK_32PTR((char *)iphp)) 978 goto done; 979 980 proto = iphp->ipha_protocol; 981 skip_len += IPH_HDR_LENGTH(iphp); 982 983 /* Check if the packet is fragmented. */ 984 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) & 985 IPH_OFFSET; 986 987 /* 988 * For fragmented packets, use addresses in addition to 989 * the frag_id to generate the hash inorder to get 990 * better distribution. 991 */ 992 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) { 993 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); 994 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); 995 996 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 997 PKT_HASH_4BYTES(ip_dst)); 998 policy &= ~MAC_PKT_HASH_L3; 999 } 1000 1001 if (ip_fragmented) { 1002 uint8_t *identp = (uint8_t *)&iphp->ipha_ident; 1003 hash ^= PKT_HASH_2BYTES(identp); 1004 goto done; 1005 } 1006 break; 1007 } 1008 case ETHERTYPE_IPV6: { 1009 ip6_t *ip6hp; 1010 ip6_frag_t *frag = NULL; 1011 uint16_t hdr_length; 1012 1013 /* 1014 * If the header is not aligned or the header doesn't fit 1015 * in the mblk, bail now. Note that this may cause packets 1016 * reordering. 1017 */ 1018 1019 ip6hp = (ip6_t *)(mp->b_rptr + skip_len); 1020 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || 1021 !OK_32PTR((char *)ip6hp)) 1022 goto done; 1023 1024 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length, 1025 &proto, &frag)) 1026 goto done; 1027 skip_len += hdr_length; 1028 1029 /* 1030 * For fragmented packets, use addresses in addition to 1031 * the frag_id to generate the hash inorder to get 1032 * better distribution. 1033 */ 1034 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) { 1035 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); 1036 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); 1037 1038 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 1039 PKT_HASH_4BYTES(ip_dst)); 1040 policy &= ~MAC_PKT_HASH_L3; 1041 } 1042 1043 if (frag != NULL) { 1044 uint8_t *identp = (uint8_t *)&frag->ip6f_ident; 1045 hash ^= PKT_HASH_4BYTES(identp); 1046 goto done; 1047 } 1048 break; 1049 } 1050 default: 1051 goto done; 1052 } 1053 1054 if (policy == 0) 1055 goto done; 1056 1057 /* if ip header is in its own mblk, skip it */ 1058 if (MBLKL(mp) <= skip_len) { 1059 skip_len -= MBLKL(mp); 1060 mp = mp->b_cont; 1061 if (mp == NULL) 1062 goto done; 1063 } 1064 1065 /* parse ULP header */ 1066 again: 1067 switch (proto) { 1068 case IPPROTO_TCP: 1069 case IPPROTO_UDP: 1070 case IPPROTO_ESP: 1071 case IPPROTO_SCTP: 1072 /* 1073 * These Internet Protocols are intentionally designed 1074 * for hashing from the git-go. Port numbers are in the first 1075 * word for transports, SPI is first for ESP. 1076 */ 1077 if (mp->b_rptr + skip_len + 4 > mp->b_wptr) 1078 goto done; 1079 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); 1080 break; 1081 1082 case IPPROTO_AH: { 1083 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); 1084 uint_t ah_length = AH_TOTAL_LEN(ah); 1085 1086 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) 1087 goto done; 1088 1089 proto = ah->ah_nexthdr; 1090 skip_len += ah_length; 1091 1092 /* if AH header is in its own mblk, skip it */ 1093 if (MBLKL(mp) <= skip_len) { 1094 skip_len -= MBLKL(mp); 1095 mp = mp->b_cont; 1096 if (mp == NULL) 1097 goto done; 1098 } 1099 1100 goto again; 1101 } 1102 } 1103 1104 done: 1105 return (hash); 1106 } 1107