/******************************************************************************

Copyright (c) 2006-2008, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/memrange.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
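/*
 * Firmware image names.  As the comments ahead of mxge_firmware_probe()
 * below explain, the "eth" images assume aligned PCIe completions while
 * the "ethp" images work around unaligned ones; the "rss" variants are
 * presumably for the multi-slice configurations enabled via
 * MXGEFW_CMD_ENABLE_RSS_QUEUES.
 */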
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
	/* Device interface */
	DEVMETHOD(device_probe, mxge_probe),
	DEVMETHOD(device_attach, mxge_attach),
	DEVMETHOD(device_detach, mxge_detach),
	DEVMETHOD(device_shutdown, mxge_shutdown),
	{0, 0}
};

static driver_t mxge_driver =
{
	"mxge",
	mxge_methods,
	sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		device_set_desc(dev, "Myri10G-PCIE-8A");
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err == 0)
		return;
	else
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
	pa = rman_get_start(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		sc->wc = 0;
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
		     int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}
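/*
 * mxge_dma_alloc() below follows the usual busdma recipe: create a tag
 * describing the constraints, allocate DMA-able memory against it, then
 * load the map.  Because the tag is created with nsegments == 1, the
 * callback above only ever records the first (and only) segment's
 * address.
 */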
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
	       bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO), &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* the += 1 here plus the += 3 on the first pass
			   of the loop below skips the 4-byte "MAC=" tag */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
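/*
 * For illustration, a typical eeprom_strings blob (values made up)
 * looks like "SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=10G-PCIE-8A\0\0";
 * the parser above walks it one NUL-terminated string at a time and
 * stops at the empty string.
 */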
#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access.
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off = base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
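	/* A quick sanity check of the arithmetic below: data0 >> 16
	 * transfers of len bytes each complete in (data0 & 0xffff)
	 * half-microsecond ticks, so
	 *
	 *	MB/s = bytes/us = (transfers * len) / (ticks * 0.5)
	 *	     = (transfers * len * 2) / ticks
	 *
	 * which is exactly the expression used for read_dma and
	 * write_dma; the read/write test gets an extra factor of 2
	 * because each transfer moves data in both directions.
	 */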
	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}
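/*
 * In short, mxge_select_firmware() below decides between the images as
 * follows: an explicit mxge_force_firmware tunable wins; otherwise a
 * narrow (x4 or less) PCIe link simply takes the aligned firmware with
 * a reduced-performance warning; otherwise the probe above decides,
 * with any probe failure falling back to the safe unaligned image and
 * a 2KB tx_boundary.
 */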
" 582 "Please install up to date fw\n"); 583 return status; 584 } 585 586 static int 587 mxge_select_firmware(mxge_softc_t *sc) 588 { 589 int aligned = 0; 590 591 592 if (mxge_force_firmware != 0) { 593 if (mxge_force_firmware == 1) 594 aligned = 1; 595 else 596 aligned = 0; 597 if (mxge_verbose) 598 device_printf(sc->dev, 599 "Assuming %s completions (forced)\n", 600 aligned ? "aligned" : "unaligned"); 601 goto abort; 602 } 603 604 /* if the PCIe link width is 4 or less, we can use the aligned 605 firmware and skip any checks */ 606 if (sc->link_width != 0 && sc->link_width <= 4) { 607 device_printf(sc->dev, 608 "PCIe x%d Link, expect reduced performance\n", 609 sc->link_width); 610 aligned = 1; 611 goto abort; 612 } 613 614 if (0 == mxge_firmware_probe(sc)) 615 return 0; 616 617 abort: 618 if (aligned) { 619 sc->fw_name = mxge_fw_aligned; 620 sc->tx_boundary = 4096; 621 } else { 622 sc->fw_name = mxge_fw_unaligned; 623 sc->tx_boundary = 2048; 624 } 625 return (mxge_load_firmware(sc, 0)); 626 } 627 628 union qualhack 629 { 630 const char *ro_char; 631 char *rw_char; 632 }; 633 634 static int 635 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr) 636 { 637 638 639 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) { 640 device_printf(sc->dev, "Bad firmware type: 0x%x\n", 641 be32toh(hdr->mcp_type)); 642 return EIO; 643 } 644 645 /* save firmware version for sysctl */ 646 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version)); 647 if (mxge_verbose) 648 device_printf(sc->dev, "firmware id: %s\n", hdr->version); 649 650 sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major, 651 &sc->fw_ver_minor, &sc->fw_ver_tiny); 652 653 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR 654 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) { 655 device_printf(sc->dev, "Found firmware version %s\n", 656 sc->fw_version); 657 device_printf(sc->dev, "Driver needs %d.%d\n", 658 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR); 659 return EINVAL; 660 } 661 return 0; 662 663 } 664 665 static void * 666 z_alloc(void *nil, u_int items, u_int size) 667 { 668 void *ptr; 669 670 ptr = malloc(items * size, M_TEMP, M_NOWAIT); 671 return ptr; 672 } 673 674 static void 675 z_free(void *nil, void *ptr) 676 { 677 free(ptr, M_TEMP); 678 } 679 680 681 static int 682 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit) 683 { 684 z_stream zs; 685 char *inflate_buffer; 686 const struct firmware *fw; 687 const mcp_gen_header_t *hdr; 688 unsigned hdr_offset; 689 int status; 690 unsigned int i; 691 char dummy; 692 size_t fw_len; 693 694 fw = firmware_get(sc->fw_name); 695 if (fw == NULL) { 696 device_printf(sc->dev, "Could not find firmware image %s\n", 697 sc->fw_name); 698 return ENOENT; 699 } 700 701 702 703 /* setup zlib and decompress f/w */ 704 bzero(&zs, sizeof (zs)); 705 zs.zalloc = z_alloc; 706 zs.zfree = z_free; 707 status = inflateInit(&zs); 708 if (status != Z_OK) { 709 status = EIO; 710 goto abort_with_fw; 711 } 712 713 /* the uncompressed size is stored as the firmware version, 714 which would otherwise go unused */ 715 fw_len = (size_t) fw->version; 716 inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT); 717 if (inflate_buffer == NULL) 718 goto abort_with_zs; 719 zs.avail_in = fw->datasize; 720 zs.next_in = __DECONST(char *, fw->data); 721 zs.avail_out = fw_len; 722 zs.next_out = inflate_buffer; 723 status = inflate(&zs, Z_FINISH); 724 if (status != Z_STREAM_END) { 725 device_printf(sc->dev, "zlib %d\n", status); 726 status = EIO; 727 goto abort_with_buffer; 728 } 729 730 /* check id */ 731 hdr_offset = 
static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		mb();
		/* the read back from SRAM presumably serves to flush
		   any posted writes from the preceding PIO copy */
		dummy = *sc->sram;
		mb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
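/*
 * Both mxge_dummy_rdma() below and the firmware handoff in
 * mxge_load_firmware() use the same boot-command handshake: the host
 * clears a "confirmation" word in DMA-able memory, PIO-writes a small
 * 8-byte-aligned command block into a well-known SRAM mailbox, and
 * then polls until the firmware DMAs 0xffffffff back into the
 * confirmation word to signal that it is alive and has processed the
 * command.
 */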
"enable" : "disable"), confirm, 817 *confirm); 818 } 819 return; 820 } 821 822 static int 823 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data) 824 { 825 mcp_cmd_t *buf; 826 char buf_bytes[sizeof(*buf) + 8]; 827 volatile mcp_cmd_response_t *response = sc->cmd; 828 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD; 829 uint32_t dma_low, dma_high; 830 int err, sleep_total = 0; 831 832 /* ensure buf is aligned to 8 bytes */ 833 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL); 834 835 buf->data0 = htobe32(data->data0); 836 buf->data1 = htobe32(data->data1); 837 buf->data2 = htobe32(data->data2); 838 buf->cmd = htobe32(cmd); 839 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr); 840 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr); 841 842 buf->response_addr.low = htobe32(dma_low); 843 buf->response_addr.high = htobe32(dma_high); 844 mtx_lock(&sc->cmd_mtx); 845 response->result = 0xffffffff; 846 mb(); 847 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf)); 848 849 /* wait up to 20ms */ 850 err = EAGAIN; 851 for (sleep_total = 0; sleep_total < 20; sleep_total++) { 852 bus_dmamap_sync(sc->cmd_dma.dmat, 853 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD); 854 mb(); 855 switch (be32toh(response->result)) { 856 case 0: 857 data->data0 = be32toh(response->data); 858 err = 0; 859 break; 860 case 0xffffffff: 861 DELAY(1000); 862 break; 863 case MXGEFW_CMD_UNKNOWN: 864 err = ENOSYS; 865 break; 866 case MXGEFW_CMD_ERROR_UNALIGNED: 867 err = E2BIG; 868 break; 869 case MXGEFW_CMD_ERROR_BUSY: 870 err = EBUSY; 871 break; 872 default: 873 device_printf(sc->dev, 874 "mxge: command %d " 875 "failed, result = %d\n", 876 cmd, be32toh(response->result)); 877 err = ENXIO; 878 break; 879 } 880 if (err != EAGAIN) 881 break; 882 } 883 if (err == EAGAIN) 884 device_printf(sc->dev, "mxge: command %d timed out" 885 "result = %d\n", 886 cmd, be32toh(response->result)); 887 mtx_unlock(&sc->cmd_mtx); 888 return err; 889 } 890 891 static int 892 mxge_adopt_running_firmware(mxge_softc_t *sc) 893 { 894 struct mcp_gen_header *hdr; 895 const size_t bytes = sizeof (struct mcp_gen_header); 896 size_t hdr_offset; 897 int status; 898 899 /* find running firmware header */ 900 hdr_offset = htobe32(*(volatile uint32_t *) 901 (sc->sram + MCP_HEADER_PTR_OFFSET)); 902 903 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) { 904 device_printf(sc->dev, 905 "Running firmware has bad header offset (%d)\n", 906 (int)hdr_offset); 907 return EIO; 908 } 909 910 /* copy header of running firmware from SRAM to host memory to 911 * validate firmware */ 912 hdr = malloc(bytes, M_DEVBUF, M_NOWAIT); 913 if (hdr == NULL) { 914 device_printf(sc->dev, "could not malloc firmware hdr\n"); 915 return ENOMEM; 916 } 917 bus_space_read_region_1(rman_get_bustag(sc->mem_res), 918 rman_get_bushandle(sc->mem_res), 919 hdr_offset, (char *)hdr, bytes); 920 status = mxge_validate_firmware(sc, hdr); 921 free(hdr, M_DEVBUF); 922 923 /* 924 * check to see if adopted firmware has bug where adopting 925 * it will cause broadcasts to be filtered unless the NIC 926 * is kept in ALLMULTI mode 927 */ 928 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 && 929 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) { 930 sc->adopted_rx_filter_bug = 1; 931 device_printf(sc->dev, "Adopting fw %d.%d.%d: " 932 "working around rx filter bug\n", 933 sc->fw_ver_major, sc->fw_ver_minor, 934 sc->fw_ver_tiny); 935 } 936 937 return status; 938 } 939 940 941 static int 942 mxge_load_firmware(mxge_softc_t *sc, int adopt) 943 { 944 volatile 
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				      "Using firmware currently running on NIC"
				      ".  For optimal\n");
			device_printf(sc->dev,
				      "performance consider loading optimized "
				      "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
	/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			      confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}
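/*
 * For example, a station address of 00:60:dd:12:34:56 (made up) packs
 * up as data0 = 0x0060dd12 and data1 = 0x00003456; mxge_send_cmd()
 * then byte-swaps both words to big-endian before handing them to the
 * firmware.
 */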
static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
			      " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
				      "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
				      "%d\n", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
			      ", error status: %d\n", err);
	}
}
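/*
 * Note the ordering above: the NIC is put into ALLMULTI before the
 * filter list is flushed and rebuilt, and only taken back out of
 * ALLMULTI once every group has been re-joined.  That way there is no
 * window in which a frame for a subscribed group can be filtered out,
 * and every error path deliberately leaves the NIC in the safe
 * (ALLMULTI) state.
 */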
static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
				       &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
				       &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			ss->fw_stats->valid = 0;
			ss->fw_stats->send_done_count = 0;
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}
static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		callout_stop(&sc->co_hdl);
		mxge_close(sc);
		err = mxge_open(sc);
		if (err == 0)
			callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}
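/*
 * mxge_handle_be32() is the glue that lets the big-endian counters in
 * the firmware's DMA'ed stats block be exported as ordinary integer
 * sysctls: it snapshots the counter, byte-swaps it into arg2, and then
 * clears arg1 so that sysctl_handle_int() reports the swapped value
 * and cannot write anything back, making these nodes read-only in
 * practice as well as by CTLFLAG_RD.
 */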
static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
			  "firmware_version",
			  CTLFLAG_RD, &sc->fw_version,
			  0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
			  "serial_number",
			  CTLFLAG_RD, &sc->serial_number_string,
			  0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
			  "product_code",
			  CTLFLAG_RD, &sc->product_code_string,
			  0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "flow control enabled");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			  mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		mb();
	}
}
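/*
 * Copying in reverse means the descriptor in the first slot is never
 * written by this routine at all; mxge_submit_req() below writes it
 * last, with its flags cleared, and only then rewrites the flags word.
 * The firmware therefore cannot see a partially written chain,
 * regardless of where the chain wraps around the ring.
 */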
/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	mb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			mb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		mb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	mb();
}

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp), ss->scratch);
		/* the headers were just copied into the scratch
		   buffer, so point at them there */
		ip = (struct ip *)(ss->scratch + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);
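	/* To make the sign trick concrete: for a plain TCP/IPv4 frame
	 * with no options, ip_off is 14, so cum_len starts at
	 * -(14 + 20 + 20) = -54.  It climbs toward zero as the send
	 * loop walks the header bytes, crosses zero exactly where the
	 * payload begins, and from then on counts payload bytes modulo
	 * mss to find the segmentation cut points.
	 */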
	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request.  For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request.  For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request.  All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			}

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req-rdma_count)->rdma_count = rdma_count;

	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
	return;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	ss->sc->ifp->if_oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}
1897 */ 1898 static struct mbuf * 1899 mxge_vlan_tag_insert(struct mbuf *m) 1900 { 1901 struct ether_vlan_header *evl; 1902 1903 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT); 1904 if (__predict_false(m == NULL)) 1905 return NULL; 1906 if (m->m_len < sizeof(*evl)) { 1907 m = m_pullup(m, sizeof(*evl)); 1908 if (__predict_false(m == NULL)) 1909 return NULL; 1910 } 1911 /* 1912 * Transform the Ethernet header into an Ethernet header 1913 * with 802.1Q encapsulation. 1914 */ 1915 evl = mtod(m, struct ether_vlan_header *); 1916 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, 1917 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); 1918 evl->evl_encap_proto = htons(ETHERTYPE_VLAN); 1919 evl->evl_tag = htons(m->m_pkthdr.ether_vtag); 1920 m->m_flags &= ~M_VLANTAG; 1921 return m; 1922 } 1923 1924 static void 1925 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m) 1926 { 1927 mxge_softc_t *sc; 1928 mcp_kreq_ether_send_t *req; 1929 bus_dma_segment_t *seg; 1930 struct mbuf *m_tmp; 1931 struct ifnet *ifp; 1932 mxge_tx_ring_t *tx; 1933 struct ip *ip; 1934 int cnt, cum_len, err, i, idx, odd_flag, ip_off; 1935 uint16_t pseudo_hdr_offset; 1936 uint8_t flags, cksum_offset; 1937 1938 1939 sc = ss->sc; 1940 ifp = sc->ifp; 1941 tx = &ss->tx; 1942 1943 ip_off = sizeof (struct ether_header); 1944 if (m->m_flags & M_VLANTAG) { 1945 m = mxge_vlan_tag_insert(m); 1946 if (__predict_false(m == NULL)) 1947 goto drop; 1948 ip_off += ETHER_VLAN_ENCAP_LEN; 1949 } 1950 1951 /* (try to) map the frame for DMA */ 1952 idx = tx->req & tx->mask; 1953 err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map, 1954 m, tx->seg_list, &cnt, 1955 BUS_DMA_NOWAIT); 1956 if (__predict_false(err == EFBIG)) { 1957 /* Too many segments in the chain. Try 1958 to defrag */ 1959 m_tmp = m_defrag(m, M_NOWAIT); 1960 if (m_tmp == NULL) { 1961 goto drop; 1962 } 1963 ss->tx.defrag++; 1964 m = m_tmp; 1965 err = bus_dmamap_load_mbuf_sg(tx->dmat, 1966 tx->info[idx].map, 1967 m, tx->seg_list, &cnt, 1968 BUS_DMA_NOWAIT); 1969 } 1970 if (__predict_false(err != 0)) { 1971 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d" 1972 " packet len = %d\n", err, m->m_pkthdr.len); 1973 goto drop; 1974 } 1975 bus_dmamap_sync(tx->dmat, tx->info[idx].map, 1976 BUS_DMASYNC_PREWRITE); 1977 tx->info[idx].m = m; 1978 1979 1980 /* TSO is different enough, we handle it in another routine */ 1981 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) { 1982 mxge_encap_tso(ss, m, cnt, ip_off); 1983 return; 1984 } 1985 1986 req = tx->req_list; 1987 cksum_offset = 0; 1988 pseudo_hdr_offset = 0; 1989 flags = MXGEFW_FLAGS_NO_TSO; 1990 1991 /* checksum offloading? 
*/ 1992 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) { 1993 /* ensure ip header is in first mbuf, copy 1994 it to a scratch buffer if not */ 1995 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) { 1996 m_copydata(m, 0, ip_off + sizeof (*ip), 1997 ss->scratch); 1998 ip = (struct ip *)(ss->scratch + ip_off); 1999 } else { 2000 ip = (struct ip *)(mtod(m, char *) + ip_off); 2001 } 2002 cksum_offset = ip_off + (ip->ip_hl << 2); 2003 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data; 2004 pseudo_hdr_offset = htobe16(pseudo_hdr_offset); 2005 req->cksum_offset = cksum_offset; 2006 flags |= MXGEFW_FLAGS_CKSUM; 2007 odd_flag = MXGEFW_FLAGS_ALIGN_ODD; 2008 } else { 2009 odd_flag = 0; 2010 } 2011 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE) 2012 flags |= MXGEFW_FLAGS_SMALL; 2013 2014 /* convert segments into a request list */ 2015 cum_len = 0; 2016 seg = tx->seg_list; 2017 req->flags = MXGEFW_FLAGS_FIRST; 2018 for (i = 0; i < cnt; i++) { 2019 req->addr_low = 2020 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr)); 2021 req->addr_high = 2022 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr)); 2023 req->length = htobe16(seg->ds_len); 2024 req->cksum_offset = cksum_offset; 2025 if (cksum_offset > seg->ds_len) 2026 cksum_offset -= seg->ds_len; 2027 else 2028 cksum_offset = 0; 2029 req->pseudo_hdr_offset = pseudo_hdr_offset; 2030 req->pad = 0; /* complete solid 16-byte block */ 2031 req->rdma_count = 1; 2032 req->flags |= flags | ((cum_len & 1) * odd_flag); 2033 cum_len += seg->ds_len; 2034 seg++; 2035 req++; 2036 req->flags = 0; 2037 } 2038 req--; 2039 /* pad runts to 60 bytes */ 2040 if (cum_len < 60) { 2041 req++; 2042 req->addr_low = 2043 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr)); 2044 req->addr_high = 2045 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr)); 2046 req->length = htobe16(60 - cum_len); 2047 req->cksum_offset = 0; 2048 req->pseudo_hdr_offset = pseudo_hdr_offset; 2049 req->pad = 0; /* complete solid 16-byte block */ 2050 req->rdma_count = 1; 2051 req->flags |= flags | ((cum_len & 1) * odd_flag); 2052 cnt++; 2053 } 2054 2055 tx->req_list[0].rdma_count = cnt; 2056 #if 0 2057 /* print what the firmware will see */ 2058 for (i = 0; i < cnt; i++) { 2059 printf("%d: addr: 0x%x 0x%x len:%d pso%d," 2060 "cso:%d, flags:0x%x, rdma:%d\n", 2061 i, (int)ntohl(tx->req_list[i].addr_high), 2062 (int)ntohl(tx->req_list[i].addr_low), 2063 (int)ntohs(tx->req_list[i].length), 2064 (int)ntohs(tx->req_list[i].pseudo_hdr_offset), 2065 tx->req_list[i].cksum_offset, tx->req_list[i].flags, 2066 tx->req_list[i].rdma_count); 2067 } 2068 printf("--------------\n"); 2069 #endif 2070 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1; 2071 mxge_submit_req(tx, tx->req_list, cnt); 2072 return; 2073 2074 drop: 2075 m_freem(m); 2076 ifp->if_oerrors++; 2077 return; 2078 } 2079 2080 2081 2082 2083 static inline void 2084 mxge_start_locked(struct mxge_slice_state *ss) 2085 { 2086 mxge_softc_t *sc; 2087 struct mbuf *m; 2088 struct ifnet *ifp; 2089 mxge_tx_ring_t *tx; 2090 2091 sc = ss->sc; 2092 ifp = sc->ifp; 2093 tx = &ss->tx; 2094 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) { 2095 IFQ_DRV_DEQUEUE(&ifp->if_snd, m); 2096 if (m == NULL) { 2097 return; 2098 } 2099 /* let BPF see it */ 2100 BPF_MTAP(ifp, m); 2101 2102 /* give it to the nic */ 2103 mxge_encap(ss, m); 2104 } 2105 /* ran out of transmit slots */ 2106 if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) { 2107 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE; 2108 tx->stall++; 2109 } 2110 } 2111 2112 static void 2113 mxge_start(struct ifnet 
*ifp) 2114 { 2115 mxge_softc_t *sc = ifp->if_softc; 2116 struct mxge_slice_state *ss; 2117 2118 /* only use the first slice for now */ 2119 ss = &sc->ss[0]; 2120 mtx_lock(&ss->tx.mtx); 2121 mxge_start_locked(ss); 2122 mtx_unlock(&ss->tx.mtx); 2123 } 2124 2125 /* 2126 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy 2127 * at most 32 bytes at a time, so as to avoid involving the software 2128 * pio handler in the nic. We re-write the first segment's low 2129 * DMA address to mark it valid only after we write the entire chunk 2130 * in a burst 2131 */ 2132 static inline void 2133 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst, 2134 mcp_kreq_ether_recv_t *src) 2135 { 2136 uint32_t low; 2137 2138 low = src->addr_low; 2139 src->addr_low = 0xffffffff; 2140 mxge_pio_copy(dst, src, 4 * sizeof (*src)); 2141 mb(); 2142 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src)); 2143 mb(); 2144 src->addr_low = low; 2145 dst->addr_low = low; 2146 mb(); 2147 } 2148 2149 static int 2150 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx) 2151 { 2152 bus_dma_segment_t seg; 2153 struct mbuf *m; 2154 mxge_rx_ring_t *rx = &ss->rx_small; 2155 int cnt, err; 2156 2157 m = m_gethdr(M_DONTWAIT, MT_DATA); 2158 if (m == NULL) { 2159 rx->alloc_fail++; 2160 err = ENOBUFS; 2161 goto done; 2162 } 2163 m->m_len = MHLEN; 2164 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m, 2165 &seg, &cnt, BUS_DMA_NOWAIT); 2166 if (err != 0) { 2167 m_free(m); 2168 goto done; 2169 } 2170 rx->info[idx].m = m; 2171 rx->shadow[idx].addr_low = 2172 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr)); 2173 rx->shadow[idx].addr_high = 2174 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr)); 2175 2176 done: 2177 if ((idx & 7) == 7) 2178 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]); 2179 return err; 2180 } 2181 2182 static int 2183 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx) 2184 { 2185 bus_dma_segment_t seg[3]; 2186 struct mbuf *m; 2187 mxge_rx_ring_t *rx = &ss->rx_big; 2188 int cnt, err, i; 2189 2190 if (rx->cl_size == MCLBYTES) 2191 m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); 2192 else 2193 m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size); 2194 if (m == NULL) { 2195 rx->alloc_fail++; 2196 err = ENOBUFS; 2197 goto done; 2198 } 2199 m->m_len = rx->cl_size; 2200 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m, 2201 seg, &cnt, BUS_DMA_NOWAIT); 2202 if (err != 0) { 2203 m_free(m); 2204 goto done; 2205 } 2206 rx->info[idx].m = m; 2207 2208 for (i = 0; i < cnt; i++) { 2209 rx->shadow[idx + i].addr_low = 2210 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr)); 2211 rx->shadow[idx + i].addr_high = 2212 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr)); 2213 } 2214 2215 2216 done: 2217 for (i = 0; i < rx->nbufs; i++) { 2218 if ((idx & 7) == 7) { 2219 mxge_submit_8rx(&rx->lanai[idx - 7], 2220 &rx->shadow[idx - 7]); 2221 } 2222 idx++; 2223 } 2224 return err; 2225 } 2226 2227 /* 2228 * Myri10GE hardware checksums are not valid if the sender 2229 * padded the frame with non-zero padding. This is because 2230 * the firmware just does a simple 16-bit 1s complement 2231 * checksum across the entire frame, excluding the first 14 2232 * bytes. 
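 * (For example, a sender that pads a short TCP segment out to the
 * 60-byte minimum frame length with non-zero filler bytes would get
 * that filler folded into the reported sum, making it useless as a
 * TCP checksum.)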
 * It is best to simply check the checksum and
 * tell the stack about it only if the checksum is good.
 */

static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
	struct ip *ip;
	uint16_t c;

	eh = mtod(m, struct ether_header *);

	/* only deal with IPv4 TCP & UDP for now */
	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
		return 1;
	ip = (struct ip *)(eh + 1);
	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
			    ip->ip_p != IPPROTO_UDP))
		return 1;

	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
			    - (ip->ip_hl << 2) + ip->ip_p));
	c ^= 0xffff;
	return (c);
}

static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	(*csum) += ~partial;
	(*csum) += ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
	m->m_flags |= M_VLANTAG;
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf. The encapsulated Ethernet
	 * type field is already in place.
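	 *
	 * Layout sketch (illustrative): the 12 address bytes are copied
	 * 4 bytes forward onto the TPID/tag pair, and m_adj() then trims
	 * the 4 stale bytes from the front of the mbuf:
	 *
	 *   before: | dst(6) | src(6) | 0x8100(2) | tag(2) | type(2) | ...
	 *   after:  | dst(6) | src(6) | type(2) | ...   (4 bytes shorter)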
2297 */ 2298 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, 2299 ETHER_HDR_LEN - ETHER_TYPE_LEN); 2300 m_adj(m, ETHER_VLAN_ENCAP_LEN); 2301 } 2302 2303 2304 static inline void 2305 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum) 2306 { 2307 mxge_softc_t *sc; 2308 struct ifnet *ifp; 2309 struct mbuf *m; 2310 struct ether_header *eh; 2311 mxge_rx_ring_t *rx; 2312 bus_dmamap_t old_map; 2313 int idx; 2314 uint16_t tcpudp_csum; 2315 2316 sc = ss->sc; 2317 ifp = sc->ifp; 2318 rx = &ss->rx_big; 2319 idx = rx->cnt & rx->mask; 2320 rx->cnt += rx->nbufs; 2321 /* save a pointer to the received mbuf */ 2322 m = rx->info[idx].m; 2323 /* try to replace the received mbuf */ 2324 if (mxge_get_buf_big(ss, rx->extra_map, idx)) { 2325 /* drop the frame -- the old mbuf is re-cycled */ 2326 ifp->if_ierrors++; 2327 return; 2328 } 2329 2330 /* unmap the received buffer */ 2331 old_map = rx->info[idx].map; 2332 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD); 2333 bus_dmamap_unload(rx->dmat, old_map); 2334 2335 /* swap the bus_dmamap_t's */ 2336 rx->info[idx].map = rx->extra_map; 2337 rx->extra_map = old_map; 2338 2339 /* mcp implicitly skips 1st 2 bytes so that packet is properly 2340 * aligned */ 2341 m->m_data += MXGEFW_PAD; 2342 2343 m->m_pkthdr.rcvif = ifp; 2344 m->m_len = m->m_pkthdr.len = len; 2345 ss->ipackets++; 2346 eh = mtod(m, struct ether_header *); 2347 if (eh->ether_type == htons(ETHERTYPE_VLAN)) { 2348 mxge_vlan_tag_remove(m, &csum); 2349 } 2350 /* if the checksum is valid, mark it in the mbuf header */ 2351 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) { 2352 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum))) 2353 return; 2354 /* otherwise, it was a UDP frame, or a TCP frame which 2355 we could not do LRO on. 
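		   (csum_data = 0xffff together with CSUM_PSEUDO_HDR is the
		   mbuf convention for "fully verified, pseudo-header
		   included", so the stack skips its own software checksum
		   pass.)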
Tell the stack that the 2356 checksum is good */ 2357 m->m_pkthdr.csum_data = 0xffff; 2358 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID; 2359 } 2360 /* pass the frame up the stack */ 2361 (*ifp->if_input)(ifp, m); 2362 } 2363 2364 static inline void 2365 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum) 2366 { 2367 mxge_softc_t *sc; 2368 struct ifnet *ifp; 2369 struct ether_header *eh; 2370 struct mbuf *m; 2371 mxge_rx_ring_t *rx; 2372 bus_dmamap_t old_map; 2373 int idx; 2374 uint16_t tcpudp_csum; 2375 2376 sc = ss->sc; 2377 ifp = sc->ifp; 2378 rx = &ss->rx_small; 2379 idx = rx->cnt & rx->mask; 2380 rx->cnt++; 2381 /* save a pointer to the received mbuf */ 2382 m = rx->info[idx].m; 2383 /* try to replace the received mbuf */ 2384 if (mxge_get_buf_small(ss, rx->extra_map, idx)) { 2385 /* drop the frame -- the old mbuf is re-cycled */ 2386 ifp->if_ierrors++; 2387 return; 2388 } 2389 2390 /* unmap the received buffer */ 2391 old_map = rx->info[idx].map; 2392 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD); 2393 bus_dmamap_unload(rx->dmat, old_map); 2394 2395 /* swap the bus_dmamap_t's */ 2396 rx->info[idx].map = rx->extra_map; 2397 rx->extra_map = old_map; 2398 2399 /* mcp implicitly skips 1st 2 bytes so that packet is properly 2400 * aligned */ 2401 m->m_data += MXGEFW_PAD; 2402 2403 m->m_pkthdr.rcvif = ifp; 2404 m->m_len = m->m_pkthdr.len = len; 2405 ss->ipackets++; 2406 eh = mtod(m, struct ether_header *); 2407 if (eh->ether_type == htons(ETHERTYPE_VLAN)) { 2408 mxge_vlan_tag_remove(m, &csum); 2409 } 2410 /* if the checksum is valid, mark it in the mbuf header */ 2411 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) { 2412 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum))) 2413 return; 2414 /* otherwise, it was a UDP frame, or a TCP frame which 2415 we could not do LRO on. 
Tell the stack that the 2416 checksum is good */ 2417 m->m_pkthdr.csum_data = 0xffff; 2418 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID; 2419 } 2420 /* pass the frame up the stack */ 2421 (*ifp->if_input)(ifp, m); 2422 } 2423 2424 static inline void 2425 mxge_clean_rx_done(struct mxge_slice_state *ss) 2426 { 2427 mxge_rx_done_t *rx_done = &ss->rx_done; 2428 struct lro_entry *lro; 2429 int limit = 0; 2430 uint16_t length; 2431 uint16_t checksum; 2432 2433 2434 while (rx_done->entry[rx_done->idx].length != 0) { 2435 length = ntohs(rx_done->entry[rx_done->idx].length); 2436 rx_done->entry[rx_done->idx].length = 0; 2437 checksum = rx_done->entry[rx_done->idx].checksum; 2438 if (length <= (MHLEN - MXGEFW_PAD)) 2439 mxge_rx_done_small(ss, length, checksum); 2440 else 2441 mxge_rx_done_big(ss, length, checksum); 2442 rx_done->cnt++; 2443 rx_done->idx = rx_done->cnt & rx_done->mask; 2444 2445 /* limit potential for livelock */ 2446 if (__predict_false(++limit > rx_done->mask / 2)) 2447 break; 2448 } 2449 while (!SLIST_EMPTY(&ss->lro_active)) { 2450 lro = SLIST_FIRST(&ss->lro_active); 2451 SLIST_REMOVE_HEAD(&ss->lro_active, next); 2452 mxge_lro_flush(ss, lro); 2453 } 2454 } 2455 2456 2457 static inline void 2458 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx) 2459 { 2460 struct ifnet *ifp; 2461 mxge_tx_ring_t *tx; 2462 struct mbuf *m; 2463 bus_dmamap_t map; 2464 int idx; 2465 2466 tx = &ss->tx; 2467 ifp = ss->sc->ifp; 2468 while (tx->pkt_done != mcp_idx) { 2469 idx = tx->done & tx->mask; 2470 tx->done++; 2471 m = tx->info[idx].m; 2472 /* mbuf and DMA map only attached to the first 2473 segment per-mbuf */ 2474 if (m != NULL) { 2475 ifp->if_opackets++; 2476 tx->info[idx].m = NULL; 2477 map = tx->info[idx].map; 2478 bus_dmamap_unload(tx->dmat, map); 2479 m_freem(m); 2480 } 2481 if (tx->info[idx].flag) { 2482 tx->info[idx].flag = 0; 2483 tx->pkt_done++; 2484 } 2485 } 2486 2487 /* If we have space, clear IFF_OACTIVE to tell the stack that 2488 its OK to send packets */ 2489 2490 if (ifp->if_drv_flags & IFF_DRV_OACTIVE && 2491 tx->req - tx->done < (tx->mask + 1)/4) { 2492 mtx_lock(&ss->tx.mtx); 2493 ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; 2494 ss->tx.wake++; 2495 mxge_start_locked(ss); 2496 mtx_unlock(&ss->tx.mtx); 2497 } 2498 } 2499 2500 static struct mxge_media_type mxge_media_types[] = 2501 { 2502 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"}, 2503 {IFM_10G_SR, (1 << 7), "10GBASE-SR"}, 2504 {IFM_10G_LR, (1 << 6), "10GBASE-LR"}, 2505 {0, (1 << 5), "10GBASE-ER"}, 2506 {0, (1 << 4), "10GBASE-LRM"}, 2507 {0, (1 << 3), "10GBASE-SW"}, 2508 {0, (1 << 2), "10GBASE-LW"}, 2509 {0, (1 << 1), "10GBASE-EW"}, 2510 {0, (1 << 0), "Reserved"} 2511 }; 2512 2513 static void 2514 mxge_set_media(mxge_softc_t *sc, int type) 2515 { 2516 sc->media_flags |= type; 2517 ifmedia_add(&sc->media, sc->media_flags, 0, NULL); 2518 ifmedia_set(&sc->media, sc->media_flags); 2519 } 2520 2521 2522 /* 2523 * Determine the media type for a NIC. Some XFPs will identify 2524 * themselves only when their link is up, so this is initiated via a 2525 * link up interrupt. However, this can potentially take up to 2526 * several milliseconds, so it is run via the watchdog routine, rather 2527 * than in the interrupt handler itself. This need only be done 2528 * once, not each time the link is up. 
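 *
 * As an illustrative example of the parse below: a product code such as
 * "10G-PCIE-8A-C" (a hypothetical string, but real codes follow this
 * shape) is scanned past three dashes, leaving the cursor on 'C', which
 * selects CX4; 'R' means an XFP cage and 'Q' means Quad Ribbon Fiber.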
 */
static void
mxge_media_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *ptr;
	int i, err, ms;

	sc->need_media_probe = 0;

	/* if we've already set a media type, we're done */
	if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
		return;

	/*
	 * parse the product code to determine the interface type
	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
	 * after the 3rd dash in the driver's cached copy of the
	 * EEPROM's product code string.
	 */
	ptr = sc->product_code_string;
	if (ptr == NULL) {
		device_printf(sc->dev, "Missing product code\n");
		return;		/* cannot parse the media type without it */
	}

	for (i = 0; i < 3; i++, ptr++) {
		ptr = strchr(ptr, '-');
		if (ptr == NULL) {
			device_printf(sc->dev,
				      "only %d dashes in PC?!?\n", i);
			return;
		}
	}
	if (*ptr == 'C') {
		mxge_set_media(sc, IFM_10G_CX4);
		return;
	}
	else if (*ptr == 'Q') {
		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
		/* FreeBSD has no media type for Quad ribbon fiber */
		return;
	}

	if (*ptr != 'R') {
		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
		return;
	}

	/*
	 * At this point we know the NIC has an XFP cage, so now we
	 * try to determine what is in the cage by using the
	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
	 * register. We read just one byte, which may take over
	 * a millisecond
	 */

	cmd.data0 = 0;	/* just fetch 1 byte, not all 256 */
	cmd.data1 = MXGE_XFP_COMPLIANCE_BYTE;	/* the byte we want */
	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_I2C_READ, &cmd);
	if (err == MXGEFW_CMD_ERROR_XFP_FAILURE) {
		device_printf(sc->dev, "failed to read XFP\n");
	}
	if (err == MXGEFW_CMD_ERROR_XFP_ABSENT) {
		device_printf(sc->dev, "Type R with no XFP!?!?\n");
	}
	if (err != MXGEFW_CMD_OK) {
		return;
	}

	/* now we wait for the data to be cached */
	cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
		DELAY(1000);
		cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
		err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
	}
	if (err != MXGEFW_CMD_OK) {
		device_printf(sc->dev, "failed to read XFP (%d, %dms)\n",
			      err, ms);
		return;
	}

	if (cmd.data0 == mxge_media_types[0].bitmask) {
		if (mxge_verbose)
			device_printf(sc->dev, "XFP:%s\n",
				      mxge_media_types[0].name);
		mxge_set_media(sc, IFM_10G_CX4);
		return;
	}
	for (i = 1;
	     i < sizeof (mxge_media_types) / sizeof (mxge_media_types[0]);
	     i++) {
		if (cmd.data0 & mxge_media_types[i].bitmask) {
			if (mxge_verbose)
				device_printf(sc->dev, "XFP:%s\n",
					      mxge_media_types[i].name);

			mxge_set_media(sc, mxge_media_types[i].flag);
			return;
		}
	}
	device_printf(sc->dev, "XFP media 0x%x unknown\n", cmd.data0);

	return;
}

static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;

	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		mxge_clean_rx_done(ss);
		*ss->irq_claim =
be32toh(3); 2653 return; 2654 } 2655 2656 /* make sure the DMA has finished */ 2657 if (!stats->valid) { 2658 return; 2659 } 2660 valid = stats->valid; 2661 2662 if (!sc->msi_enabled) { 2663 /* lower legacy IRQ */ 2664 *sc->irq_deassert = 0; 2665 if (!mxge_deassert_wait) 2666 /* don't wait for conf. that irq is low */ 2667 stats->valid = 0; 2668 } else { 2669 stats->valid = 0; 2670 } 2671 2672 /* loop while waiting for legacy irq deassertion */ 2673 do { 2674 /* check for transmit completes and receives */ 2675 send_done_count = be32toh(stats->send_done_count); 2676 while ((send_done_count != tx->pkt_done) || 2677 (rx_done->entry[rx_done->idx].length != 0)) { 2678 mxge_tx_done(ss, (int)send_done_count); 2679 mxge_clean_rx_done(ss); 2680 send_done_count = be32toh(stats->send_done_count); 2681 } 2682 } while (*((volatile uint8_t *) &stats->valid)); 2683 2684 if (__predict_false(stats->stats_updated)) { 2685 if (sc->link_state != stats->link_up) { 2686 sc->link_state = stats->link_up; 2687 if (sc->link_state) { 2688 if_link_state_change(sc->ifp, LINK_STATE_UP); 2689 if (mxge_verbose) 2690 device_printf(sc->dev, "link up\n"); 2691 } else { 2692 if_link_state_change(sc->ifp, LINK_STATE_DOWN); 2693 if (mxge_verbose) 2694 device_printf(sc->dev, "link down\n"); 2695 } 2696 sc->need_media_probe = 1; 2697 } 2698 if (sc->rdma_tags_available != 2699 be32toh(stats->rdma_tags_available)) { 2700 sc->rdma_tags_available = 2701 be32toh(stats->rdma_tags_available); 2702 device_printf(sc->dev, "RDMA timed out! %d tags " 2703 "left\n", sc->rdma_tags_available); 2704 } 2705 2706 if (stats->link_down) { 2707 sc->down_cnt += stats->link_down; 2708 sc->link_state = 0; 2709 if_link_state_change(sc->ifp, LINK_STATE_DOWN); 2710 } 2711 } 2712 2713 /* check to see if we have rx token to pass back */ 2714 if (valid & 0x1) 2715 *ss->irq_claim = be32toh(3); 2716 *(ss->irq_claim + 1) = be32toh(3); 2717 } 2718 2719 static void 2720 mxge_init(void *arg) 2721 { 2722 } 2723 2724 2725 2726 static void 2727 mxge_free_slice_mbufs(struct mxge_slice_state *ss) 2728 { 2729 struct lro_entry *lro_entry; 2730 int i; 2731 2732 while (!SLIST_EMPTY(&ss->lro_free)) { 2733 lro_entry = SLIST_FIRST(&ss->lro_free); 2734 SLIST_REMOVE_HEAD(&ss->lro_free, next); 2735 free(lro_entry, M_DEVBUF); 2736 } 2737 2738 for (i = 0; i <= ss->rx_big.mask; i++) { 2739 if (ss->rx_big.info[i].m == NULL) 2740 continue; 2741 bus_dmamap_unload(ss->rx_big.dmat, 2742 ss->rx_big.info[i].map); 2743 m_freem(ss->rx_big.info[i].m); 2744 ss->rx_big.info[i].m = NULL; 2745 } 2746 2747 for (i = 0; i <= ss->rx_small.mask; i++) { 2748 if (ss->rx_small.info[i].m == NULL) 2749 continue; 2750 bus_dmamap_unload(ss->rx_small.dmat, 2751 ss->rx_small.info[i].map); 2752 m_freem(ss->rx_small.info[i].m); 2753 ss->rx_small.info[i].m = NULL; 2754 } 2755 2756 /* transmit ring used only on the first slice */ 2757 if (ss->tx.info == NULL) 2758 return; 2759 2760 for (i = 0; i <= ss->tx.mask; i++) { 2761 ss->tx.info[i].flag = 0; 2762 if (ss->tx.info[i].m == NULL) 2763 continue; 2764 bus_dmamap_unload(ss->tx.dmat, 2765 ss->tx.info[i].map); 2766 m_freem(ss->tx.info[i].m); 2767 ss->tx.info[i].m = NULL; 2768 } 2769 } 2770 2771 static void 2772 mxge_free_mbufs(mxge_softc_t *sc) 2773 { 2774 int slice; 2775 2776 for (slice = 0; slice < sc->num_slices; slice++) 2777 mxge_free_slice_mbufs(&sc->ss[slice]); 2778 } 2779 2780 static void 2781 mxge_free_slice_rings(struct mxge_slice_state *ss) 2782 { 2783 int i; 2784 2785 2786 if (ss->rx_done.entry != NULL) 2787 mxge_dma_free(&ss->rx_done.dma); 2788 
ss->rx_done.entry = NULL; 2789 2790 if (ss->tx.req_bytes != NULL) 2791 free(ss->tx.req_bytes, M_DEVBUF); 2792 ss->tx.req_bytes = NULL; 2793 2794 if (ss->tx.seg_list != NULL) 2795 free(ss->tx.seg_list, M_DEVBUF); 2796 ss->tx.seg_list = NULL; 2797 2798 if (ss->rx_small.shadow != NULL) 2799 free(ss->rx_small.shadow, M_DEVBUF); 2800 ss->rx_small.shadow = NULL; 2801 2802 if (ss->rx_big.shadow != NULL) 2803 free(ss->rx_big.shadow, M_DEVBUF); 2804 ss->rx_big.shadow = NULL; 2805 2806 if (ss->tx.info != NULL) { 2807 if (ss->tx.dmat != NULL) { 2808 for (i = 0; i <= ss->tx.mask; i++) { 2809 bus_dmamap_destroy(ss->tx.dmat, 2810 ss->tx.info[i].map); 2811 } 2812 bus_dma_tag_destroy(ss->tx.dmat); 2813 } 2814 free(ss->tx.info, M_DEVBUF); 2815 } 2816 ss->tx.info = NULL; 2817 2818 if (ss->rx_small.info != NULL) { 2819 if (ss->rx_small.dmat != NULL) { 2820 for (i = 0; i <= ss->rx_small.mask; i++) { 2821 bus_dmamap_destroy(ss->rx_small.dmat, 2822 ss->rx_small.info[i].map); 2823 } 2824 bus_dmamap_destroy(ss->rx_small.dmat, 2825 ss->rx_small.extra_map); 2826 bus_dma_tag_destroy(ss->rx_small.dmat); 2827 } 2828 free(ss->rx_small.info, M_DEVBUF); 2829 } 2830 ss->rx_small.info = NULL; 2831 2832 if (ss->rx_big.info != NULL) { 2833 if (ss->rx_big.dmat != NULL) { 2834 for (i = 0; i <= ss->rx_big.mask; i++) { 2835 bus_dmamap_destroy(ss->rx_big.dmat, 2836 ss->rx_big.info[i].map); 2837 } 2838 bus_dmamap_destroy(ss->rx_big.dmat, 2839 ss->rx_big.extra_map); 2840 bus_dma_tag_destroy(ss->rx_big.dmat); 2841 } 2842 free(ss->rx_big.info, M_DEVBUF); 2843 } 2844 ss->rx_big.info = NULL; 2845 } 2846 2847 static void 2848 mxge_free_rings(mxge_softc_t *sc) 2849 { 2850 int slice; 2851 2852 for (slice = 0; slice < sc->num_slices; slice++) 2853 mxge_free_slice_rings(&sc->ss[slice]); 2854 } 2855 2856 static int 2857 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries, 2858 int tx_ring_entries) 2859 { 2860 mxge_softc_t *sc = ss->sc; 2861 size_t bytes; 2862 int err, i; 2863 2864 err = ENOMEM; 2865 2866 /* allocate per-slice receive resources */ 2867 2868 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1; 2869 ss->rx_done.mask = (2 * rx_ring_entries) - 1; 2870 2871 /* allocate the rx shadow rings */ 2872 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow); 2873 ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); 2874 if (ss->rx_small.shadow == NULL) 2875 return err;; 2876 2877 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow); 2878 ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); 2879 if (ss->rx_big.shadow == NULL) 2880 return err;; 2881 2882 /* allocate the rx host info rings */ 2883 bytes = rx_ring_entries * sizeof (*ss->rx_small.info); 2884 ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); 2885 if (ss->rx_small.info == NULL) 2886 return err;; 2887 2888 bytes = rx_ring_entries * sizeof (*ss->rx_big.info); 2889 ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); 2890 if (ss->rx_big.info == NULL) 2891 return err;; 2892 2893 /* allocate the rx busdma resources */ 2894 err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 2895 1, /* alignment */ 2896 4096, /* boundary */ 2897 BUS_SPACE_MAXADDR, /* low */ 2898 BUS_SPACE_MAXADDR, /* high */ 2899 NULL, NULL, /* filter */ 2900 MHLEN, /* maxsize */ 2901 1, /* num segs */ 2902 MHLEN, /* maxsegsize */ 2903 BUS_DMA_ALLOCNOW, /* flags */ 2904 NULL, NULL, /* lock */ 2905 &ss->rx_small.dmat); /* tag */ 2906 if (err != 0) { 2907 device_printf(sc->dev, "Err %d allocating rx_small dmat\n", 2908 err); 2909 return err;; 2910 } 
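	/*
	 * Note on the big-receive tag created below: it permits up to
	 * three 4096-byte-aligned segments (12KB total), which is enough
	 * to map one MJUM9BYTES (9KB) cluster when jumbo frames force
	 * mxge_choose_params() to that cluster size; 2KB and 4KB
	 * clusters fit in a single segment.
	 */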
2911 2912 err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 2913 1, /* alignment */ 2914 4096, /* boundary */ 2915 BUS_SPACE_MAXADDR, /* low */ 2916 BUS_SPACE_MAXADDR, /* high */ 2917 NULL, NULL, /* filter */ 2918 3*4096, /* maxsize */ 2919 3, /* num segs */ 2920 4096, /* maxsegsize */ 2921 BUS_DMA_ALLOCNOW, /* flags */ 2922 NULL, NULL, /* lock */ 2923 &ss->rx_big.dmat); /* tag */ 2924 if (err != 0) { 2925 device_printf(sc->dev, "Err %d allocating rx_big dmat\n", 2926 err); 2927 return err;; 2928 } 2929 for (i = 0; i <= ss->rx_small.mask; i++) { 2930 err = bus_dmamap_create(ss->rx_small.dmat, 0, 2931 &ss->rx_small.info[i].map); 2932 if (err != 0) { 2933 device_printf(sc->dev, "Err %d rx_small dmamap\n", 2934 err); 2935 return err;; 2936 } 2937 } 2938 err = bus_dmamap_create(ss->rx_small.dmat, 0, 2939 &ss->rx_small.extra_map); 2940 if (err != 0) { 2941 device_printf(sc->dev, "Err %d extra rx_small dmamap\n", 2942 err); 2943 return err;; 2944 } 2945 2946 for (i = 0; i <= ss->rx_big.mask; i++) { 2947 err = bus_dmamap_create(ss->rx_big.dmat, 0, 2948 &ss->rx_big.info[i].map); 2949 if (err != 0) { 2950 device_printf(sc->dev, "Err %d rx_big dmamap\n", 2951 err); 2952 return err;; 2953 } 2954 } 2955 err = bus_dmamap_create(ss->rx_big.dmat, 0, 2956 &ss->rx_big.extra_map); 2957 if (err != 0) { 2958 device_printf(sc->dev, "Err %d extra rx_big dmamap\n", 2959 err); 2960 return err;; 2961 } 2962 2963 /* now allocate TX resouces */ 2964 2965 /* only use a single TX ring for now */ 2966 if (ss != ss->sc->ss) 2967 return 0; 2968 2969 ss->tx.mask = tx_ring_entries - 1; 2970 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4); 2971 2972 2973 /* allocate the tx request copy block */ 2974 bytes = 8 + 2975 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4); 2976 ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK); 2977 if (ss->tx.req_bytes == NULL) 2978 return err;; 2979 /* ensure req_list entries are aligned to 8 bytes */ 2980 ss->tx.req_list = (mcp_kreq_ether_send_t *) 2981 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL); 2982 2983 /* allocate the tx busdma segment list */ 2984 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc; 2985 ss->tx.seg_list = (bus_dma_segment_t *) 2986 malloc(bytes, M_DEVBUF, M_WAITOK); 2987 if (ss->tx.seg_list == NULL) 2988 return err;; 2989 2990 /* allocate the tx host info ring */ 2991 bytes = tx_ring_entries * sizeof (*ss->tx.info); 2992 ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); 2993 if (ss->tx.info == NULL) 2994 return err;; 2995 2996 /* allocate the tx busdma resources */ 2997 err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 2998 1, /* alignment */ 2999 sc->tx_boundary, /* boundary */ 3000 BUS_SPACE_MAXADDR, /* low */ 3001 BUS_SPACE_MAXADDR, /* high */ 3002 NULL, NULL, /* filter */ 3003 65536 + 256, /* maxsize */ 3004 ss->tx.max_desc - 2, /* num segs */ 3005 sc->tx_boundary, /* maxsegsz */ 3006 BUS_DMA_ALLOCNOW, /* flags */ 3007 NULL, NULL, /* lock */ 3008 &ss->tx.dmat); /* tag */ 3009 3010 if (err != 0) { 3011 device_printf(sc->dev, "Err %d allocating tx dmat\n", 3012 err); 3013 return err;; 3014 } 3015 3016 /* now use these tags to setup dmamaps for each slot 3017 in the ring */ 3018 for (i = 0; i <= ss->tx.mask; i++) { 3019 err = bus_dmamap_create(ss->tx.dmat, 0, 3020 &ss->tx.info[i].map); 3021 if (err != 0) { 3022 device_printf(sc->dev, "Err %d tx dmamap\n", 3023 err); 3024 return err;; 3025 } 3026 } 3027 return 0; 3028 3029 } 3030 3031 static int 3032 mxge_alloc_rings(mxge_softc_t *sc) 3033 { 3034 mxge_cmd_t cmd; 3035 int tx_ring_size; 
3036 int tx_ring_entries, rx_ring_entries; 3037 int err, slice; 3038 3039 /* get ring sizes */ 3040 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd); 3041 tx_ring_size = cmd.data0; 3042 if (err != 0) { 3043 device_printf(sc->dev, "Cannot determine tx ring sizes\n"); 3044 goto abort; 3045 } 3046 3047 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t); 3048 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t); 3049 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1); 3050 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen; 3051 IFQ_SET_READY(&sc->ifp->if_snd); 3052 3053 for (slice = 0; slice < sc->num_slices; slice++) { 3054 err = mxge_alloc_slice_rings(&sc->ss[slice], 3055 rx_ring_entries, 3056 tx_ring_entries); 3057 if (err != 0) 3058 goto abort; 3059 } 3060 return 0; 3061 3062 abort: 3063 mxge_free_rings(sc); 3064 return err; 3065 3066 } 3067 3068 3069 static void 3070 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs) 3071 { 3072 int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD; 3073 3074 if (bufsize < MCLBYTES) { 3075 /* easy, everything fits in a single buffer */ 3076 *big_buf_size = MCLBYTES; 3077 *cl_size = MCLBYTES; 3078 *nbufs = 1; 3079 return; 3080 } 3081 3082 if (bufsize < MJUMPAGESIZE) { 3083 /* still easy, everything still fits in a single buffer */ 3084 *big_buf_size = MJUMPAGESIZE; 3085 *cl_size = MJUMPAGESIZE; 3086 *nbufs = 1; 3087 return; 3088 } 3089 /* now we need to use virtually contiguous buffers */ 3090 *cl_size = MJUM9BYTES; 3091 *big_buf_size = 4096; 3092 *nbufs = mtu / 4096 + 1; 3093 /* needs to be a power of two, so round up */ 3094 if (*nbufs == 3) 3095 *nbufs = 4; 3096 } 3097 3098 static int 3099 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size) 3100 { 3101 mxge_softc_t *sc; 3102 mxge_cmd_t cmd; 3103 bus_dmamap_t map; 3104 struct lro_entry *lro_entry; 3105 int err, i, slice; 3106 3107 3108 sc = ss->sc; 3109 slice = ss - sc->ss; 3110 3111 SLIST_INIT(&ss->lro_free); 3112 SLIST_INIT(&ss->lro_active); 3113 3114 for (i = 0; i < sc->lro_cnt; i++) { 3115 lro_entry = (struct lro_entry *) 3116 malloc(sizeof (*lro_entry), M_DEVBUF, 3117 M_NOWAIT | M_ZERO); 3118 if (lro_entry == NULL) { 3119 sc->lro_cnt = i; 3120 break; 3121 } 3122 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next); 3123 } 3124 /* get the lanai pointers to the send and receive rings */ 3125 3126 err = 0; 3127 /* We currently only send from the first slice */ 3128 if (slice == 0) { 3129 cmd.data0 = slice; 3130 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd); 3131 ss->tx.lanai = 3132 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0); 3133 } 3134 cmd.data0 = slice; 3135 err |= mxge_send_cmd(sc, 3136 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd); 3137 ss->rx_small.lanai = 3138 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0); 3139 cmd.data0 = slice; 3140 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd); 3141 ss->rx_big.lanai = 3142 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0); 3143 3144 if (err != 0) { 3145 device_printf(sc->dev, 3146 "failed to get ring sizes or locations\n"); 3147 return EIO; 3148 } 3149 3150 /* stock receive rings */ 3151 for (i = 0; i <= ss->rx_small.mask; i++) { 3152 map = ss->rx_small.info[i].map; 3153 err = mxge_get_buf_small(ss, map, i); 3154 if (err) { 3155 device_printf(sc->dev, "alloced %d/%d smalls\n", 3156 i, ss->rx_small.mask + 1); 3157 return ENOMEM; 3158 } 3159 } 3160 for (i = 0; i <= ss->rx_big.mask; i++) { 3161 
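		/*
		 * All-ones is the "not yet valid" marker: mxge_submit_8rx()
		 * likewise parks addr_low at 0xffffffff until a whole
		 * 8-entry chunk has been copied, so the NIC never uses an
		 * entry before a real buffer address is in place.
		 */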
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}

static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, big_bytes, nbufs, slice, cl_size, i;
	bus_addr_t bus;
	volatile uint8_t *itable;

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	if (sc->num_slices > 1) {
		/* setup the indirection table */
		cmd.data0 = sc->num_slices;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
				    &cmd);

		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
				     &cmd);
		if (err != 0) {
			device_printf(sc->dev,
				      "failed to setup rss tables\n");
			return err;
		}

		/* just enable an identity mapping */
		itable = sc->sram + cmd.data0;
		for (i = 0; i < sc->num_slices; i++)
			itable[i] = (uint8_t)i;

		cmd.data0 = 1;
		cmd.data1 = mxge_rss_hash_type;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "failed to enable slices\n");
			return err;
		}
	}

	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);

	cmd.data0 = nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set always-use-n to %d\n",
			      nbufs);
		return EIO;
	}
	/* Give the firmware the mtu and the big and small buffer
	   sizes. The firmware wants the big buf size to be a power
	   of two.
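	   (MCLBYTES is 2048 and MJUMPAGESIZE is the 4096-byte page size
	   on the usual platforms, both powers of two; with 9KB jumbo
	   clusters the firmware is instead told, via the
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS command above, to gather
	   nbufs 4096-byte pieces per frame.)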
	   Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	if (err != 0) {
		bus = sc->ss->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
		if (err != 0) {
			device_printf(sc->dev, "couldn't open slice %d\n",
				      slice);
			goto abort;
		}
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;

abort:
	mxge_free_mbufs(sc);

	return err;
}

static int
mxge_close(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;

	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	old_down_cnt = sc->down_cnt;
	mb();
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring down link\n");
	}
	if (old_down_cnt == sc->down_cnt) {
		/* wait for down irq */
		DELAY(10 * sc->intr_coal_delay);
	}
	mb();
	if (old_down_cnt == sc->down_cnt) {
		device_printf(sc->dev, "never got down irq\n");
	}

	mxge_free_mbufs(sc);

	return 0;
}

static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}

static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) { 3364 device_printf(sc->dev, 3365 "could not find vendor specific offset\n"); 3366 return (uint32_t)-1; 3367 } 3368 /* enable read32 mode */ 3369 pci_write_config(dev, vs + 0x10, 0x3, 1); 3370 /* tell NIC which register to read */ 3371 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4); 3372 return (pci_read_config(dev, vs + 0x14, 4)); 3373 } 3374 3375 static void 3376 mxge_watchdog_reset(mxge_softc_t *sc) 3377 { 3378 int err; 3379 uint32_t reboot; 3380 uint16_t cmd; 3381 3382 err = ENXIO; 3383 3384 device_printf(sc->dev, "Watchdog reset!\n"); 3385 3386 /* 3387 * check to see if the NIC rebooted. If it did, then all of 3388 * PCI config space has been reset, and things like the 3389 * busmaster bit will be zero. If this is the case, then we 3390 * must restore PCI config space before the NIC can be used 3391 * again 3392 */ 3393 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2); 3394 if (cmd == 0xffff) { 3395 /* 3396 * maybe the watchdog caught the NIC rebooting; wait 3397 * up to 100ms for it to finish. If it does not come 3398 * back, then give up 3399 */ 3400 DELAY(1000*100); 3401 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2); 3402 if (cmd == 0xffff) { 3403 device_printf(sc->dev, "NIC disappeared!\n"); 3404 goto abort; 3405 } 3406 } 3407 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) { 3408 /* print the reboot status */ 3409 reboot = mxge_read_reboot(sc); 3410 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n", 3411 reboot); 3412 /* restore PCI configuration space */ 3413 3414 /* XXXX waiting for pci_cfg_restore() to be exported */ 3415 goto abort; /* just abort for now */ 3416 3417 /* and redo any changes we made to our config space */ 3418 mxge_setup_cfg_space(sc); 3419 } else { 3420 device_printf(sc->dev, "NIC did not reboot, ring state:\n"); 3421 device_printf(sc->dev, "tx.req=%d tx.done=%d\n", 3422 sc->ss->tx.req, sc->ss->tx.done); 3423 device_printf(sc->dev, "pkt_done=%d fw=%d\n", 3424 sc->ss->tx.pkt_done, 3425 be32toh(sc->ss->fw_stats->send_done_count)); 3426 } 3427 3428 if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) { 3429 mxge_close(sc); 3430 err = mxge_open(sc); 3431 } 3432 3433 abort: 3434 /* 3435 * stop the watchdog if the nic is dead, to avoid spamming the 3436 * console 3437 */ 3438 if (err != 0) { 3439 callout_stop(&sc->co_hdl); 3440 } 3441 } 3442 3443 static void 3444 mxge_watchdog(mxge_softc_t *sc) 3445 { 3446 mxge_tx_ring_t *tx = &sc->ss->tx; 3447 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause); 3448 3449 /* see if we have outstanding transmits, which 3450 have been pending for more than mxge_ticks */ 3451 if (tx->req != tx->done && 3452 tx->watchdog_req != tx->watchdog_done && 3453 tx->done == tx->watchdog_done) { 3454 /* check for pause blocking before resetting */ 3455 if (tx->watchdog_rx_pause == rx_pause) 3456 mxge_watchdog_reset(sc); 3457 else 3458 device_printf(sc->dev, "Flow control blocking " 3459 "xmits, check link partner\n"); 3460 } 3461 3462 tx->watchdog_req = tx->req; 3463 tx->watchdog_done = tx->done; 3464 tx->watchdog_rx_pause = rx_pause; 3465 3466 if (sc->need_media_probe) 3467 mxge_media_probe(sc); 3468 } 3469 3470 static void 3471 mxge_update_stats(mxge_softc_t *sc) 3472 { 3473 struct mxge_slice_state *ss; 3474 u_long ipackets = 0; 3475 int slice; 3476 3477 for(slice = 0; slice < sc->num_slices; slice++) { 3478 ss = &sc->ss[slice]; 3479 ipackets += ss->ipackets; 3480 } 3481 sc->ifp->if_ipackets = ipackets; 3482 3483 } 3484 static void 3485 mxge_tick(void *arg) 3486 { 3487 mxge_softc_t *sc = arg; 
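	/*
	 * A stale tick is detected via the callout flags below:
	 * callout_pending() is true if the callout was rescheduled
	 * while this invocation waited for the driver mutex, and
	 * callout_active() is cleared by callout_stop().
	 */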
	/* Synchronize with possible callout reset/stop. */
	if (callout_pending(&sc->co_hdl) ||
	    !callout_active(&sc->co_hdl)) {
		mtx_unlock(&sc->driver_mtx);
		return;
	}

	/* aggregate stats from different slices */
	mxge_update_stats(sc);

	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	if (!sc->watchdog_countdown) {
		mxge_watchdog(sc);
		sc->watchdog_countdown = 4;
	}
	sc->watchdog_countdown--;
}

static int
mxge_media_change(struct ifnet *ifp)
{
	return EINVAL;
}

static int
mxge_change_mtu(mxge_softc_t *sc, int mtu)
{
	struct ifnet *ifp = sc->ifp;
	int real_mtu, old_mtu;
	int err = 0;

	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
		return EINVAL;
	mtx_lock(&sc->driver_mtx);
	old_mtu = ifp->if_mtu;
	ifp->if_mtu = mtu;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		callout_stop(&sc->co_hdl);
		mxge_close(sc);
		err = mxge_open(sc);
		if (err != 0) {
			ifp->if_mtu = old_mtu;
			mxge_close(sc);
			(void) mxge_open(sc);
		}
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static void
mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	mxge_softc_t *sc = ifp->if_softc;

	if (sc == NULL)
		return;
	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
}

static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				err = mxge_open(sc);
				callout_reset(&sc->co_hdl, mxge_ticks,
					      mxge_tick, sc);
			} else {
				/* take care of promisc and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				callout_stop(&sc->co_hdl);
				mxge_close(sc);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		} else if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask
& IFCAP_TSO4) { 3627 if (IFCAP_TSO4 & ifp->if_capenable) { 3628 ifp->if_capenable &= ~IFCAP_TSO4; 3629 ifp->if_hwassist &= ~CSUM_TSO; 3630 } else if (IFCAP_TXCSUM & ifp->if_capenable) { 3631 ifp->if_capenable |= IFCAP_TSO4; 3632 ifp->if_hwassist |= CSUM_TSO; 3633 } else { 3634 printf("mxge requires tx checksum offload" 3635 " be enabled to use TSO\n"); 3636 err = EINVAL; 3637 } 3638 } 3639 if (mask & IFCAP_LRO) { 3640 if (IFCAP_LRO & ifp->if_capenable) 3641 err = mxge_change_lro_locked(sc, 0); 3642 else 3643 err = mxge_change_lro_locked(sc, mxge_lro_cnt); 3644 } 3645 if (mask & IFCAP_VLAN_HWTAGGING) 3646 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; 3647 mtx_unlock(&sc->driver_mtx); 3648 VLAN_CAPABILITIES(ifp); 3649 3650 break; 3651 3652 case SIOCGIFMEDIA: 3653 err = ifmedia_ioctl(ifp, (struct ifreq *)data, 3654 &sc->media, command); 3655 break; 3656 3657 default: 3658 err = ENOTTY; 3659 } 3660 return err; 3661 } 3662 3663 static void 3664 mxge_fetch_tunables(mxge_softc_t *sc) 3665 { 3666 3667 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices); 3668 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled", 3669 &mxge_flow_control); 3670 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay", 3671 &mxge_intr_coal_delay); 3672 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable", 3673 &mxge_nvidia_ecrc_enable); 3674 TUNABLE_INT_FETCH("hw.mxge.force_firmware", 3675 &mxge_force_firmware); 3676 TUNABLE_INT_FETCH("hw.mxge.deassert_wait", 3677 &mxge_deassert_wait); 3678 TUNABLE_INT_FETCH("hw.mxge.verbose", 3679 &mxge_verbose); 3680 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks); 3681 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt); 3682 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc); 3683 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type); 3684 if (sc->lro_cnt != 0) 3685 mxge_lro_cnt = sc->lro_cnt; 3686 3687 if (bootverbose) 3688 mxge_verbose = 1; 3689 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000) 3690 mxge_intr_coal_delay = 30; 3691 if (mxge_ticks == 0) 3692 mxge_ticks = hz / 2; 3693 sc->pause = mxge_flow_control; 3694 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4 3695 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_SRC_PORT) { 3696 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT; 3697 } 3698 } 3699 3700 3701 static void 3702 mxge_free_slices(mxge_softc_t *sc) 3703 { 3704 struct mxge_slice_state *ss; 3705 int i; 3706 3707 3708 if (sc->ss == NULL) 3709 return; 3710 3711 for (i = 0; i < sc->num_slices; i++) { 3712 ss = &sc->ss[i]; 3713 if (ss->fw_stats != NULL) { 3714 mxge_dma_free(&ss->fw_stats_dma); 3715 ss->fw_stats = NULL; 3716 mtx_destroy(&ss->tx.mtx); 3717 } 3718 if (ss->rx_done.entry != NULL) { 3719 mxge_dma_free(&ss->rx_done.dma); 3720 ss->rx_done.entry = NULL; 3721 } 3722 } 3723 free(sc->ss, M_DEVBUF); 3724 sc->ss = NULL; 3725 } 3726 3727 static int 3728 mxge_alloc_slices(mxge_softc_t *sc) 3729 { 3730 mxge_cmd_t cmd; 3731 struct mxge_slice_state *ss; 3732 size_t bytes; 3733 int err, i, max_intr_slots; 3734 3735 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd); 3736 if (err != 0) { 3737 device_printf(sc->dev, "Cannot determine rx ring size\n"); 3738 return err; 3739 } 3740 sc->rx_ring_size = cmd.data0; 3741 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t)); 3742 3743 bytes = sizeof (*sc->ss) * sc->num_slices; 3744 sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO); 3745 if (sc->ss == NULL) 3746 return (ENOMEM); 3747 for (i = 0; i < sc->num_slices; i++) { 3748 ss = &sc->ss[i]; 3749 3750 ss->sc = sc; 3751 3752 /* allocate 
per-slice rx interrupt queues */ 3753 3754 bytes = max_intr_slots * sizeof (*ss->rx_done.entry); 3755 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096); 3756 if (err != 0) 3757 goto abort; 3758 ss->rx_done.entry = ss->rx_done.dma.addr; 3759 bzero(ss->rx_done.entry, bytes); 3760 3761 /* 3762 * allocate the per-slice firmware stats; stats 3763 * (including tx) are used used only on the first 3764 * slice for now 3765 */ 3766 if (i > 0) 3767 continue; 3768 3769 bytes = sizeof (*ss->fw_stats); 3770 err = mxge_dma_alloc(sc, &ss->fw_stats_dma, 3771 sizeof (*ss->fw_stats), 64); 3772 if (err != 0) 3773 goto abort; 3774 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr; 3775 snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name), 3776 "%s:tx(%d)", device_get_nameunit(sc->dev), i); 3777 mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF); 3778 } 3779 3780 return (0); 3781 3782 abort: 3783 mxge_free_slices(sc); 3784 return (ENOMEM); 3785 } 3786 3787 static void 3788 mxge_slice_probe(mxge_softc_t *sc) 3789 { 3790 mxge_cmd_t cmd; 3791 char *old_fw; 3792 int msix_cnt, status, max_intr_slots; 3793 3794 sc->num_slices = 1; 3795 /* 3796 * don't enable multiple slices if they are not enabled, 3797 * or if this is not an SMP system 3798 */ 3799 3800 if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2) 3801 return; 3802 3803 /* see how many MSI-X interrupts are available */ 3804 msix_cnt = pci_msix_count(sc->dev); 3805 if (msix_cnt < 2) 3806 return; 3807 3808 /* now load the slice aware firmware see what it supports */ 3809 old_fw = sc->fw_name; 3810 if (old_fw == mxge_fw_aligned) 3811 sc->fw_name = mxge_fw_rss_aligned; 3812 else 3813 sc->fw_name = mxge_fw_rss_unaligned; 3814 status = mxge_load_firmware(sc, 0); 3815 if (status != 0) { 3816 device_printf(sc->dev, "Falling back to a single slice\n"); 3817 return; 3818 } 3819 3820 /* try to send a reset command to the card to see if it 3821 is alive */ 3822 memset(&cmd, 0, sizeof (cmd)); 3823 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd); 3824 if (status != 0) { 3825 device_printf(sc->dev, "failed reset\n"); 3826 goto abort_with_fw; 3827 } 3828 3829 /* get rx ring size */ 3830 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd); 3831 if (status != 0) { 3832 device_printf(sc->dev, "Cannot determine rx ring size\n"); 3833 goto abort_with_fw; 3834 } 3835 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t)); 3836 3837 /* tell it the size of the interrupt queues */ 3838 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot); 3839 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd); 3840 if (status != 0) { 3841 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n"); 3842 goto abort_with_fw; 3843 } 3844 3845 /* ask the maximum number of slices it supports */ 3846 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd); 3847 if (status != 0) { 3848 device_printf(sc->dev, 3849 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n"); 3850 goto abort_with_fw; 3851 } 3852 sc->num_slices = cmd.data0; 3853 if (sc->num_slices > msix_cnt) 3854 sc->num_slices = msix_cnt; 3855 3856 if (mxge_max_slices == -1) { 3857 /* cap to number of CPUs in system */ 3858 if (sc->num_slices > mp_ncpus) 3859 sc->num_slices = mp_ncpus; 3860 } else { 3861 if (sc->num_slices > mxge_max_slices) 3862 sc->num_slices = mxge_max_slices; 3863 } 3864 /* make sure it is a power of two */ 3865 while (sc->num_slices & (sc->num_slices - 1)) 3866 sc->num_slices--; 3867 3868 if (mxge_verbose) 3869 device_printf(sc->dev, "using %d slices\n", 3870 sc->num_slices); 
3871 3872 return; 3873 3874 abort_with_fw: 3875 sc->fw_name = old_fw; 3876 (void) mxge_load_firmware(sc, 0); 3877 } 3878 3879 static int 3880 mxge_add_msix_irqs(mxge_softc_t *sc) 3881 { 3882 size_t bytes; 3883 int count, err, i, rid; 3884 3885 rid = PCIR_BAR(2); 3886 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, 3887 &rid, RF_ACTIVE); 3888 3889 if (sc->msix_table_res == NULL) { 3890 device_printf(sc->dev, "couldn't alloc MSIX table res\n"); 3891 return ENXIO; 3892 } 3893 3894 count = sc->num_slices; 3895 err = pci_alloc_msix(sc->dev, &count); 3896 if (err != 0) { 3897 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d" 3898 "err = %d \n", sc->num_slices, err); 3899 goto abort_with_msix_table; 3900 } 3901 if (count < sc->num_slices) { 3902 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n", 3903 count, sc->num_slices); 3904 device_printf(sc->dev, 3905 "Try setting hw.mxge.max_slices to %d\n", 3906 count); 3907 err = ENOSPC; 3908 goto abort_with_msix; 3909 } 3910 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices; 3911 sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO); 3912 if (sc->msix_irq_res == NULL) { 3913 err = ENOMEM; 3914 goto abort_with_msix; 3915 } 3916 3917 for (i = 0; i < sc->num_slices; i++) { 3918 rid = i + 1; 3919 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev, 3920 SYS_RES_IRQ, 3921 &rid, RF_ACTIVE); 3922 if (sc->msix_irq_res[i] == NULL) { 3923 device_printf(sc->dev, "couldn't allocate IRQ res" 3924 " for message %d\n", i); 3925 err = ENXIO; 3926 goto abort_with_res; 3927 } 3928 } 3929 3930 bytes = sizeof (*sc->msix_ih) * sc->num_slices; 3931 sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO); 3932 3933 for (i = 0; i < sc->num_slices; i++) { 3934 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i], 3935 INTR_TYPE_NET | INTR_MPSAFE, 3936 NULL, mxge_intr, &sc->ss[i], 3937 &sc->msix_ih[i]); 3938 if (err != 0) { 3939 device_printf(sc->dev, "couldn't setup intr for " 3940 "message %d\n", i); 3941 goto abort_with_intr; 3942 } 3943 } 3944 3945 if (mxge_verbose) { 3946 device_printf(sc->dev, "using %d msix IRQs:", 3947 sc->num_slices); 3948 for (i = 0; i < sc->num_slices; i++) 3949 printf(" %ld", rman_get_start(sc->msix_irq_res[i])); 3950 printf("\n"); 3951 } 3952 return (0); 3953 3954 abort_with_intr: 3955 for (i = 0; i < sc->num_slices; i++) { 3956 if (sc->msix_ih[i] != NULL) { 3957 bus_teardown_intr(sc->dev, sc->msix_irq_res[i], 3958 sc->msix_ih[i]); 3959 sc->msix_ih[i] = NULL; 3960 } 3961 } 3962 free(sc->msix_ih, M_DEVBUF); 3963 3964 3965 abort_with_res: 3966 for (i = 0; i < sc->num_slices; i++) { 3967 rid = i + 1; 3968 if (sc->msix_irq_res[i] != NULL) 3969 bus_release_resource(sc->dev, SYS_RES_IRQ, rid, 3970 sc->msix_irq_res[i]); 3971 sc->msix_irq_res[i] = NULL; 3972 } 3973 free(sc->msix_irq_res, M_DEVBUF); 3974 3975 3976 abort_with_msix: 3977 pci_release_msi(sc->dev); 3978 3979 abort_with_msix_table: 3980 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2), 3981 sc->msix_table_res); 3982 3983 return err; 3984 } 3985 3986 static int 3987 mxge_add_single_irq(mxge_softc_t *sc) 3988 { 3989 int count, err, rid; 3990 3991 count = pci_msi_count(sc->dev); 3992 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) { 3993 rid = 1; 3994 sc->msi_enabled = 1; 3995 } else { 3996 rid = 0; 3997 } 3998 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0, 3999 1, RF_SHAREABLE | RF_ACTIVE); 4000 if (sc->irq_res == NULL) { 4001 device_printf(sc->dev, "could not alloc interrupt\n"); 4002 return ENXIO; 4003 } 4004 if 
static int
mxge_add_single_irq(mxge_softc_t *sc)
{
	int count, err, rid;

	count = pci_msi_count(sc->dev);
	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
		rid = 1;
		sc->msi_enabled = 1;
	} else {
		rid = 0;
	}
	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
					 1, RF_SHAREABLE | RF_ACTIVE);
	if (sc->irq_res == NULL) {
		device_printf(sc->dev, "could not alloc interrupt\n");
		return ENXIO;
	}
	if (mxge_verbose)
		device_printf(sc->dev, "using %s irq %ld\n",
			      sc->msi_enabled ? "MSI" : "INTx",
			      rman_get_start(sc->irq_res));
	err = bus_setup_intr(sc->dev, sc->irq_res,
			     INTR_TYPE_NET | INTR_MPSAFE,
			     NULL, mxge_intr, &sc->ss[0], &sc->ih);
	if (err != 0) {
		bus_release_resource(sc->dev, SYS_RES_IRQ,
				     sc->msi_enabled ? 1 : 0, sc->irq_res);
		if (sc->msi_enabled)
			pci_release_msi(sc->dev);
	}
	return err;
}

static void
mxge_rem_msix_irqs(mxge_softc_t *sc)
{
	int i, rid;

	for (i = 0; i < sc->num_slices; i++) {
		if (sc->msix_ih[i] != NULL) {
			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
					  sc->msix_ih[i]);
			sc->msix_ih[i] = NULL;
		}
	}
	free(sc->msix_ih, M_DEVBUF);

	for (i = 0; i < sc->num_slices; i++) {
		rid = i + 1;
		if (sc->msix_irq_res[i] != NULL)
			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
					     sc->msix_irq_res[i]);
		sc->msix_irq_res[i] = NULL;
	}
	free(sc->msix_irq_res, M_DEVBUF);

	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
			     sc->msix_table_res);

	pci_release_msi(sc->dev);
}

static void
mxge_rem_single_irq(mxge_softc_t *sc)
{
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	bus_release_resource(sc->dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(sc->dev);
}

static void
mxge_rem_irq(mxge_softc_t *sc)
{
	if (sc->num_slices > 1)
		mxge_rem_msix_irqs(sc);
	else
		mxge_rem_single_irq(sc);
}

static int
mxge_add_irq(mxge_softc_t *sc)
{
	int err;

	if (sc->num_slices > 1)
		err = mxge_add_msix_irqs(sc);
	else
		err = mxge_add_single_irq(sc);

	/* (disabled) debugging aid: tear down and re-add the MSI-X
	   interrupts to exercise the teardown path */
	if (0 && err == 0 && sc->num_slices > 1) {
		mxge_rem_msix_irqs(sc);
		err = mxge_add_msix_irqs(sc);
	}
	return err;
}
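
/*
 * Device attach: resources are acquired in dependency order (parent
 * DMA tag, ifnet, locks, PCI BAR mapping, EEPROM strings, firmware,
 * slices, rings, interrupts), and the abort_with_* labels at the
 * bottom unwind them in exactly the reverse order on failure.
 */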
static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	int err, rid;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	err = bus_dma_tag_create(NULL,			/* parent */
				 1,			/* alignment */
				 0,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC,	/* num segs */
				 65536,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_nothing;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_zeropad_dma;

	/* select & load the firmware */
	err = mxge_select_firmware(sc);
	if (err != 0)
		goto abort_with_dmabench;
	sc->intr_coal_delay = mxge_intr_coal_delay;

	mxge_slice_probe(sc);
	err = mxge_alloc_slices(sc);
	if (err != 0)
		goto abort_with_dmabench;

	err = mxge_reset(sc, 0);
	if (err != 0)
		goto abort_with_slices;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_slices;
	}

	err = mxge_add_irq(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to add irq\n");
		goto abort_with_rings;
	}

	ifp->if_baudrate = 100000000;
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING |
		IFCAP_VLAN_HWCSUM | IFCAP_LRO;

	sc->max_mtu = mxge_max_mtu(sc);
	if (sc->max_mtu >= 9000)
		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
	else
		device_printf(dev, "MTU limited to %d.  Install "
			      "latest firmware for 9000 byte jumbo support\n",
			      sc->max_mtu - ETHER_HDR_LEN);
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_capenable = ifp->if_capabilities;
	if (sc->lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	sc->csum_flag = 1;
	ifp->if_init = mxge_init;
	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = mxge_ioctl;
	ifp->if_start = mxge_start;
	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
	mxge_media_probe(sc);
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to 1500 */
	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
		ifp->if_mtu = 9000;

	mxge_add_sysctls(sc);
	return 0;

abort_with_rings:
	mxge_free_rings(sc);
abort_with_slices:
	mxge_free_slices(sc);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);

abort_with_nothing:
	return err;
}
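
/*
 * Detach refuses to proceed while vlan interfaces are still attached
 * to the trunk; once that check passes, teardown mirrors the attach
 * sequence in reverse.
 */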
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	if (sc->ifp->if_vlantrunk != NULL) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	mtx_lock(&sc->driver_mtx);
	callout_stop(&sc->co_hdl);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	ifmedia_removeall(&sc->media);
	mxge_dummy_rdma(sc, 0);
	mxge_rem_sysctls(sc);
	mxge_rem_irq(sc);
	mxge_free_rings(sc);
	mxge_free_slices(sc);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}

static int
mxge_shutdown(device_t dev)
{
	return 0;
}

/*
  This file uses Myri10GE driver indentation.

  Local Variables:
  c-file-style:"linux"
  tab-width:8
  End:
*/