xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 282a3889ebf826db9839be296ff1dd903f6d6d6e)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2007, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/memrange.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/sx.h>
49 
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <machine/bus.h>
68 #include <machine/in_cksum.h>
69 #include <machine/resource.h>
70 #include <sys/bus.h>
71 #include <sys/rman.h>
72 
73 #include <dev/pci/pcireg.h>
74 #include <dev/pci/pcivar.h>
75 
76 #include <vm/vm.h>		/* for pmap_mapdev() */
77 #include <vm/pmap.h>
78 
79 #if defined(__i386) || defined(__amd64)
80 #include <machine/specialreg.h>
81 #endif
82 
83 #include <dev/mxge/mxge_mcp.h>
84 #include <dev/mxge/mcp_gen_header.h>
85 #include <dev/mxge/if_mxge_var.h>
86 
87 /* tunable params */
88 static int mxge_nvidia_ecrc_enable = 1;
89 static int mxge_force_firmware = 0;
90 static int mxge_intr_coal_delay = 30;
91 static int mxge_deassert_wait = 1;
92 static int mxge_flow_control = 1;
93 static int mxge_verbose = 0;
94 static int mxge_lro_cnt = 8;
95 static int mxge_ticks;
96 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
97 static char *mxge_fw_aligned = "mxge_eth_z8e";
98 
99 static int mxge_probe(device_t dev);
100 static int mxge_attach(device_t dev);
101 static int mxge_detach(device_t dev);
102 static int mxge_shutdown(device_t dev);
103 static void mxge_intr(void *arg);
104 
105 static device_method_t mxge_methods[] =
106 {
107   /* Device interface */
108   DEVMETHOD(device_probe, mxge_probe),
109   DEVMETHOD(device_attach, mxge_attach),
110   DEVMETHOD(device_detach, mxge_detach),
111   DEVMETHOD(device_shutdown, mxge_shutdown),
112   {0, 0}
113 };
114 
115 static driver_t mxge_driver =
116 {
117   "mxge",
118   mxge_methods,
119   sizeof(mxge_softc_t),
120 };
121 
122 static devclass_t mxge_devclass;
123 
124 /* Declare ourselves to be a child of the PCI bus. */
125 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
126 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
127 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
128 
129 static int mxge_load_firmware(mxge_softc_t *sc);
130 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
131 static int mxge_close(mxge_softc_t *sc);
132 static int mxge_open(mxge_softc_t *sc);
133 static void mxge_tick(void *arg);
134 
135 static int
136 mxge_probe(device_t dev)
137 {
138   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
139       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
140 	  device_set_desc(dev, "Myri10G-PCIE-8A");
141 	  return 0;
142   }
143   return ENXIO;
144 }
145 
146 static void
147 mxge_enable_wc(mxge_softc_t *sc)
148 {
149 #if defined(__i386) || defined(__amd64)
150 	struct mem_range_desc mrdesc;
151 	vm_paddr_t pa;
152 	vm_offset_t len;
153 	int err, action;
154 
155 	sc->wc = 1;
156 	len = rman_get_size(sc->mem_res);
157 	err = pmap_change_attr((vm_offset_t) sc->sram,
158 			       len, PAT_WRITE_COMBINING);
159 	if (err == 0)
160 		return;
161 	else
162 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
163 			      err);
164 	pa = rman_get_start(sc->mem_res);
165 	mrdesc.mr_base = pa;
166 	mrdesc.mr_len = len;
167 	mrdesc.mr_flags = MDF_WRITECOMBINE;
168 	action = MEMRANGE_SET_UPDATE;
169 	strcpy((char *)&mrdesc.mr_owner, "mxge");
170 	err = mem_range_attr_set(&mrdesc, &action);
171 	if (err != 0) {
172 		sc->wc = 0;
173 		device_printf(sc->dev,
174 			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
175 			      (unsigned long)pa, (unsigned long)len, err);
176 	}
177 #endif
178 }
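/*
 * Note the fallback order above: pmap_change_attr() goes through the
 * PAT and consumes no scarce MTRR slot, so it is tried first;
 * mem_range_attr_set() is the MTRR path for hosts where the PAT
 * route fails.
 */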
179 
180 
181 /* callback to get our DMA address */
182 static void
183 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
184 			 int error)
185 {
186 	if (error == 0) {
187 		*(bus_addr_t *) arg = segs->ds_addr;
188 	}
189 }
190 
191 static int
192 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
193 		   bus_size_t alignment)
194 {
195 	int err;
196 	device_t dev = sc->dev;
197 
198 	/* allocate DMAable memory tags */
199 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
200 				 alignment,		/* alignment */
201 				 4096,			/* boundary */
202 				 BUS_SPACE_MAXADDR,	/* low */
203 				 BUS_SPACE_MAXADDR,	/* high */
204 				 NULL, NULL,		/* filter */
205 				 bytes,			/* maxsize */
206 				 1,			/* num segs */
207 				 4096,			/* maxsegsize */
208 				 BUS_DMA_COHERENT,	/* flags */
209 				 NULL, NULL,		/* lock */
210 				 &dma->dmat);		/* tag */
211 	if (err != 0) {
212 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
213 		return err;
214 	}
215 
216 	/* allocate DMAable memory & map */
217 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
218 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
219 				| BUS_DMA_ZERO),  &dma->map);
220 	if (err != 0) {
221 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
222 		goto abort_with_dmat;
223 	}
224 
225 	/* load the memory */
226 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
227 			      mxge_dmamap_callback,
228 			      (void *)&dma->bus_addr, 0);
229 	if (err != 0) {
230 		device_printf(dev, "couldn't load map (err = %d)\n", err);
231 		goto abort_with_mem;
232 	}
233 	return 0;
234 
235 abort_with_mem:
236 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
237 abort_with_dmat:
238 	(void)bus_dma_tag_destroy(dma->dmat);
239 	return err;
240 }
241 
242 
243 static void
244 mxge_dma_free(mxge_dma_t *dma)
245 {
246 	bus_dmamap_unload(dma->dmat, dma->map);
247 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
248 	(void)bus_dma_tag_destroy(dma->dmat);
249 }
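/*
 * A minimal usage sketch for the two helpers above (the size and
 * alignment here are illustrative, not values the hardware requires):
 *
 *	mxge_dma_t dma;
 *
 *	if (mxge_dma_alloc(sc, &dma, 4096, 64) != 0)
 *		return (ENOMEM);
 *	bzero(dma.addr, 4096);
 *	... hand dma.bus_addr (a bus_addr_t) to the NIC ...
 *	mxge_dma_free(&dma);
 */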
250 
251 /*
252  * The eeprom strings on the lanaiX have the format
253  * SN=x\0
254  * MAC=x:x:x:x:x:x\0
255  * PC=text\0
256  */
257 
258 static int
259 mxge_parse_strings(mxge_softc_t *sc)
260 {
261 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)
262 
263 	char *ptr, *limit;
264 	int i, found_mac;
265 
266 	ptr = sc->eeprom_strings;
267 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
268 	found_mac = 0;
269 	while (ptr < limit && *ptr != '\0') {
270 		if (memcmp(ptr, "MAC=", 4) == 0) {
271 			ptr += 1;
272 			sc->mac_addr_string = ptr;
273 			for (i = 0; i < 6; i++) {
274 				ptr += 3;
275 				if ((ptr + 2) > limit)
276 					goto abort;
277 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
278 				found_mac = 1;
279 			}
280 		} else if (memcmp(ptr, "PC=", 3) == 0) {
281 			ptr += 3;
282 			strncpy(sc->product_code_string, ptr,
283 				sizeof (sc->product_code_string) - 1);
284 		} else if (memcmp(ptr, "SN=", 3) == 0) {
285 			ptr += 3;
286 			strncpy(sc->serial_number_string, ptr,
287 				sizeof (sc->serial_number_string) - 1);
288 		}
289 		MXGE_NEXT_STRING(ptr);
290 	}
291 
292 	if (found_mac)
293 		return 0;
294 
295  abort:
296 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
297 
298 	return ENXIO;
299 }
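/*
 * Example walk, with hypothetical eeprom contents "MAC=00:60:dd:47:ab:cd":
 * ptr starts on 'M'; the initial "ptr += 1" plus the "ptr += 3" at the
 * top of each loop pass leaves ptr on the first hex digit of each octet,
 * and strtoul(ptr, NULL, 16) stops at the following ':' on its own, so
 * no explicit delimiter handling is needed.
 */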
300 
301 #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
302 static void
303 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
304 {
305 	uint32_t val;
306 	unsigned long base, off;
307 	char *va, *cfgptr;
308 	device_t pdev, mcp55;
309 	uint16_t vendor_id, device_id, word;
310 	uintptr_t bus, slot, func, ivend, idev;
311 	uint32_t *ptr32;
312 
313 
314 	if (!mxge_nvidia_ecrc_enable)
315 		return;
316 
317 	pdev = device_get_parent(device_get_parent(sc->dev));
318 	if (pdev == NULL) {
319 		device_printf(sc->dev, "could not find parent?\n");
320 		return;
321 	}
322 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
323 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
324 
325 	if (vendor_id != 0x10de)
326 		return;
327 
328 	base = 0;
329 
330 	if (device_id == 0x005d) {
331 		/* ck804, base address is magic */
332 		base = 0xe0000000UL;
333 	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
334 		/* mcp55, base address stored in chipset */
335 		mcp55 = pci_find_bsf(0, 0, 0);
336 		if (mcp55 &&
337 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
338 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
339 			word = pci_read_config(mcp55, 0x90, 2);
340 			base = ((unsigned long)word & 0x7ffeU) << 25;
341 		}
342 	}
343 	if (!base)
344 		return;
345 
346 	/* XXXX
347 	   The test below is commented out because it is believed that
348 	   doing config reads/writes beyond 0xff will access the config
349 	   space of the next larger function.  Uncomment this and remove
350 	   the hacky pmap_mapdev() way of accessing config space once
351 	   FreeBSD grows support for extended PCIe config space access.
352 	*/
353 #if 0
354 	/* See if we can, by some miracle, access the extended
355 	   config space */
356 	val = pci_read_config(pdev, 0x178, 4);
357 	if (val != 0xffffffff) {
358 		val |= 0x40;
359 		pci_write_config(pdev, 0x178, val, 4);
360 		return;
361 	}
362 #endif
363 	/* Rather than using normal pci config space writes, we must
364 	 * map the Nvidia config space ourselves.  This is because on
365 	 * opteron/nvidia class machines the 0xe0000000 mapping is
366 	 * handled by the nvidia chipset, which means the internal PCI
367 	 * device (the on-chip northbridge), or the amd-8131 bridge
368 	 * and things behind them, are not visible by this method.
369 	 */
370 
371 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
372 		      PCI_IVAR_BUS, &bus);
373 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
374 		      PCI_IVAR_SLOT, &slot);
375 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
376 		      PCI_IVAR_FUNCTION, &func);
377 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
378 		      PCI_IVAR_VENDOR, &ivend);
379 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
380 		      PCI_IVAR_DEVICE, &idev);
381 
382 	off =  base
383 		+ 0x00100000UL * (unsigned long)bus
384 		+ 0x00001000UL * (unsigned long)(func
385 						 + 8 * slot);
386 
387 	/* map it into the kernel */
388 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
389 
390 
391 	if (va == NULL) {
392 		device_printf(sc->dev, "pmap_mapdev() failed\n");
393 		return;
394 	}
395 	/* get a pointer to the config space mapped into the kernel */
396 	cfgptr = va + (off & PAGE_MASK);
397 
398 	/* make sure that we can really access it */
399 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
400 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
401 	if (! (vendor_id == ivend && device_id == idev)) {
402 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
403 			      vendor_id, device_id);
404 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
405 		return;
406 	}
407 
408 	ptr32 = (uint32_t*)(cfgptr + 0x178);
409 	val = *ptr32;
410 
411 	if (val == 0xffffffff) {
412 		device_printf(sc->dev, "extended mapping failed\n");
413 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
414 		return;
415 	}
416 	*ptr32 = val | 0x40;
417 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
418 	if (mxge_verbose)
419 		device_printf(sc->dev,
420 			      "Enabled ECRC on upstream Nvidia bridge "
421 			      "at %d:%d:%d\n",
422 			      (int)bus, (int)slot, (int)func);
423 	return;
424 }
425 #else
426 static void
427 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
428 {
429 	device_printf(sc->dev,
430 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
431 	return;
432 }
433 #endif
434 
435 
436 static int
437 mxge_dma_test(mxge_softc_t *sc, int test_type)
438 {
439 	mxge_cmd_t cmd;
440 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
441 	int status;
442 	uint32_t len;
443 	char *test = " ";
444 
445 
446 	/* Run a small DMA test.
447 	 * The magic multipliers to the length tell the firmware
448 	 * to do DMA read, write, or read+write tests.  The
449 	 * results are returned in cmd.data0.  The upper 16
450 	 * bits of the return is the number of transfers completed.
451 	 * The lower 16 bits is the time in 0.5us ticks that the
452 	 * transfers took to complete.
453 	 */
454 
455 	len = sc->tx.boundary;
456 
457 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
458 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
459 	cmd.data2 = len * 0x10000;
460 	status = mxge_send_cmd(sc, test_type, &cmd);
461 	if (status != 0) {
462 		test = "read";
463 		goto abort;
464 	}
465 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
466 		(cmd.data0 & 0xffff);
467 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
468 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
469 	cmd.data2 = len * 0x1;
470 	status = mxge_send_cmd(sc, test_type, &cmd);
471 	if (status != 0) {
472 		test = "write";
473 		goto abort;
474 	}
475 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
476 		(cmd.data0 & 0xffff);
477 
478 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
479 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
480 	cmd.data2 = len * 0x10001;
481 	status = mxge_send_cmd(sc, test_type, &cmd);
482 	if (status != 0) {
483 		test = "read/write";
484 		goto abort;
485 	}
486 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
487 		(cmd.data0 & 0xffff);
488 
489 abort:
490 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
491 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
492 			      test, status);
493 
494 	return status;
495 }
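/*
 * Worked example of the encoding described above (numbers invented):
 * if the read test returns cmd.data0 == 0x01000400 with len == 4096,
 * then 0x0100 (256) transfers completed in 0x0400 (1024) ticks of
 * 0.5us, and read_dma = (256 * 4096 * 2) / 1024 = 2048, i.e. roughly
 * 2048 MB/s (the "* 2" converts half-microsecond ticks to bytes/us).
 */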
496 
497 /*
498  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
499  * when the PCI-E Completion packets are aligned on an 8-byte
500  * boundary.  Some PCI-E chip sets always align Completion packets; on
501  * the ones that do not, the alignment can be enforced by enabling
502  * ECRC generation (if supported).
503  *
504  * When PCI-E Completion packets are not aligned, it is actually more
505  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
506  *
507  * If the driver can neither enable ECRC nor verify that it has
508  * already been enabled, then it must use a firmware image which works
509  * around unaligned completion packets (ethp_z8e.dat), and it should
510  * also ensure that it never gives the device a Read-DMA which is
511  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
512  * enabled, then the driver should use the aligned (eth_z8e.dat)
513  * firmware image, and set tx.boundary to 4KB.
514  */
515 
516 static int
517 mxge_firmware_probe(mxge_softc_t *sc)
518 {
519 	device_t dev = sc->dev;
520 	int reg, status;
521 	uint16_t pectl;
522 
523 	sc->tx.boundary = 4096;
524 	/*
525 	 * Verify the max read request size was set to 4KB
526 	 * before trying the test with 4KB.
527 	 */
528 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
529 		pectl = pci_read_config(dev, reg + 0x8, 2);
530 		if ((pectl & (5 << 12)) != (5 << 12)) {
531 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
532 				      pectl);
533 			sc->tx.boundary = 2048;
534 		}
535 	}
536 
537 	/*
538 	 * load the optimized firmware (which assumes aligned PCIe
539 	 * completions) in order to see if it works on this host.
540 	 */
541 	sc->fw_name = mxge_fw_aligned;
542 	status = mxge_load_firmware(sc);
543 	if (status != 0) {
544 		return status;
545 	}
546 
547 	/*
548 	 * Enable ECRC if possible
549 	 */
550 	mxge_enable_nvidia_ecrc(sc);
551 
552 	/*
553 	 * Run a DMA test which watches for unaligned completions and
554 	 * aborts on the first one seen.
555 	 */
556 
557 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
558 	if (status == 0)
559 		return 0; /* keep the aligned firmware */
560 
561 	if (status != E2BIG)
562 		device_printf(dev, "DMA test failed: %d\n", status);
563 	if (status == ENOSYS)
564 		device_printf(dev, "Falling back to ethp! "
565 			      "Please install up to date fw\n");
566 	return status;
567 }
568 
569 static int
570 mxge_select_firmware(mxge_softc_t *sc)
571 {
572 	int aligned = 0;
573 
574 
575 	if (mxge_force_firmware != 0) {
576 		if (mxge_force_firmware == 1)
577 			aligned = 1;
578 		else
579 			aligned = 0;
580 		if (mxge_verbose)
581 			device_printf(sc->dev,
582 				      "Assuming %s completions (forced)\n",
583 				      aligned ? "aligned" : "unaligned");
584 		goto abort;
585 	}
586 
587 	/* if the PCIe link width is 4 or less, we can use the aligned
588 	   firmware and skip any checks */
589 	if (sc->link_width != 0 && sc->link_width <= 4) {
590 		device_printf(sc->dev,
591 			      "PCIe x%d Link, expect reduced performance\n",
592 			      sc->link_width);
593 		aligned = 1;
594 		goto abort;
595 	}
596 
597 	if (0 == mxge_firmware_probe(sc))
598 		return 0;
599 
600 abort:
601 	if (aligned) {
602 		sc->fw_name = mxge_fw_aligned;
603 		sc->tx.boundary = 4096;
604 	} else {
605 		sc->fw_name = mxge_fw_unaligned;
606 		sc->tx.boundary = 2048;
607 	}
608 	return (mxge_load_firmware(sc));
609 }
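/*
 * Net effect of the selection above:
 *
 *	mxge_force_firmware == 1	-> mxge_eth_z8e,  4KB boundary
 *	mxge_force_firmware >= 2	-> mxge_ethp_z8e, 2KB boundary
 *	PCIe link width <= 4		-> mxge_eth_z8e,  4KB boundary
 *	firmware probe succeeds		-> mxge_eth_z8e,  4KB boundary
 *	firmware probe fails		-> mxge_ethp_z8e, 2KB boundary
 */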
610 
611 union qualhack
612 {
613         const char *ro_char;
614         char *rw_char;
615 };
616 
617 static int
618 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
619 {
620 
621 
622 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
623 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
624 			      be32toh(hdr->mcp_type));
625 		return EIO;
626 	}
627 
628 	/* save firmware version for sysctl */
629 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
630 	if (mxge_verbose)
631 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
632 
633 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
634 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
635 
636 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
637 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
638 		device_printf(sc->dev, "Found firmware version %s\n",
639 			      sc->fw_version);
640 		device_printf(sc->dev, "Driver needs %d.%d\n",
641 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
642 		return EINVAL;
643 	}
644 	return 0;
645 
646 }
647 
648 static void *
649 z_alloc(void *nil, u_int items, u_int size)
650 {
651         void *ptr;
652 
653         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
654         return ptr;
655 }
656 
657 static void
658 z_free(void *nil, void *ptr)
659 {
660         free(ptr, M_TEMP);
661 }
662 
663 
664 static int
665 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
666 {
667 	z_stream zs;
668 	char *inflate_buffer;
669 	const struct firmware *fw;
670 	const mcp_gen_header_t *hdr;
671 	unsigned hdr_offset;
672 	int status;
673 	unsigned int i;
674 	char dummy;
675 	size_t fw_len;
676 
677 	fw = firmware_get(sc->fw_name);
678 	if (fw == NULL) {
679 		device_printf(sc->dev, "Could not find firmware image %s\n",
680 			      sc->fw_name);
681 		return ENOENT;
682 	}
683 
684 
685 
686 	/* setup zlib and decompress f/w */
687 	bzero(&zs, sizeof (zs));
688 	zs.zalloc = z_alloc;
689 	zs.zfree = z_free;
690 	status = inflateInit(&zs);
691 	if (status != Z_OK) {
692 		status = EIO;
693 		goto abort_with_fw;
694 	}
695 
696 	/* the uncompressed size is stored as the firmware version,
697 	   which would otherwise go unused */
698 	fw_len = (size_t) fw->version;
699 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
700 	if (inflate_buffer == NULL) {
		status = ENOMEM;	/* a malloc failure must not return the stale Z_OK */
701 		goto abort_with_zs;
	}
702 	zs.avail_in = fw->datasize;
703 	zs.next_in = __DECONST(char *, fw->data);
704 	zs.avail_out = fw_len;
705 	zs.next_out = inflate_buffer;
706 	status = inflate(&zs, Z_FINISH);
707 	if (status != Z_STREAM_END) {
708 		device_printf(sc->dev, "zlib %d\n", status);
709 		status = EIO;
710 		goto abort_with_buffer;
711 	}
712 
713 	/* check id */
714 	hdr_offset = htobe32(*(const uint32_t *)
715 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
716 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
717 		device_printf(sc->dev, "Bad firmware file\n");
718 		status = EIO;
719 		goto abort_with_buffer;
720 	}
721 	hdr = (const void*)(inflate_buffer + hdr_offset);
722 
723 	status = mxge_validate_firmware(sc, hdr);
724 	if (status != 0)
725 		goto abort_with_buffer;
726 
727 	/* Copy the inflated firmware to NIC SRAM. */
728 	for (i = 0; i < fw_len; i += 256) {
729 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
730 			      inflate_buffer + i,
731 			      min(256U, (unsigned)(fw_len - i)));
732 		mb();
733 		dummy = *sc->sram;
734 		mb();
735 	}
736 
737 	*limit = fw_len;
738 	status = 0;
739 abort_with_buffer:
740 	free(inflate_buffer, M_TEMP);
741 abort_with_zs:
742 	inflateEnd(&zs);
743 abort_with_fw:
744 	firmware_put(fw, FIRMWARE_UNLOAD);
745 	return status;
746 }
747 
748 /*
749  * Enable or disable periodic RDMAs from the host to make certain
750  * chipsets resend dropped PCIe messages
751  */
752 
753 static void
754 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
755 {
756 	char buf_bytes[72];
757 	volatile uint32_t *confirm;
758 	volatile char *submit;
759 	uint32_t *buf, dma_low, dma_high;
760 	int i;
761 
762 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
763 
764 	/* clear confirmation addr */
765 	confirm = (volatile uint32_t *)sc->cmd;
766 	*confirm = 0;
767 	mb();
768 
769 	/* send an rdma command to the PCIe engine, and wait for the
770 	   response in the confirmation address.  The firmware should
771 	   write a -1 there to indicate it is alive and well
772 	*/
773 
774 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
775 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
776 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
777 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
778 	buf[2] = htobe32(0xffffffff);		/* confirm data */
779 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
780 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
781 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
782 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
783 	buf[5] = htobe32(enable);			/* enable? */
784 
785 
786 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
787 
788 	mxge_pio_copy(submit, buf, 64);
789 	mb();
790 	DELAY(1000);
791 	mb();
792 	i = 0;
793 	while (*confirm != 0xffffffff && i < 20) {
794 		DELAY(1000);
795 		i++;
796 	}
797 	if (*confirm != 0xffffffff) {
798 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
799 			      (enable ? "enable" : "disable"), confirm,
800 			      *confirm);
801 	}
802 	return;
803 }
804 
805 static int
806 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
807 {
808 	mcp_cmd_t *buf;
809 	char buf_bytes[sizeof(*buf) + 8];
810 	volatile mcp_cmd_response_t *response = sc->cmd;
811 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
812 	uint32_t dma_low, dma_high;
813 	int err, sleep_total = 0;
814 
815 	/* ensure buf is aligned to 8 bytes */
816 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
817 
818 	buf->data0 = htobe32(data->data0);
819 	buf->data1 = htobe32(data->data1);
820 	buf->data2 = htobe32(data->data2);
821 	buf->cmd = htobe32(cmd);
822 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
823 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
824 
825 	buf->response_addr.low = htobe32(dma_low);
826 	buf->response_addr.high = htobe32(dma_high);
827 	mtx_lock(&sc->cmd_mtx);
828 	response->result = 0xffffffff;
829 	mb();
830 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
831 
832 	/* wait up to 20ms */
833 	err = EAGAIN;
834 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
835 		bus_dmamap_sync(sc->cmd_dma.dmat,
836 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
837 		mb();
838 		switch (be32toh(response->result)) {
839 		case 0:
840 			data->data0 = be32toh(response->data);
841 			err = 0;
842 			break;
843 		case 0xffffffff:
844 			DELAY(1000);
845 			break;
846 		case MXGEFW_CMD_UNKNOWN:
847 			err = ENOSYS;
848 			break;
849 		case MXGEFW_CMD_ERROR_UNALIGNED:
850 			err = E2BIG;
851 			break;
852 		default:
853 			device_printf(sc->dev,
854 				      "mxge: command %d "
855 				      "failed, result = %d\n",
856 				      cmd, be32toh(response->result));
857 			err = ENXIO;
858 			break;
859 		}
860 		if (err != EAGAIN)
861 			break;
862 	}
863 	if (err == EAGAIN)
864 		device_printf(sc->dev, "mxge: command %d timed out, "
865 			      "result = %d\n",
866 			      cmd, be32toh(response->result));
867 	mtx_unlock(&sc->cmd_mtx);
868 	return err;
869 }
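/*
 * A minimal caller sketch: arguments go in data0..data2 in host byte
 * order, the reply (if any) comes back in data0, and ENOSYS/E2BIG map
 * the two special firmware error codes handled above.  (new_frame_len
 * is a hypothetical value; MXGEFW_CMD_SET_MTU is defined in
 * mxge_mcp.h.)
 *
 *	mxge_cmd_t cmd;
 *	int err;
 *
 *	cmd.data0 = new_frame_len;
 *	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
 */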
870 
871 static int
872 mxge_adopt_running_firmware(mxge_softc_t *sc)
873 {
874 	struct mcp_gen_header *hdr;
875 	const size_t bytes = sizeof (struct mcp_gen_header);
876 	size_t hdr_offset;
877 	int status;
878 
879 	/* find running firmware header */
880 	hdr_offset = htobe32(*(volatile uint32_t *)
881 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
882 
883 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
884 		device_printf(sc->dev,
885 			      "Running firmware has bad header offset (%d)\n",
886 			      (int)hdr_offset);
887 		return EIO;
888 	}
889 
890 	/* copy header of running firmware from SRAM to host memory to
891 	 * validate firmware */
892 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
893 	if (hdr == NULL) {
894 		device_printf(sc->dev, "could not malloc firmware hdr\n");
895 		return ENOMEM;
896 	}
897 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
898 				rman_get_bushandle(sc->mem_res),
899 				hdr_offset, (char *)hdr, bytes);
900 	status = mxge_validate_firmware(sc, hdr);
901 	free(hdr, M_DEVBUF);
902 
903 	/*
904 	 * check to see if adopted firmware has bug where adopting
905 	 * it will cause broadcasts to be filtered unless the NIC
906 	 * is kept in ALLMULTI mode
907 	 */
908 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
909 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
910 		sc->adopted_rx_filter_bug = 1;
911 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
912 			      "working around rx filter bug\n",
913 			      sc->fw_ver_major, sc->fw_ver_minor,
914 			      sc->fw_ver_tiny);
915 	}
916 
917 	return status;
918 }
919 
920 
921 static int
922 mxge_load_firmware(mxge_softc_t *sc)
923 {
924 	volatile uint32_t *confirm;
925 	volatile char *submit;
926 	char buf_bytes[72];
927 	uint32_t *buf, size, dma_low, dma_high;
928 	int status, i;
929 
930 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
931 
932 	size = sc->sram_size;
933 	status = mxge_load_firmware_helper(sc, &size);
934 	if (status) {
935 		/* Try to use the currently running firmware, if
936 		   it is new enough */
937 		status = mxge_adopt_running_firmware(sc);
938 		if (status) {
939 			device_printf(sc->dev,
940 				      "failed to adopt running firmware\n");
941 			return status;
942 		}
943 		device_printf(sc->dev,
944 			      "Successfully adopted running firmware\n");
945 		if (sc->tx.boundary == 4096) {
946 			device_printf(sc->dev,
947 				"Using firmware currently running on NIC"
948 				 ".  For optimal\n");
949 			device_printf(sc->dev,
950 				 "performance consider loading optimized "
951 				 "firmware\n");
952 		}
953 		sc->fw_name = mxge_fw_unaligned;
954 		sc->tx.boundary = 2048;
955 		return 0;
956 	}
957 	/* clear confirmation addr */
958 	confirm = (volatile uint32_t *)sc->cmd;
959 	*confirm = 0;
960 	mb();
961 	/* send a reload command to the bootstrap MCP, and wait for the
962 	   response in the confirmation address.  The firmware should
963 	   write a -1 there to indicate it is alive and well
964 	*/
965 
966 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
967 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
968 
969 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
970 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
971 	buf[2] = htobe32(0xffffffff);	/* confirm data */
972 
973 	/* FIX: All newest firmware should un-protect the bottom of
974 	   the sram before handoff. However, the very first interfaces
975 	   do not. Therefore the handoff copy must skip the first 8 bytes
976 	*/
977 					/* where the code starts*/
978 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
979 	buf[4] = htobe32(size - 8); 	/* length of code */
980 	buf[5] = htobe32(8);		/* where to copy to */
981 	buf[6] = htobe32(0);		/* where to jump to */
982 
983 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
984 	mxge_pio_copy(submit, buf, 64);
985 	mb();
986 	DELAY(1000);
987 	mb();
988 	i = 0;
989 	while (*confirm != 0xffffffff && i < 20) {
990 		DELAY(1000*10);
991 		i++;
992 		bus_dmamap_sync(sc->cmd_dma.dmat,
993 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
994 	}
995 	if (*confirm != 0xffffffff) {
996 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
997 			confirm, *confirm);
998 
999 		return ENXIO;
1000 	}
1001 	return 0;
1002 }
1003 
1004 static int
1005 mxge_update_mac_address(mxge_softc_t *sc)
1006 {
1007 	mxge_cmd_t cmd;
1008 	uint8_t *addr = sc->mac_addr;
1009 	int status;
1010 
1011 
1012 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1013 		     | (addr[2] << 8) | addr[3]);
1014 
1015 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1016 
1017 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1018 	return status;
1019 }
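/*
 * Packing example: for the (made-up) address 00:60:dd:11:22:33 the
 * command arguments above become data0 = 0x0060dd11 and
 * data1 = 0x00002233, i.e. the six octets laid out left to right
 * across the two words.
 */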
1020 
1021 static int
1022 mxge_change_pause(mxge_softc_t *sc, int pause)
1023 {
1024 	mxge_cmd_t cmd;
1025 	int status;
1026 
1027 	if (pause)
1028 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1029 				       &cmd);
1030 	else
1031 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1032 				       &cmd);
1033 
1034 	if (status) {
1035 		device_printf(sc->dev, "Failed to set flow control mode\n");
1036 		return ENXIO;
1037 	}
1038 	sc->pause = pause;
1039 	return 0;
1040 }
1041 
1042 static void
1043 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1044 {
1045 	mxge_cmd_t cmd;
1046 	int status;
1047 
1048 	if (promisc)
1049 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1050 				       &cmd);
1051 	else
1052 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1053 				       &cmd);
1054 
1055 	if (status) {
1056 		device_printf(sc->dev, "Failed to set promisc mode\n");
1057 	}
1058 }
1059 
1060 static void
1061 mxge_set_multicast_list(mxge_softc_t *sc)
1062 {
1063 	mxge_cmd_t cmd;
1064 	struct ifmultiaddr *ifma;
1065 	struct ifnet *ifp = sc->ifp;
1066 	int err;
1067 
1068 	/* This firmware is known to not support multicast */
1069 	if (!sc->fw_multicast_support)
1070 		return;
1071 
1072 	/* Disable multicast filtering while we play with the lists*/
1073 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1074 	if (err != 0) {
1075 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1076 		       " error status: %d\n", err);
1077 		return;
1078 	}
1079 
1080 	if (sc->adopted_rx_filter_bug)
1081 		return;
1082 
1083 	if (ifp->if_flags & IFF_ALLMULTI)
1084 		/* request to disable multicast filtering, so quit here */
1085 		return;
1086 
1087 	/* Flush all the filters */
1088 
1089 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1090 	if (err != 0) {
1091 		device_printf(sc->dev,
1092 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1093 			      ", error status: %d\n", err);
1094 		return;
1095 	}
1096 
1097 	/* Walk the multicast list, and add each address */
1098 
1099 	IF_ADDR_LOCK(ifp);
1100 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1101 		if (ifma->ifma_addr->sa_family != AF_LINK)
1102 			continue;
1103 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1104 		      &cmd.data0, 4);
1105 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1106 		      &cmd.data1, 2);
1107 		cmd.data0 = htonl(cmd.data0);
1108 		cmd.data1 = htonl(cmd.data1);
1109 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1110 		if (err != 0) {
1111 			device_printf(sc->dev, "Failed "
1112 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1113 			       "%d\t", err);
1114 			/* abort, leaving multicast filtering off */
1115 			IF_ADDR_UNLOCK(ifp);
1116 			return;
1117 		}
1118 	}
1119 	IF_ADDR_UNLOCK(ifp);
1120 	/* Enable multicast filtering */
1121 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1122 	if (err != 0) {
1123 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1124 		       ", error status: %d\n", err);
1125 	}
1126 }
1127 
1128 static int
1129 mxge_max_mtu(mxge_softc_t *sc)
1130 {
1131 	mxge_cmd_t cmd;
1132 	int status;
1133 
1134 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1135 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1136 
1137 	/* try to set nbufs to see if we can
1138 	   use virtually contiguous jumbos */
1139 	cmd.data0 = 0;
1140 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1141 			       &cmd);
1142 	if (status == 0)
1143 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1144 
1145 	/* otherwise, we're limited to MJUMPAGESIZE */
1146 	return MJUMPAGESIZE - MXGEFW_PAD;
1147 }
1148 
1149 static int
1150 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1151 {
1152 
1153 	mxge_cmd_t cmd;
1154 	size_t bytes;
1155 	int status;
1156 
1157 	/* try to send a reset command to the card to see if it
1158 	   is alive */
1159 	memset(&cmd, 0, sizeof (cmd));
1160 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1161 	if (status != 0) {
1162 		device_printf(sc->dev, "failed reset\n");
1163 		return ENXIO;
1164 	}
1165 
1166 	mxge_dummy_rdma(sc, 1);
1167 
1168 	if (interrupts_setup) {
1169 		/* Now exchange information about interrupts  */
1170 		bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
1171 		memset(sc->rx_done.entry, 0, bytes);
1172 		cmd.data0 = (uint32_t)bytes;
1173 		status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1174 		cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
1175 		cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
1176 		status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
1177 	}
1178 
1179 	status |= mxge_send_cmd(sc,
1180 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1181 
1182 
1183 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1184 
1185 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1186 	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1187 
1188 
1189 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1190 				&cmd);
1191 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1192 	if (status != 0) {
1193 		device_printf(sc->dev, "failed set interrupt parameters\n");
1194 		return status;
1195 	}
1196 
1197 
1198 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1199 
1200 
1201 	/* run a DMA benchmark */
1202 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1203 
1204 	/* reset mcp/driver shared state back to 0 */
1205 	sc->rx_done.idx = 0;
1206 	sc->rx_done.cnt = 0;
1207 	sc->tx.req = 0;
1208 	sc->tx.done = 0;
1209 	sc->tx.pkt_done = 0;
1210 	sc->tx.wake = 0;
1211 	sc->tx_defrag = 0;
1212 	sc->tx.stall = 0;
1213 	sc->rx_big.cnt = 0;
1214 	sc->rx_small.cnt = 0;
1215 	sc->rdma_tags_available = 15;
1216 	sc->fw_stats->valid = 0;
1217 	sc->fw_stats->send_done_count = 0;
1218 	sc->lro_bad_csum = 0;
1219 	sc->lro_queued = 0;
1220 	sc->lro_flushed = 0;
1221 	status = mxge_update_mac_address(sc);
1222 	mxge_change_promisc(sc, 0);
1223 	mxge_change_pause(sc, sc->pause);
1224 	mxge_set_multicast_list(sc);
1225 	return status;
1226 }
1227 
1228 static int
1229 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1230 {
1231         mxge_softc_t *sc;
1232         unsigned int intr_coal_delay;
1233         int err;
1234 
1235         sc = arg1;
1236         intr_coal_delay = sc->intr_coal_delay;
1237         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1238         if (err != 0) {
1239                 return err;
1240         }
1241         if (intr_coal_delay == sc->intr_coal_delay)
1242                 return 0;
1243 
1244         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1245                 return EINVAL;
1246 
1247 	mtx_lock(&sc->driver_mtx);
1248 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1249 	sc->intr_coal_delay = intr_coal_delay;
1250 
1251 	mtx_unlock(&sc->driver_mtx);
1252         return err;
1253 }
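/*
 * Registered below (mxge_add_sysctls) as dev.mxge.<unit>.intr_coal_delay,
 * so e.g. "sysctl dev.mxge.0.intr_coal_delay=75" trades interrupt rate
 * against latency; values outside 1..1000000 usecs are rejected.
 */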
1254 
1255 static int
1256 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1257 {
1258         mxge_softc_t *sc;
1259         unsigned int enabled;
1260         int err;
1261 
1262         sc = arg1;
1263         enabled = sc->pause;
1264         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1265         if (err != 0) {
1266                 return err;
1267         }
1268         if (enabled == sc->pause)
1269                 return 0;
1270 
1271 	mtx_lock(&sc->driver_mtx);
1272 	err = mxge_change_pause(sc, enabled);
1273 	mtx_unlock(&sc->driver_mtx);
1274         return err;
1275 }
1276 
1277 static int
1278 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1279 {
1280 	struct ifnet *ifp;
1281 	int err;
1282 
1283 	ifp = sc->ifp;
1284 	if (lro_cnt == 0)
1285 		ifp->if_capenable &= ~IFCAP_LRO;
1286 	else
1287 		ifp->if_capenable |= IFCAP_LRO;
1288 	sc->lro_cnt = lro_cnt;
1289 	callout_stop(&sc->co_hdl);
1290 	mxge_close(sc);
1291 	err = mxge_open(sc);
1292 	if (err == 0)
1293 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
1294 	return err;
1295 }
1296 
1297 static int
1298 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1299 {
1300 	mxge_softc_t *sc;
1301 	unsigned int lro_cnt;
1302 	int err;
1303 
1304 	sc = arg1;
1305 	lro_cnt = sc->lro_cnt;
1306 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1307 	if (err != 0)
1308 		return err;
1309 
1310 	if (lro_cnt == sc->lro_cnt)
1311 		return 0;
1312 
1313 	if (lro_cnt > 128)
1314 		return EINVAL;
1315 
1316 	mtx_lock(&sc->driver_mtx);
1317 	err = mxge_change_lro_locked(sc, lro_cnt);
1318 	mtx_unlock(&sc->driver_mtx);
1319 	return err;
1320 }
1321 
1322 static int
1323 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1324 {
1325         int err;
1326 
1327         if (arg1 == NULL)
1328                 return EFAULT;
1329         arg2 = be32toh(*(int *)arg1);
1330         arg1 = NULL;
1331         err = sysctl_handle_int(oidp, arg1, arg2, req);
1332 
1333         return err;
1334 }
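/*
 * The trick above relies on sysctl_handle_int() exporting arg2 when
 * arg1 is NULL: the stats word is byteswapped into arg2 and handed
 * over read-only, so the firmware's live stats block is never
 * modified by a sysctl read.
 */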
1335 
1336 static void
1337 mxge_add_sysctls(mxge_softc_t *sc)
1338 {
1339 	struct sysctl_ctx_list *ctx;
1340 	struct sysctl_oid_list *children;
1341 	mcp_irq_data_t *fw;
1342 
1343 	ctx = device_get_sysctl_ctx(sc->dev);
1344 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1345 	fw = sc->fw_stats;
1346 
1347 	/* random information */
1348 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1349 		       "firmware_version",
1350 		       CTLFLAG_RD, &sc->fw_version,
1351 		       0, "firmware version");
1352 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1353 		       "serial_number",
1354 		       CTLFLAG_RD, &sc->serial_number_string,
1355 		       0, "serial number");
1356 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1357 		       "product_code",
1358 		       CTLFLAG_RD, &sc->product_code_string,
1359 		       0, "product_code");
1360 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1361 		       "pcie_link_width",
1362 		       CTLFLAG_RD, &sc->link_width,
1363 		       0, "tx_boundary");
1364 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1365 		       "tx_boundary",
1366 		       CTLFLAG_RD, &sc->tx.boundary,
1367 		       0, "tx_boundary");
1368 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1369 		       "write_combine",
1370 		       CTLFLAG_RD, &sc->wc,
1371 		       0, "write combining PIO?");
1372 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1373 		       "read_dma_MBs",
1374 		       CTLFLAG_RD, &sc->read_dma,
1375 		       0, "DMA Read speed in MB/s");
1376 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1377 		       "write_dma_MBs",
1378 		       CTLFLAG_RD, &sc->write_dma,
1379 		       0, "DMA Write speed in MB/s");
1380 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1381 		       "read_write_dma_MBs",
1382 		       CTLFLAG_RD, &sc->read_write_dma,
1383 		       0, "DMA concurrent Read/Write speed in MB/s");
1384 
1385 
1386 	/* performance related tunables */
1387 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1388 			"intr_coal_delay",
1389 			CTLTYPE_INT|CTLFLAG_RW, sc,
1390 			0, mxge_change_intr_coal,
1391 			"I", "interrupt coalescing delay in usecs");
1392 
1393 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1394 			"flow_control_enabled",
1395 			CTLTYPE_INT|CTLFLAG_RW, sc,
1396 			0, mxge_change_flow_control,
1397 			"I", "interrupt coalescing delay in usecs");
1398 
1399 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1400 		       "deassert_wait",
1401 		       CTLFLAG_RW, &mxge_deassert_wait,
1402 		       0, "Wait for IRQ line to go low in ihandler");
1403 
1404 	/* stats block from firmware is in network byte order.
1405 	   Need to swap it */
1406 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1407 			"link_up",
1408 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1409 			0, mxge_handle_be32,
1410 			"I", "link up");
1411 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1412 			"rdma_tags_available",
1413 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1414 			0, mxge_handle_be32,
1415 			"I", "rdma_tags_available");
1416 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1417 			"dropped_bad_crc32",
1418 			CTLTYPE_INT|CTLFLAG_RD,
1419 			&fw->dropped_bad_crc32,
1420 			0, mxge_handle_be32,
1421 			"I", "dropped_bad_crc32");
1422 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1423 			"dropped_bad_phy",
1424 			CTLTYPE_INT|CTLFLAG_RD,
1425 			&fw->dropped_bad_phy,
1426 			0, mxge_handle_be32,
1427 			"I", "dropped_bad_phy");
1428 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1429 			"dropped_link_error_or_filtered",
1430 			CTLTYPE_INT|CTLFLAG_RD,
1431 			&fw->dropped_link_error_or_filtered,
1432 			0, mxge_handle_be32,
1433 			"I", "dropped_link_error_or_filtered");
1434 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1435 			"dropped_link_overflow",
1436 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1437 			0, mxge_handle_be32,
1438 			"I", "dropped_link_overflow");
1439 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1440 			"dropped_multicast_filtered",
1441 			CTLTYPE_INT|CTLFLAG_RD,
1442 			&fw->dropped_multicast_filtered,
1443 			0, mxge_handle_be32,
1444 			"I", "dropped_multicast_filtered");
1445 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1446 			"dropped_no_big_buffer",
1447 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1448 			0, mxge_handle_be32,
1449 			"I", "dropped_no_big_buffer");
1450 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1451 			"dropped_no_small_buffer",
1452 			CTLTYPE_INT|CTLFLAG_RD,
1453 			&fw->dropped_no_small_buffer,
1454 			0, mxge_handle_be32,
1455 			"I", "dropped_no_small_buffer");
1456 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1457 			"dropped_overrun",
1458 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1459 			0, mxge_handle_be32,
1460 			"I", "dropped_overrun");
1461 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1462 			"dropped_pause",
1463 			CTLTYPE_INT|CTLFLAG_RD,
1464 			&fw->dropped_pause,
1465 			0, mxge_handle_be32,
1466 			"I", "dropped_pause");
1467 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1468 			"dropped_runt",
1469 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1470 			0, mxge_handle_be32,
1471 			"I", "dropped_runt");
1472 
1473 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1474 			"dropped_unicast_filtered",
1475 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1476 			0, mxge_handle_be32,
1477 			"I", "dropped_unicast_filtered");
1478 
1479 	/* host counters exported for debugging */
1480 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1481 		       "rx_small_cnt",
1482 		       CTLFLAG_RD, &sc->rx_small.cnt,
1483 		       0, "rx_small_cnt");
1484 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1485 		       "rx_big_cnt",
1486 		       CTLFLAG_RD, &sc->rx_big.cnt,
1487 		       0, "rx_small_cnt");
1488 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1489 		       "tx_req",
1490 		       CTLFLAG_RD, &sc->tx.req,
1491 		       0, "tx_req");
1492 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1493 		       "tx_done",
1494 		       CTLFLAG_RD, &sc->tx.done,
1495 		       0, "tx_done");
1496 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1497 		       "tx_pkt_done",
1498 		       CTLFLAG_RD, &sc->tx.pkt_done,
1499 		       0, "tx_done");
1500 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1501 		       "tx_stall",
1502 		       CTLFLAG_RD, &sc->tx.stall,
1503 		       0, "tx_stall");
1504 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1505 		       "tx_wake",
1506 		       CTLFLAG_RD, &sc->tx.wake,
1507 		       0, "tx_wake");
1508 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1509 		       "tx_defrag",
1510 		       CTLFLAG_RD, &sc->tx_defrag,
1511 		       0, "tx_defrag");
1512 
1513 	/* verbose printing? */
1514 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1515 		       "verbose",
1516 		       CTLFLAG_RW, &mxge_verbose,
1517 		       0, "verbose printing");
1518 
1519 	/* lro */
1520 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1521 			"lro_cnt",
1522 			CTLTYPE_INT|CTLFLAG_RW, sc,
1523 			0, mxge_change_lro,
1524 			"I", "number of lro merge queues");
1525 
1526 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1527 		       "lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
1528 		       0, "number of lro merge queues flushed");
1529 
1530 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 		       "lro_queued", CTLFLAG_RD, &sc->lro_queued,
1532 		       0, "number of frames appended to lro merge queues");
1533 
1534 }
1535 
1536 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1537    backwards one at a time and handle ring wraps */
1538 
1539 static inline void
1540 mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1541 			    mcp_kreq_ether_send_t *src, int cnt)
1542 {
1543         int idx, starting_slot;
1544         starting_slot = tx->req;
1545         while (cnt > 1) {
1546                 cnt--;
1547                 idx = (starting_slot + cnt) & tx->mask;
1548                 mxge_pio_copy(&tx->lanai[idx],
1549 			      &src[cnt], sizeof(*src));
1550                 mb();
1551         }
1552 }
1553 
1554 /*
1555  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1556  * at most 32 bytes at a time, so as to avoid involving the software
1557  * pio handler in the nic.   We re-write the first segment's flags
1558  * to mark them valid only after writing the entire chain
1559  */
1560 
1561 static inline void
1562 mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
1563                   int cnt)
1564 {
1565         int idx, i;
1566         uint32_t *src_ints;
1567 	volatile uint32_t *dst_ints;
1568         mcp_kreq_ether_send_t *srcp;
1569 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1570 	uint8_t last_flags;
1571 
1572         idx = tx->req & tx->mask;
1573 
1574 	last_flags = src->flags;
1575 	src->flags = 0;
1576         mb();
1577         dst = dstp = &tx->lanai[idx];
1578         srcp = src;
1579 
1580         if ((idx + cnt) < tx->mask) {
1581                 for (i = 0; i < (cnt - 1); i += 2) {
1582                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1583                         mb(); /* force write every 32 bytes */
1584                         srcp += 2;
1585                         dstp += 2;
1586                 }
1587         } else {
1588                 /* submit all but the first request, and ensure
1589                    that it is submitted below */
1590                 mxge_submit_req_backwards(tx, src, cnt);
1591                 i = 0;
1592         }
1593         if (i < cnt) {
1594                 /* submit the first request */
1595                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1596                 mb(); /* barrier before setting valid flag */
1597         }
1598 
1599         /* re-write the last 32-bits with the valid flags */
1600         src->flags = last_flags;
1601         src_ints = (uint32_t *)src;
1602         src_ints+=3;
1603         dst_ints = (volatile uint32_t *)dst;
1604         dst_ints+=3;
1605         *dst_ints =  *src_ints;
1606         tx->req += cnt;
1607         mb();
1608 }
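/*
 * Wrap example (invented numbers): with tx->mask == 255, tx->req == 254
 * and cnt == 4, the requests land in slots 254, 255, 0 and 1.  The
 * forward 32-byte copies above would cross the ring boundary, so
 * mxge_submit_req_backwards() writes slots 1, 0 and 255 individually,
 * and slot 254 is written last, here, with its valid flags restored
 * only after the rest of the chain is visible to the NIC.
 */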
1609 
1610 static void
1611 mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt,
1612 	       int ip_off)
1613 {
1614 	mxge_tx_buf_t *tx;
1615 	mcp_kreq_ether_send_t *req;
1616 	bus_dma_segment_t *seg;
1617 	struct ip *ip;
1618 	struct tcphdr *tcp;
1619 	uint32_t low, high_swapped;
1620 	int len, seglen, cum_len, cum_len_next;
1621 	int next_is_first, chop, cnt, rdma_count, small;
1622 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1623 	uint8_t flags, flags_next;
1624 	static int once;
1625 
1626 	mss = m->m_pkthdr.tso_segsz;
1627 
1628 	/* negative cum_len signifies to the
1629 	 * send loop that we are still in the
1630 	 * header portion of the TSO packet.
1631 	 */
1632 
1633 	/* ensure we have the ethernet, IP and TCP
1634 	   header together in the first mbuf, copy
1635 	   it to a scratch buffer if not */
1636 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1637 		m_copydata(m, 0, ip_off + sizeof (*ip),
1638 			   sc->scratch);
1639 		ip = (struct ip *)(sc->scratch + ip_off);
1640 	} else {
1641 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1642 	}
1643 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1644 			    + sizeof (*tcp))) {
1645 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1646 			   + sizeof (*tcp),  sc->scratch);
1647 		ip = (struct ip *)(sc->scratch + ip_off);
1648 	}
1649 
1650 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1651 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1652 
1653 	/* TSO implies checksum offload on this hardware */
1654 	cksum_offset = ip_off + (ip->ip_hl << 2);
1655 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1656 
1657 
1658 	/* for TSO, pseudo_hdr_offset holds mss.
1659 	 * The firmware figures out where to put
1660 	 * the checksum by parsing the header. */
1661 	pseudo_hdr_offset = htobe16(mss);
1662 
1663 	tx = &sc->tx;
1664 	req = tx->req_list;
1665 	seg = tx->seg_list;
1666 	cnt = 0;
1667 	rdma_count = 0;
1668 	/* "rdma_count" is the number of RDMAs belonging to the
1669 	 * current packet BEFORE the current send request. For
1670 	 * non-TSO packets, this is equal to "count".
1671 	 * For TSO packets, rdma_count needs to be reset
1672 	 * to 0 after a segment cut.
1673 	 *
1674 	 * The rdma_count field of the send request is
1675 	 * the number of RDMAs of the packet starting at
1676 	 * that request. For TSO send requests with one or more cuts
1677 	 * in the middle, this is the number of RDMAs starting
1678 	 * after the last cut in the request. All previous
1679 	 * segments before the last cut implicitly have 1 RDMA.
1680 	 *
1681 	 * Since the number of RDMAs is not known beforehand,
1682 	 * it must be filled-in retroactively - after each
1683 	 * segmentation cut or at the end of the entire packet.
1684 	 */
1685 
1686 	while (busdma_seg_cnt) {
1687 		/* Break the busdma segment up into pieces*/
1688 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1689 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1690 		len = seg->ds_len;
1691 
1692 		while (len) {
1693 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1694 			seglen = len;
1695 			cum_len_next = cum_len + seglen;
1696 			(req-rdma_count)->rdma_count = rdma_count + 1;
1697 			if (__predict_true(cum_len >= 0)) {
1698 				/* payload */
1699 				chop = (cum_len_next > mss);
1700 				cum_len_next = cum_len_next % mss;
1701 				next_is_first = (cum_len_next == 0);
1702 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1703 				flags_next |= next_is_first *
1704 					MXGEFW_FLAGS_FIRST;
1705 				rdma_count |= -(chop | next_is_first);
1706 				rdma_count += chop & !next_is_first;
1707 			} else if (cum_len_next >= 0) {
1708 				/* header ends */
1709 				rdma_count = -1;
1710 				cum_len_next = 0;
1711 				seglen = -cum_len;
1712 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1713 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1714 					MXGEFW_FLAGS_FIRST |
1715 					(small * MXGEFW_FLAGS_SMALL);
1716 			    }
1717 
1718 			req->addr_high = high_swapped;
1719 			req->addr_low = htobe32(low);
1720 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1721 			req->pad = 0;
1722 			req->rdma_count = 1;
1723 			req->length = htobe16(seglen);
1724 			req->cksum_offset = cksum_offset;
1725 			req->flags = flags | ((cum_len & 1) *
1726 					      MXGEFW_FLAGS_ALIGN_ODD);
1727 			low += seglen;
1728 			len -= seglen;
1729 			cum_len = cum_len_next;
1730 			flags = flags_next;
1731 			req++;
1732 			cnt++;
1733 			rdma_count++;
1734 			if (__predict_false(cksum_offset > seglen))
1735 				cksum_offset -= seglen;
1736 			else
1737 				cksum_offset = 0;
1738 			if (__predict_false(cnt > tx->max_desc))
1739 				goto drop;
1740 		}
1741 		busdma_seg_cnt--;
1742 		seg++;
1743 	}
1744 	(req-rdma_count)->rdma_count = rdma_count;
1745 
1746 	do {
1747 		req--;
1748 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1749 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1750 
1751 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1752 	mxge_submit_req(tx, tx->req_list, cnt);
1753 	return;
1754 
1755 drop:
1756 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1757 	m_freem(m);
1758 	sc->ifp->if_oerrors++;
1759 	if (!once) {
1760 		printf("tx->max_desc exceeded via TSO!\n");
1761 		printf("mss = %d, seg offset = %ld, max_desc = %d!\n", mss,
1762 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1763 		once = 1;
1764 	}
1765 	return;
1766 
1767 }
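/*
 * Worked example of the cum_len bookkeeping (numbers invented): with
 * 54 bytes of ethernet+IP+TCP headers and mss == 1448, cum_len starts
 * at -54.  Descriptors stay in the header regime until cum_len_next
 * reaches 0, where rdma_count restarts at -1 and the next flags become
 * MXGEFW_FLAGS_TSO_PLD|MXGEFW_FLAGS_FIRST.  In the payload, a segment
 * pushing cum_len_next past mss is flagged MXGEFW_FLAGS_TSO_CHOP and
 * cum_len_next wraps modulo mss; a wrap to exactly 0 makes the next
 * request FIRST again, starting the next TCP segment.
 */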
1768 
1769 /*
1770  * We reproduce the software vlan tag insertion from
1771  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1772  * vlan tag insertion. We need to advertise this in order to have the
1773  * vlan interface respect our csum offload flags.
1774  */
1775 static struct mbuf *
1776 mxge_vlan_tag_insert(struct mbuf *m)
1777 {
1778 	struct ether_vlan_header *evl;
1779 
1780 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1781 	if (__predict_false(m == NULL))
1782 		return NULL;
1783 	if (m->m_len < sizeof(*evl)) {
1784 		m = m_pullup(m, sizeof(*evl));
1785 		if (__predict_false(m == NULL))
1786 			return NULL;
1787 	}
1788 	/*
1789 	 * Transform the Ethernet header into an Ethernet header
1790 	 * with 802.1Q encapsulation.
1791 	 */
1792 	evl = mtod(m, struct ether_vlan_header *);
1793 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1794 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1795 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1796 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1797 	m->m_flags &= ~M_VLANTAG;
1798 	return m;
1799 }
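/*
 * Before/after sketch of the transform above:
 *
 *	before:	[ dst(6) | src(6) | type(2) | payload ]
 *	after:	[ dst(6) | src(6) | 0x8100(2) | tag(2) | type(2) | payload ]
 *
 * M_PREPEND opens ETHER_VLAN_ENCAP_LEN (4) bytes at the front, the
 * bcopy slides the two addresses into that space, and the original
 * type is left in place as evl_proto behind the new 802.1Q fields.
 */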
1800 
1801 static void
1802 mxge_encap(mxge_softc_t *sc, struct mbuf *m)
1803 {
1804 	mcp_kreq_ether_send_t *req;
1805 	bus_dma_segment_t *seg;
1806 	struct mbuf *m_tmp;
1807 	struct ifnet *ifp;
1808 	mxge_tx_buf_t *tx;
1809 	struct ip *ip;
1810 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1811 	uint16_t pseudo_hdr_offset;
1812         uint8_t flags, cksum_offset;
1813 
1814 
1815 
1816 	ifp = sc->ifp;
1817 	tx = &sc->tx;
1818 
1819 	ip_off = sizeof (struct ether_header);
1820 	if (m->m_flags & M_VLANTAG) {
1821 		m = mxge_vlan_tag_insert(m);
1822 		if (__predict_false(m == NULL))
1823 			goto drop;
1824 		ip_off += ETHER_VLAN_ENCAP_LEN;
1825 	}
1826 
1827 	/* (try to) map the frame for DMA */
1828 	idx = tx->req & tx->mask;
1829 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1830 				      m, tx->seg_list, &cnt,
1831 				      BUS_DMA_NOWAIT);
1832 	if (__predict_false(err == EFBIG)) {
1833 		/* Too many segments in the chain.  Try
1834 		   to defrag */
1835 		m_tmp = m_defrag(m, M_NOWAIT);
1836 		if (m_tmp == NULL) {
1837 			goto drop;
1838 		}
1839 		sc->tx_defrag++;
1840 		m = m_tmp;
1841 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1842 					      tx->info[idx].map,
1843 					      m, tx->seg_list, &cnt,
1844 					      BUS_DMA_NOWAIT);
1845 	}
1846 	if (__predict_false(err != 0)) {
1847 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1848 			      " packet len = %d\n", err, m->m_pkthdr.len);
1849 		goto drop;
1850 	}
1851 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1852 			BUS_DMASYNC_PREWRITE);
1853 	tx->info[idx].m = m;
1854 
1855 
1856 	/* TSO is different enough, we handle it in another routine */
1857 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1858 		mxge_encap_tso(sc, m, cnt, ip_off);
1859 		return;
1860 	}
1861 
1862 	req = tx->req_list;
1863 	cksum_offset = 0;
1864 	pseudo_hdr_offset = 0;
1865 	flags = MXGEFW_FLAGS_NO_TSO;
1866 
1867 	/* checksum offloading? */
1868 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1869 		/* ensure ip header is in first mbuf, copy
1870 		   it to a scratch buffer if not */
1871 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1872 			m_copydata(m, 0, ip_off + sizeof (*ip),
1873 				   sc->scratch);
1874 			ip = (struct ip *)(sc->scratch + ip_off);
1875 		} else {
1876 			ip = (struct ip *)(mtod(m, char *) + ip_off);
1877 		}
1878 		cksum_offset = ip_off + (ip->ip_hl << 2);
1879 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1880 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1881 		req->cksum_offset = cksum_offset;
1882 		flags |= MXGEFW_FLAGS_CKSUM;
1883 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1884 	} else {
1885 		odd_flag = 0;
1886 	}
1887 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1888 		flags |= MXGEFW_FLAGS_SMALL;
1889 
1890 	/* convert segments into a request list */
1891 	cum_len = 0;
1892 	seg = tx->seg_list;
1893 	req->flags = MXGEFW_FLAGS_FIRST;
1894 	for (i = 0; i < cnt; i++) {
1895 		req->addr_low =
1896 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1897 		req->addr_high =
1898 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1899 		req->length = htobe16(seg->ds_len);
1900 		req->cksum_offset = cksum_offset;
1901 		if (cksum_offset > seg->ds_len)
1902 			cksum_offset -= seg->ds_len;
1903 		else
1904 			cksum_offset = 0;
1905 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1906 		req->pad = 0; /* complete solid 16-byte block */
1907 		req->rdma_count = 1;
1908 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1909 		cum_len += seg->ds_len;
1910 		seg++;
1911 		req++;
1912 		req->flags = 0;
1913 	}
1914 	req--;
1915 	/* pad runts to 60 bytes */
1916 	if (cum_len < 60) {
1917 		req++;
1918 		req->addr_low =
1919 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
1920 		req->addr_high =
1921 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
1922 		req->length = htobe16(60 - cum_len);
1923 		req->cksum_offset = 0;
1924 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1925 		req->pad = 0; /* complete solid 16-byte block */
1926 		req->rdma_count = 1;
1927 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1928 		cnt++;
1929 	}
1930 
1931 	tx->req_list[0].rdma_count = cnt;
1932 #if 0
1933 	/* print what the firmware will see */
1934 	for (i = 0; i < cnt; i++) {
1935 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1936 		    "cso:%d, flags:0x%x, rdma:%d\n",
1937 		    i, (int)ntohl(tx->req_list[i].addr_high),
1938 		    (int)ntohl(tx->req_list[i].addr_low),
1939 		    (int)ntohs(tx->req_list[i].length),
1940 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1941 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1942 		    tx->req_list[i].rdma_count);
1943 	}
1944 	printf("--------------\n");
1945 #endif
1946 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1947 	mxge_submit_req(tx, tx->req_list, cnt);
1948 	return;
1949 
1950 drop:
1951 	m_freem(m);
1952 	ifp->if_oerrors++;
1953 	return;
1954 }
1955 
1956 
1957 
1958 
1959 static inline void
1960 mxge_start_locked(mxge_softc_t *sc)
1961 {
1962 	struct mbuf *m;
1963 	struct ifnet *ifp;
1964 	mxge_tx_buf_t *tx;
1965 
1966 	ifp = sc->ifp;
1967 	tx = &sc->tx;
1968 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1969 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1970 		if (m == NULL) {
1971 			return;
1972 		}
1973 		/* let BPF see it */
1974 		BPF_MTAP(ifp, m);
1975 
1976 		/* give it to the nic */
1977 		mxge_encap(sc, m);
1978 	}
1979 	/* ran out of transmit slots */
1980 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1981 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1982 		tx->stall++;
1983 	}
1984 }
1985 
1986 static void
1987 mxge_start(struct ifnet *ifp)
1988 {
1989 	mxge_softc_t *sc = ifp->if_softc;
1990 
1991 
1992 	mtx_lock(&sc->tx_mtx);
1993 	mxge_start_locked(sc);
1994 	mtx_unlock(&sc->tx_mtx);
1995 }
1996 
1997 /*
1998  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1999  * at most 32 bytes at a time, so as to avoid involving the software
2000  * pio handler in the nic.  We rewrite the first segment's low
2001  * DMA address to mark it valid only after we write the entire chunk
2002  * in a burst.
2003  */
2004 static inline void
2005 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2006 		mcp_kreq_ether_recv_t *src)
2007 {
2008 	uint32_t low;
2009 
2010 	low = src->addr_low;
2011 	src->addr_low = 0xffffffff;
2012 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2013 	mb();
2014 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2015 	mb();
2016 	src->addr_low = low;
2017 	dst->addr_low = low;
2018 	mb();
2019 }
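/*
 * Update-ordering sketch for the copy above (editorial): the first
 * descriptor's addr_low is poisoned to 0xffffffff before the PIO
 * bursts, so the NIC ignores the group until the final write:
 *
 *   1. save the real addr_low, store 0xffffffff in its place
 *   2. copy descriptors 0-3 (one 32-byte burst), barrier
 *   3. copy descriptors 4-7 (second burst), barrier
 *   4. restore the real addr_low and write it to the NIC copy,
 *      making all 8 descriptors valid at once
 *
 * This assumes sizeof(mcp_kreq_ether_recv_t) is 8 bytes, so four
 * descriptors fit one 32-byte write-combined burst.
 */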
2020 
2021 static int
2022 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
2023 {
2024 	bus_dma_segment_t seg;
2025 	struct mbuf *m;
2026 	mxge_rx_buf_t *rx = &sc->rx_small;
2027 	int cnt, err;
2028 
2029 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2030 	if (m == NULL) {
2031 		rx->alloc_fail++;
2032 		err = ENOBUFS;
2033 		goto done;
2034 	}
2035 	m->m_len = MHLEN;
2036 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2037 				      &seg, &cnt, BUS_DMA_NOWAIT);
2038 	if (err != 0) {
2039 		m_free(m);
2040 		goto done;
2041 	}
2042 	rx->info[idx].m = m;
2043 	rx->shadow[idx].addr_low =
2044 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2045 	rx->shadow[idx].addr_high =
2046 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2047 
2048 done:
2049 	if ((idx & 7) == 7)
2050 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2051 	return err;
2052 }
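/*
 * Note (editorial): receive buffers are handed to the NIC in groups
 * of 8; the (idx & 7) == 7 test above fires on the last slot of each
 * group, at which point slots idx-7 .. idx are pushed in a single
 * mxge_submit_8rx() burst.
 */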
2053 
2054 static int
2055 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
2056 {
2057 	bus_dma_segment_t seg[3];
2058 	struct mbuf *m;
2059 	mxge_rx_buf_t *rx = &sc->rx_big;
2060 	int cnt, err, i;
2061 
2062 	if (rx->cl_size == MCLBYTES)
2063 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2064 	else
2065 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2066 	if (m == NULL) {
2067 		rx->alloc_fail++;
2068 		err = ENOBUFS;
2069 		goto done;
2070 	}
2071 	m->m_len = rx->cl_size;
2072 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2073 				      seg, &cnt, BUS_DMA_NOWAIT);
2074 	if (err != 0) {
2075 		m_free(m);
2076 		goto done;
2077 	}
2078 	rx->info[idx].m = m;
2079 
2080 	for (i = 0; i < cnt; i++) {
2081 		rx->shadow[idx + i].addr_low =
2082 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2083 		rx->shadow[idx + i].addr_high =
2084 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2085 	}
2086 
2087 
2088 done:
2089 	for (i = 0; i < rx->nbufs; i++) {
2090 		if ((idx & 7) == 7) {
2091 			mxge_submit_8rx(&rx->lanai[idx - 7],
2092 					&rx->shadow[idx - 7]);
2093 		}
2094 		idx++;
2095 	}
2096 	return err;
2097 }
2098 
2099 /*
2100  *  Myri10GE hardware checksums are not valid if the sender
2101  *  padded the frame with non-zero padding.  This is because
2102  *  the firmware just does a simple 16-bit 1s complement
2103  *  checksum across the entire frame, excluding the first 14
2104  *  bytes.  It is best to simply check the checksum and
2105  *  tell the stack about it only if the checksum is good
2106  */
2107 
2108 static inline uint16_t
2109 mxge_rx_csum(struct mbuf *m, int csum)
2110 {
2111 	struct ether_header *eh;
2112 	struct ip *ip;
2113 	uint16_t c;
2114 
2115 	eh = mtod(m, struct ether_header *);
2116 
2117 	/* only deal with IPv4 TCP & UDP for now */
2118 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2119 		return 1;
2120 	ip = (struct ip *)(eh + 1);
2121 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2122 			    ip->ip_p != IPPROTO_UDP))
2123 		return 1;
2124 
2125 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2126 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2127 			    (ip->ip_hl << 2) + ip->ip_p));
2128 	c ^= 0xffff;
2129 	return (c);
2130 }
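/*
 * Worked reading of the check above (editorial, illustrative): the
 * firmware csum is a raw ones-complement sum of everything past the
 * 14-byte Ethernet header.  A valid IP header sums to 0xffff (== 0 in
 * ones-complement arithmetic), so it drops out, and adding the
 * pseudo-header (src, dst, protocol, and the L4 length
 * ip_len - (ip_hl << 2)) must give 0xffff when the TCP/UDP checksum
 * is correct; the XOR then yields 0, which callers treat as
 * "checksum good".
 */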
2131 
2132 static void
2133 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2134 {
2135 	struct ether_vlan_header *evl;
2136 	struct ether_header *eh;
2137 	uint32_t partial;
2138 
2139 	evl = mtod(m, struct ether_vlan_header *);
2140 	eh = mtod(m, struct ether_header *);
2141 
2142 	/*
2143 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2144 	 * after what the firmware thought was the end of the ethernet
2145 	 * header.
2146 	 */
2147 
2148 	/* put checksum into host byte order */
2149 	*csum = ntohs(*csum);
2150 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2151 	(*csum) += ~partial;
2152 	(*csum) +=  ((*csum) < ~partial);
2153 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2154 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2155 
2156 	/* restore checksum to network byte order;
2157 	   later consumers expect this */
2158 	*csum = htons(*csum);
2159 
2160 	/* save the tag */
2161 	m->m_flags |= M_VLANTAG;
2162 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2163 
2164 	/*
2165 	 * Remove the 802.1q header by copying the Ethernet
2166 	 * addresses over it and adjusting the beginning of
2167 	 * the data in the mbuf.  The encapsulated Ethernet
2168 	 * type field is already in place.
2169 	 */
2170 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2171 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2172 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2173 }
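#if 0
/*
 * Editorial sketch (not compiled; example_csum_subtract is a
 * hypothetical name).  It mirrors the adjustment above: subtracting a
 * 32-bit chunk from a ones-complement sum by adding its complement,
 * then folding the carries back into 16 bits.
 */
static uint32_t
example_csum_subtract(uint32_t csum, uint32_t partial)
{
	csum += ~partial;			/* add the complement */
	csum += (csum < ~partial);		/* end-around carry */
	csum = (csum >> 16) + (csum & 0xFFFF);	/* fold high half */
	csum = (csum >> 16) + (csum & 0xFFFF);	/* fold final carry */
	return (csum);
}
#endif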
2174 
2175 
2176 static inline void
2177 mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2178 {
2179 	struct ifnet *ifp;
2180 	struct mbuf *m;
2181 	struct ether_header *eh;
2182 	mxge_rx_buf_t *rx;
2183 	bus_dmamap_t old_map;
2184 	int idx;
2185 	uint16_t tcpudp_csum;
2186 
2187 	ifp = sc->ifp;
2188 	rx = &sc->rx_big;
2189 	idx = rx->cnt & rx->mask;
2190 	rx->cnt += rx->nbufs;
2191 	/* save a pointer to the received mbuf */
2192 	m = rx->info[idx].m;
2193 	/* try to replace the received mbuf */
2194 	if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
2195 		/* drop the frame -- the old mbuf is re-cycled */
2196 		ifp->if_ierrors++;
2197 		return;
2198 	}
2199 
2200 	/* unmap the received buffer */
2201 	old_map = rx->info[idx].map;
2202 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2203 	bus_dmamap_unload(rx->dmat, old_map);
2204 
2205 	/* swap the bus_dmamap_t's */
2206 	rx->info[idx].map = rx->extra_map;
2207 	rx->extra_map = old_map;
2208 
2209 	/* the mcp implicitly skips the first 2 bytes so that the
2210 	 * packet is properly aligned */
2211 	m->m_data += MXGEFW_PAD;
2212 
2213 	m->m_pkthdr.rcvif = ifp;
2214 	m->m_len = m->m_pkthdr.len = len;
2215 	ifp->if_ipackets++;
2216 	eh = mtod(m, struct ether_header *);
2217 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2218 		mxge_vlan_tag_remove(m, &csum);
2219 	}
2220 	/* if the checksum is valid, mark it in the mbuf header */
2221 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2222 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2223 			return;
2224 		/* otherwise, it was a UDP frame, or a TCP frame which
2225 		   we could not do LRO on.  Tell the stack that the
2226 		   checksum is good */
2227 		m->m_pkthdr.csum_data = 0xffff;
2228 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2229 	}
2230 	/* pass the frame up the stack */
2231 	(*ifp->if_input)(ifp, m);
2232 }
2233 
2234 static inline void
2235 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2236 {
2237 	struct ifnet *ifp;
2238 	struct ether_header *eh;
2239 	struct mbuf *m;
2240 	mxge_rx_buf_t *rx;
2241 	bus_dmamap_t old_map;
2242 	int idx;
2243 	uint16_t tcpudp_csum;
2244 
2245 	ifp = sc->ifp;
2246 	rx = &sc->rx_small;
2247 	idx = rx->cnt & rx->mask;
2248 	rx->cnt++;
2249 	/* save a pointer to the received mbuf */
2250 	m = rx->info[idx].m;
2251 	/* try to replace the received mbuf */
2252 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
2253 		/* drop the frame -- the old mbuf is re-cycled */
2254 		ifp->if_ierrors++;
2255 		return;
2256 	}
2257 
2258 	/* unmap the received buffer */
2259 	old_map = rx->info[idx].map;
2260 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2261 	bus_dmamap_unload(rx->dmat, old_map);
2262 
2263 	/* swap the bus_dmamap_t's */
2264 	rx->info[idx].map = rx->extra_map;
2265 	rx->extra_map = old_map;
2266 
2267 	/* the mcp implicitly skips the first 2 bytes so that the
2268 	 * packet is properly aligned */
2269 	m->m_data += MXGEFW_PAD;
2270 
2271 	m->m_pkthdr.rcvif = ifp;
2272 	m->m_len = m->m_pkthdr.len = len;
2273 	ifp->if_ipackets++;
2274 	eh = mtod(m, struct ether_header *);
2275 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2276 		mxge_vlan_tag_remove(m, &csum);
2277 	}
2278 	/* if the checksum is valid, mark it in the mbuf header */
2279 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2280 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2281 			return;
2282 		/* otherwise, it was a UDP frame, or a TCP frame which
2283 		   we could not do LRO on.  Tell the stack that the
2284 		   checksum is good */
2285 		m->m_pkthdr.csum_data = 0xffff;
2286 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2287 	}
2288 
2289 	/* pass the frame up the stack */
2290 	(*ifp->if_input)(ifp, m);
2291 }
2292 
2293 static inline void
2294 mxge_clean_rx_done(mxge_softc_t *sc)
2295 {
2296 	mxge_rx_done_t *rx_done = &sc->rx_done;
2297 	struct lro_entry *lro;
2298 	int limit = 0;
2299 	uint16_t length;
2300 	uint16_t checksum;
2301 
2302 
2303 	while (rx_done->entry[rx_done->idx].length != 0) {
2304 		length = ntohs(rx_done->entry[rx_done->idx].length);
2305 		rx_done->entry[rx_done->idx].length = 0;
2306 		checksum = rx_done->entry[rx_done->idx].checksum;
2307 		if (length <= (MHLEN - MXGEFW_PAD))
2308 			mxge_rx_done_small(sc, length, checksum);
2309 		else
2310 			mxge_rx_done_big(sc, length, checksum);
2311 		rx_done->cnt++;
2312 		rx_done->idx = rx_done->cnt & rx_done->mask;
2313 
2314 		/* limit potential for livelock */
2315 		if (__predict_false(++limit > rx_done->mask / 2))
2316 			break;
2317 	}
2318 	while (!SLIST_EMPTY(&sc->lro_active)) {
2319 		lro = SLIST_FIRST(&sc->lro_active);
2320 		SLIST_REMOVE_HEAD(&sc->lro_active, next);
2321 		mxge_lro_flush(sc, lro);
2322 	}
2323 }
2324 
2325 
2326 static inline void
2327 mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2328 {
2329 	struct ifnet *ifp;
2330 	mxge_tx_buf_t *tx;
2331 	struct mbuf *m;
2332 	bus_dmamap_t map;
2333 	int idx;
2334 
2335 	tx = &sc->tx;
2336 	ifp = sc->ifp;
2337 	while (tx->pkt_done != mcp_idx) {
2338 		idx = tx->done & tx->mask;
2339 		tx->done++;
2340 		m = tx->info[idx].m;
2341 		/* mbuf and DMA map only attached to the first
2342 		   segment per-mbuf */
2343 		if (m != NULL) {
2344 			ifp->if_opackets++;
2345 			tx->info[idx].m = NULL;
2346 			map = tx->info[idx].map;
2347 			bus_dmamap_unload(tx->dmat, map);
2348 			m_freem(m);
2349 		}
2350 		if (tx->info[idx].flag) {
2351 			tx->info[idx].flag = 0;
2352 			tx->pkt_done++;
2353 		}
2354 	}
2355 
2356 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2357 	   it's OK to send packets */
2358 
2359 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2360 	    tx->req - tx->done < (tx->mask + 1)/4) {
2361 		mtx_lock(&sc->tx_mtx);
2362 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2363 		sc->tx.wake++;
2364 		mxge_start_locked(sc);
2365 		mtx_unlock(&sc->tx_mtx);
2366 	}
2367 }
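/*
 * Note (editorial): the (tx->mask + 1) / 4 threshold above adds
 * hysteresis -- once the ring fills and IFF_DRV_OACTIVE is set, the
 * stack is re-enabled only after completions drain the in-flight
 * count below a quarter of the ring, rather than on every freed slot.
 */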
2368 
2369 static void
2370 mxge_intr(void *arg)
2371 {
2372 	mxge_softc_t *sc = arg;
2373 	mcp_irq_data_t *stats = sc->fw_stats;
2374 	mxge_tx_buf_t *tx = &sc->tx;
2375 	mxge_rx_done_t *rx_done = &sc->rx_done;
2376 	uint32_t send_done_count;
2377 	uint8_t valid;
2378 
2379 
2380 	/* make sure the DMA has finished */
2381 	if (!stats->valid) {
2382 		return;
2383 	}
2384 	valid = stats->valid;
2385 
2386 	if (!sc->msi_enabled) {
2387 		/* lower legacy IRQ  */
2388 		*sc->irq_deassert = 0;
2389 		if (!mxge_deassert_wait)
2390 			/* don't wait for confirmation that the irq is low */
2391 			stats->valid = 0;
2392 	} else {
2393 		stats->valid = 0;
2394 	}
2395 
2396 	/* loop while waiting for legacy irq deassertion */
2397 	do {
2398 		/* check for transmit completes and receives */
2399 		send_done_count = be32toh(stats->send_done_count);
2400 		while ((send_done_count != tx->pkt_done) ||
2401 		       (rx_done->entry[rx_done->idx].length != 0)) {
2402 			mxge_tx_done(sc, (int)send_done_count);
2403 			mxge_clean_rx_done(sc);
2404 			send_done_count = be32toh(stats->send_done_count);
2405 		}
2406 	} while (*((volatile uint8_t *) &stats->valid));
2407 
2408 	if (__predict_false(stats->stats_updated)) {
2409 		if (sc->link_state != stats->link_up) {
2410 			sc->link_state = stats->link_up;
2411 			if (sc->link_state) {
2412 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2413 				if (mxge_verbose)
2414 					device_printf(sc->dev, "link up\n");
2415 			} else {
2416 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2417 				if (mxge_verbose)
2418 					device_printf(sc->dev, "link down\n");
2419 			}
2420 		}
2421 		if (sc->rdma_tags_available !=
2422 		    be32toh(sc->fw_stats->rdma_tags_available)) {
2423 			sc->rdma_tags_available =
2424 				be32toh(sc->fw_stats->rdma_tags_available);
2425 			device_printf(sc->dev, "RDMA timed out! %d tags "
2426 				      "left\n", sc->rdma_tags_available);
2427 		}
2428 		sc->down_cnt += stats->link_down;
2429 	}
2430 
2431 	/* check to see if we have rx token to pass back */
2432 	if (valid & 0x1)
2432 		*sc->irq_claim = be32toh(3);
2434 	*(sc->irq_claim + 1) = be32toh(3);
2435 }
2436 
2437 static void
2438 mxge_init(void *arg)
2439 {
2440 }
2441 
2442 
2443 
2444 static void
2445 mxge_free_mbufs(mxge_softc_t *sc)
2446 {
2447 	int i;
2448 
2449 	for (i = 0; i <= sc->rx_big.mask; i++) {
2450 		if (sc->rx_big.info[i].m == NULL)
2451 			continue;
2452 		bus_dmamap_unload(sc->rx_big.dmat,
2453 				  sc->rx_big.info[i].map);
2454 		m_freem(sc->rx_big.info[i].m);
2455 		sc->rx_big.info[i].m = NULL;
2456 	}
2457 
2458 	for (i = 0; i <= sc->rx_small.mask; i++) {
2459 		if (sc->rx_small.info[i].m == NULL)
2460 			continue;
2461 		bus_dmamap_unload(sc->rx_small.dmat,
2462 				  sc->rx_small.info[i].map);
2463 		m_freem(sc->rx_small.info[i].m);
2464 		sc->rx_small.info[i].m = NULL;
2465 	}
2466 
2467 	for (i = 0; i <= sc->tx.mask; i++) {
2468 		sc->tx.info[i].flag = 0;
2469 		if (sc->tx.info[i].m == NULL)
2470 			continue;
2471 		bus_dmamap_unload(sc->tx.dmat,
2472 				  sc->tx.info[i].map);
2473 		m_freem(sc->tx.info[i].m);
2474 		sc->tx.info[i].m = NULL;
2475 	}
2476 }
2477 
2478 static void
2479 mxge_free_rings(mxge_softc_t *sc)
2480 {
2481 	int i;
2482 
2483 	if (sc->rx_done.entry != NULL)
2484 		mxge_dma_free(&sc->rx_done.dma);
2485 	sc->rx_done.entry = NULL;
2486 	if (sc->tx.req_bytes != NULL)
2487 		free(sc->tx.req_bytes, M_DEVBUF);
2488 	if (sc->tx.seg_list != NULL)
2489 		free(sc->tx.seg_list, M_DEVBUF);
2490 	if (sc->rx_small.shadow != NULL)
2491 		free(sc->rx_small.shadow, M_DEVBUF);
2492 	if (sc->rx_big.shadow != NULL)
2493 		free(sc->rx_big.shadow, M_DEVBUF);
2494 	if (sc->tx.info != NULL) {
2495 		if (sc->tx.dmat != NULL) {
2496 			for (i = 0; i <= sc->tx.mask; i++) {
2497 				bus_dmamap_destroy(sc->tx.dmat,
2498 						   sc->tx.info[i].map);
2499 			}
2500 			bus_dma_tag_destroy(sc->tx.dmat);
2501 		}
2502 		free(sc->tx.info, M_DEVBUF);
2503 	}
2504 	if (sc->rx_small.info != NULL) {
2505 		if (sc->rx_small.dmat != NULL) {
2506 			for (i = 0; i <= sc->rx_small.mask; i++) {
2507 				bus_dmamap_destroy(sc->rx_small.dmat,
2508 						   sc->rx_small.info[i].map);
2509 			}
2510 			bus_dmamap_destroy(sc->rx_small.dmat,
2511 					   sc->rx_small.extra_map);
2512 			bus_dma_tag_destroy(sc->rx_small.dmat);
2513 		}
2514 		free(sc->rx_small.info, M_DEVBUF);
2515 	}
2516 	if (sc->rx_big.info != NULL) {
2517 		if (sc->rx_big.dmat != NULL) {
2518 			for (i = 0; i <= sc->rx_big.mask; i++) {
2519 				bus_dmamap_destroy(sc->rx_big.dmat,
2520 						   sc->rx_big.info[i].map);
2521 			}
2522 			bus_dmamap_destroy(sc->rx_big.dmat,
2523 					   sc->rx_big.extra_map);
2524 			bus_dma_tag_destroy(sc->rx_big.dmat);
2525 		}
2526 		free(sc->rx_big.info, M_DEVBUF);
2527 	}
2528 }
2529 
2530 static int
2531 mxge_alloc_rings(mxge_softc_t *sc)
2532 {
2533 	mxge_cmd_t cmd;
2534 	int tx_ring_size, rx_ring_size;
2535 	int tx_ring_entries, rx_ring_entries;
2536 	int i, err;
2537 	unsigned long bytes;
2538 
2539 	/* get ring sizes */
2540 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2541 	tx_ring_size = cmd.data0;
2542 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
2543 	if (err != 0) {
2544 		device_printf(sc->dev, "Cannot determine ring sizes\n");
2545 		goto abort_with_nothing;
2546 	}
2547 
2548 	rx_ring_size = cmd.data0;
2549 
2550 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
2551 	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
2552 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
2553 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
2554 	IFQ_SET_READY(&sc->ifp->if_snd);
2555 
2556 	sc->tx.mask = tx_ring_entries - 1;
2557 	sc->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2558 	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
2559 	sc->rx_done.mask = (2 * rx_ring_entries) - 1;
2560 
2561 	err = ENOMEM;
2562 
2563 	/* allocate interrupt queues */
2564 	bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
2565 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2566 	if (err != 0)
2567 		goto abort_with_nothing;
2568 	sc->rx_done.entry = sc->rx_done.dma.addr;
2569 	bzero(sc->rx_done.entry, bytes);
2570 
2571 	/* allocate the tx request copy block */
2572 	bytes = 8 +
2573 		sizeof (*sc->tx.req_list) * (sc->tx.max_desc + 4);
2574 	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2575 	if (sc->tx.req_bytes == NULL)
2576 		goto abort_with_alloc;
2577 	/* ensure req_list entries are aligned to 8 bytes */
2578 	sc->tx.req_list = (mcp_kreq_ether_send_t *)
2579 		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
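	/*
	 * Editorial note: the 8 spare bytes allocated above guarantee
	 * that rounding req_bytes up to the next 8-byte boundary
	 * ((p + 7) & ~7) cannot run past the allocation; e.g. a
	 * pointer ending in ...5 rounds up to ...8.
	 */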
2580 
2581 	/* allocate the tx busdma segment list */
2582 	bytes = sizeof (*sc->tx.seg_list) * sc->tx.max_desc;
2583 	sc->tx.seg_list = (bus_dma_segment_t *)
2584 		malloc(bytes, M_DEVBUF, M_WAITOK);
2585 	if (sc->tx.seg_list == NULL)
2586 		goto abort_with_alloc;
2587 
2588 	/* allocate the rx shadow rings */
2589 	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
2590 	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2591 	if (sc->rx_small.shadow == NULL)
2592 		goto abort_with_alloc;
2593 
2594 	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
2595 	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2596 	if (sc->rx_big.shadow == NULL)
2597 		goto abort_with_alloc;
2598 
2599 	/* allocate the host info rings */
2600 	bytes = tx_ring_entries * sizeof (*sc->tx.info);
2601 	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2602 	if (sc->tx.info == NULL)
2603 		goto abort_with_alloc;
2604 
2605 	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
2606 	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2607 	if (sc->rx_small.info == NULL)
2608 		goto abort_with_alloc;
2609 
2610 	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
2611 	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2612 	if (sc->rx_big.info == NULL)
2613 		goto abort_with_alloc;
2614 
2615 	/* allocate the busdma resources */
2616 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2617 				 1,			/* alignment */
2618 				 sc->tx.boundary,	/* boundary */
2619 				 BUS_SPACE_MAXADDR,	/* low */
2620 				 BUS_SPACE_MAXADDR,	/* high */
2621 				 NULL, NULL,		/* filter */
2622 				 65536 + 256,		/* maxsize */
2623 				 sc->tx.max_desc - 2,	/* num segs */
2624 				 sc->tx.boundary,	/* maxsegsize */
2625 				 BUS_DMA_ALLOCNOW,	/* flags */
2626 				 NULL, NULL,		/* lock */
2627 				 &sc->tx.dmat);		/* tag */
2628 
2629 	if (err != 0) {
2630 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
2631 			      err);
2632 		goto abort_with_alloc;
2633 	}
2634 
2635 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2636 				 1,			/* alignment */
2637 				 4096,			/* boundary */
2638 				 BUS_SPACE_MAXADDR,	/* low */
2639 				 BUS_SPACE_MAXADDR,	/* high */
2640 				 NULL, NULL,		/* filter */
2641 				 MHLEN,			/* maxsize */
2642 				 1,			/* num segs */
2643 				 MHLEN,			/* maxsegsize */
2644 				 BUS_DMA_ALLOCNOW,	/* flags */
2645 				 NULL, NULL,		/* lock */
2646 				 &sc->rx_small.dmat);	/* tag */
2647 	if (err != 0) {
2648 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2649 			      err);
2650 		goto abort_with_alloc;
2651 	}
2652 
2653 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2654 				 1,			/* alignment */
2655 				 4096,			/* boundary */
2656 				 BUS_SPACE_MAXADDR,	/* low */
2657 				 BUS_SPACE_MAXADDR,	/* high */
2658 				 NULL, NULL,		/* filter */
2659 				 3*4096,		/* maxsize */
2660 				 3,			/* num segs */
2661 				 4096,			/* maxsegsize */
2662 				 BUS_DMA_ALLOCNOW,	/* flags */
2663 				 NULL, NULL,		/* lock */
2664 				 &sc->rx_big.dmat);	/* tag */
2665 	if (err != 0) {
2666 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2667 			      err);
2668 		goto abort_with_alloc;
2669 	}
2670 
2671 	/* now use these tags to setup dmamaps for each slot
2672 	   in each ring */
2673 	for (i = 0; i <= sc->tx.mask; i++) {
2674 		err = bus_dmamap_create(sc->tx.dmat, 0,
2675 					&sc->tx.info[i].map);
2676 		if (err != 0) {
2677 			device_printf(sc->dev, "Err %d  tx dmamap\n",
2678 			      err);
2679 			goto abort_with_alloc;
2680 		}
2681 	}
2682 	for (i = 0; i <= sc->rx_small.mask; i++) {
2683 		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2684 					&sc->rx_small.info[i].map);
2685 		if (err != 0) {
2686 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2687 				      err);
2688 			goto abort_with_alloc;
2689 		}
2690 	}
2691 	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2692 				&sc->rx_small.extra_map);
2693 	if (err != 0) {
2694 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2695 			      err);
2696 		goto abort_with_alloc;
2697 	}
2698 
2699 	for (i = 0; i <= sc->rx_big.mask; i++) {
2700 		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2701 					&sc->rx_big.info[i].map);
2702 		if (err != 0) {
2703 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2704 			      err);
2705 			goto abort_with_alloc;
2706 		}
2707 	}
2708 	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2709 				&sc->rx_big.extra_map);
2710 	if (err != 0) {
2711 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2712 			      err);
2713 		goto abort_with_alloc;
2714 	}
2715 	return 0;
2716 
2717 abort_with_alloc:
2718 	mxge_free_rings(sc);
2719 
2720 abort_with_nothing:
2721 	return err;
2722 }
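/*
 * Sizing example (editorial; assumes a 16-byte mcp_kreq_ether_send_t,
 * consistent with the "complete solid 16-byte block" comments above):
 * a 16 KiB send ring would give tx_ring_entries = 16384 / 16 = 1024
 * and tx.mask = 1023.  The entry counts are powers of two, so
 * "index & mask" replaces a modulo everywhere the rings are walked.
 */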
2723 
2724 static void
2725 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
2726 {
2727 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
2728 
2729 	if (bufsize < MCLBYTES) {
2730 		/* easy, everything fits in a single buffer */
2731 		*big_buf_size = MCLBYTES;
2732 		*cl_size = MCLBYTES;
2733 		*nbufs = 1;
2734 		return;
2735 	}
2736 
2737 	if (bufsize < MJUMPAGESIZE) {
2738 		/* still easy, everything still fits in a single buffer */
2739 		*big_buf_size = MJUMPAGESIZE;
2740 		*cl_size = MJUMPAGESIZE;
2741 		*nbufs = 1;
2742 		return;
2743 	}
2744 	/* now we need to use virtually contiguous buffers */
2745 	*cl_size = MJUM9BYTES;
2746 	*big_buf_size = 4096;
2747 	*nbufs = mtu / 4096 + 1;
2748 	/* needs to be a power of two, so round up */
2749 	if (*nbufs == 3)
2750 		*nbufs = 4;
2751 }
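/*
 * Worked example (editorial): for a 9000-byte MTU, bufsize is
 * 9000 + 14 + 4 + 2 = 9020, which exceeds both MCLBYTES and
 * MJUMPAGESIZE (page-sized, typically 4096), so each frame lands in a
 * single MJUM9BYTES cluster that is advertised to the firmware as
 * nbufs = 9000 / 4096 + 1 = 3, rounded up to 4 contiguous 4KB
 * buffers.
 */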
2752 
2753 static int
2754 mxge_open(mxge_softc_t *sc)
2755 {
2756 	mxge_cmd_t cmd;
2757 	int i, err, big_bytes;
2758 	bus_dmamap_t map;
2759 	bus_addr_t bus;
2760 	struct lro_entry *lro_entry;
2761 
2762 	SLIST_INIT(&sc->lro_free);
2763 	SLIST_INIT(&sc->lro_active);
2764 
2765 	for (i = 0; i < sc->lro_cnt; i++) {
2766 		lro_entry = (struct lro_entry *)
2767 			malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
2768 		if (lro_entry == NULL) {
2769 			sc->lro_cnt = i;
2770 			break;
2771 		}
2772 		SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
2773 	}
2774 
2775 	/* Copy the MAC address in case it was overridden */
2776 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
2777 
2778 	err = mxge_reset(sc, 1);
2779 	if (err != 0) {
2780 		device_printf(sc->dev, "failed to reset\n");
2781 		return EIO;
2782 	}
2783 
2784 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
2785 			   &sc->rx_big.cl_size, &sc->rx_big.nbufs);
2786 
2787 	cmd.data0 = sc->rx_big.nbufs;
2788 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
2789 			    &cmd);
2790 	/* error is only meaningful if we're trying to set
2791 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
2792 	if (err && sc->rx_big.nbufs > 1) {
2793 		device_printf(sc->dev,
2794 			      "Failed to set always-use-n to %d\n",
2795 			      sc->rx_big.nbufs);
2796 		return EIO;
2797 	}
2798 	/* get the lanai pointers to the send and receive rings */
2799 
2800 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2801 	sc->tx.lanai =
2802 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2803 	err |= mxge_send_cmd(sc,
2804 				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2805 	sc->rx_small.lanai =
2806 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2807 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2808 	sc->rx_big.lanai =
2809 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2810 
2811 	if (err != 0) {
2812 		device_printf(sc->dev,
2813 			      "failed to get ring sizes or locations\n");
2814 		return EIO;
2815 	}
2816 
2817 	/* stock receive rings */
2818 	for (i = 0; i <= sc->rx_small.mask; i++) {
2819 		map = sc->rx_small.info[i].map;
2820 		err = mxge_get_buf_small(sc, map, i);
2821 		if (err) {
2822 			device_printf(sc->dev, "alloced %d/%d smalls\n",
2823 				      i, sc->rx_small.mask + 1);
2824 			goto abort;
2825 		}
2826 	}
2827 	for (i = 0; i <= sc->rx_big.mask; i++) {
2828 		sc->rx_big.shadow[i].addr_low = 0xffffffff;
2829 		sc->rx_big.shadow[i].addr_high = 0xffffffff;
2830 	}
2831 	for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
2832 		map = sc->rx_big.info[i].map;
2833 		err = mxge_get_buf_big(sc, map, i);
2834 		if (err) {
2835 			device_printf(sc->dev, "alloced %d/%d bigs\n",
2836 				      i, sc->rx_big.mask + 1);
2837 			goto abort;
2838 		}
2839 	}
2840 
2841 	/* Give the firmware the mtu and the big and small buffer
2842 	   sizes.  The firmware wants the big buf size to be a power
2843 	   of two. Luckily, FreeBSD's clusters are powers of two */
2844 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
2845 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2846 	cmd.data0 = MHLEN - MXGEFW_PAD;
2847 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2848 			     &cmd);
2849 	cmd.data0 = big_bytes;
2850 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2851 
2852 	if (err != 0) {
2853 		device_printf(sc->dev, "failed to setup params\n");
2854 		goto abort;
2855 	}
2856 
2857 	/* Now give the firmware the pointer to the stats block */
2858 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
2859 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
2860 	cmd.data2 = sizeof(struct mcp_irq_data);
2861 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
2862 
2863 	if (err != 0) {
2864 		bus = sc->fw_stats_dma.bus_addr;
2865 		bus += offsetof(struct mcp_irq_data, send_done_count);
2866 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
2867 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
2868 		err = mxge_send_cmd(sc,
2869 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
2870 				    &cmd);
2871 		/* Firmware cannot support multicast without STATS_DMA_V2 */
2872 		sc->fw_multicast_support = 0;
2873 	} else {
2874 		sc->fw_multicast_support = 1;
2875 	}
2876 
2877 	if (err != 0) {
2878 		device_printf(sc->dev, "failed to setup params\n");
2879 		goto abort;
2880 	}
2881 
2882 	/* Finally, start the firmware running */
2883 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
2884 	if (err) {
2885 		device_printf(sc->dev, "Couldn't bring up link\n");
2886 		goto abort;
2887 	}
2888 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
2889 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2890 
2891 	return 0;
2892 
2893 
2894 abort:
2895 	mxge_free_mbufs(sc);
2896 
2897 	return err;
2898 }
2899 
2900 static int
2901 mxge_close(mxge_softc_t *sc)
2902 {
2903 	struct lro_entry *lro_entry;
2904 	mxge_cmd_t cmd;
2905 	int err, old_down_cnt;
2906 
2907 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2908 	old_down_cnt = sc->down_cnt;
2909 	mb();
2910 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2911 	if (err) {
2912 		device_printf(sc->dev, "Couldn't bring down link\n");
2913 	}
2914 	if (old_down_cnt == sc->down_cnt) {
2915 		/* wait for down irq */
2916 		DELAY(10 * sc->intr_coal_delay);
2917 	}
2918 	if (old_down_cnt == sc->down_cnt) {
2919 		device_printf(sc->dev, "never got down irq\n");
2920 	}
2921 
2922 	mxge_free_mbufs(sc);
2923 
2924 	while (!SLIST_EMPTY(&sc->lro_free)) {
2925 		lro_entry = SLIST_FIRST(&sc->lro_free);
2926 		SLIST_REMOVE_HEAD(&sc->lro_free, next);
2927 	}
2928 	return 0;
2929 }
2930 
2931 static void
2932 mxge_setup_cfg_space(mxge_softc_t *sc)
2933 {
2934 	device_t dev = sc->dev;
2935 	int reg;
2936 	uint16_t cmd, lnk, pectl;
2937 
2938 	/* find the PCIe link width and set max read request to 4KB */
2939 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
2940 		lnk = pci_read_config(dev, reg + 0x12, 2);
2941 		sc->link_width = (lnk >> 4) & 0x3f;
2942 
2943 		pectl = pci_read_config(dev, reg + 0x8, 2);
2944 		pectl = (pectl & ~0x7000) | (5 << 12);
2945 		pci_write_config(dev, reg + 0x8, pectl, 2);
2946 	}
2947 
2948 	/* Enable DMA and Memory space access */
2949 	pci_enable_busmaster(dev);
2950 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2951 	cmd |= PCIM_CMD_MEMEN;
2952 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2953 }
2954 
2955 static uint32_t
2956 mxge_read_reboot(mxge_softc_t *sc)
2957 {
2958 	device_t dev = sc->dev;
2959 	uint32_t vs;
2960 
2961 	/* find the vendor specific offset */
2962 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
2963 		device_printf(sc->dev,
2964 			      "could not find vendor specific offset\n");
2965 		return (uint32_t)-1;
2966 	}
2967 	/* enable read32 mode */
2968 	pci_write_config(dev, vs + 0x10, 0x3, 1);
2969 	/* tell NIC which register to read */
2970 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
2971 	return (pci_read_config(dev, vs + 0x14, 4));
2972 }
2973 
2974 static void
2975 mxge_watchdog_reset(mxge_softc_t *sc)
2976 {
2977 	int err;
2978 	uint32_t reboot;
2979 	uint16_t cmd;
2980 
2981 	err = ENXIO;
2982 
2983 	device_printf(sc->dev, "Watchdog reset!\n");
2984 
2985 	/*
2986 	 * check to see if the NIC rebooted.  If it did, then all of
2987 	 * PCI config space has been reset, and things like the
2988 	 * busmaster bit will be zero.  If this is the case, then we
2989 	 * must restore PCI config space before the NIC can be used
2990 	 * again
2991 	 */
2992 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2993 	if (cmd == 0xffff) {
2994 		/*
2995 		 * maybe the watchdog caught the NIC rebooting; wait
2996 		 * up to 100ms for it to finish.  If it does not come
2997 		 * back, then give up
2998 		 */
2999 		DELAY(1000*100);
3000 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3001 		if (cmd == 0xffff) {
3002 			device_printf(sc->dev, "NIC disappeared!\n");
3003 			goto abort;
3004 		}
3005 	}
3006 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3007 		/* print the reboot status */
3008 		reboot = mxge_read_reboot(sc);
3009 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3010 			      reboot);
3011 		/* restore PCI configuration space */
3012 
3013 		/* XXXX waiting for pci_cfg_restore() to be exported */
3014 		goto abort; /* just abort for now */
3015 
3016 		/* and redo any changes we made to our config space */
3017 		mxge_setup_cfg_space(sc);
3018 	} else {
3019 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
3020 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
3021 			      sc->tx.req, sc->tx.done);
3022 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3023 			      sc->tx.pkt_done,
3024 			      be32toh(sc->fw_stats->send_done_count));
3025 	}
3026 
3027 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3028 		mxge_close(sc);
3029 		err = mxge_open(sc);
3030 	}
3031 
3032 abort:
3033 	/*
3034 	 * stop the watchdog if the nic is dead, to avoid spamming the
3035 	 * console
3036 	 */
3037 	if (err != 0) {
3038 		callout_stop(&sc->co_hdl);
3039 	}
3040 }
3041 
3042 static void
3043 mxge_watchdog(mxge_softc_t *sc)
3044 {
3045 	mxge_tx_buf_t *tx = &sc->tx;
3046 
3047 	/* see if we have outstanding transmits, which
3048 	   have been pending for more than mxge_ticks */
3049 	if (tx->req != tx->done &&
3050 	    tx->watchdog_req != tx->watchdog_done &&
3051 	    tx->done == tx->watchdog_done)
3052 		mxge_watchdog_reset(sc);
3053 
3054 	tx->watchdog_req = tx->req;
3055 	tx->watchdog_done = tx->done;
3056 }
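/*
 * Note (editorial): the reset above fires only when transmits are
 * outstanding now (req != done), were already outstanding at the
 * previous tick (watchdog_req != watchdog_done), and the completion
 * index has not advanced since then (done == watchdog_done); a busy
 * but advancing ring never trips it.
 */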
3057 
3058 static void
3059 mxge_tick(void *arg)
3060 {
3061 	mxge_softc_t *sc = arg;
3062 
3063 
3064 	/* Synchronize with possible callout reset/stop. */
3065 	if (callout_pending(&sc->co_hdl) ||
3066 	    !callout_active(&sc->co_hdl)) {
3067 		mtx_unlock(&sc->driver_mtx);
3068 		return;
3069 	}
3070 
3071 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3072 	mxge_watchdog(sc);
3073 }
3074 
3075 static int
3076 mxge_media_change(struct ifnet *ifp)
3077 {
3078 	return EINVAL;
3079 }
3080 
3081 static int
3082 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3083 {
3084 	struct ifnet *ifp = sc->ifp;
3085 	int real_mtu, old_mtu;
3086 	int err = 0;
3087 
3088 
3089 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3090 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3091 		return EINVAL;
3092 	mtx_lock(&sc->driver_mtx);
3093 	old_mtu = ifp->if_mtu;
3094 	ifp->if_mtu = mtu;
3095 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3096 		callout_stop(&sc->co_hdl);
3097 		mxge_close(sc);
3098 		err = mxge_open(sc);
3099 		if (err != 0) {
3100 			ifp->if_mtu = old_mtu;
3101 			mxge_close(sc);
3102 			(void) mxge_open(sc);
3103 		}
3104 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3105 	}
3106 	mtx_unlock(&sc->driver_mtx);
3107 	return err;
3108 }
3109 
3110 static void
3111 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3112 {
3113 	mxge_softc_t *sc = ifp->if_softc;
3114 
3115 
3116 	if (sc == NULL)
3117 		return;
3118 	ifmr->ifm_status = IFM_AVALID;
3119 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
3120 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3121 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
3122 }
3123 
3124 static int
3125 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3126 {
3127 	mxge_softc_t *sc = ifp->if_softc;
3128 	struct ifreq *ifr = (struct ifreq *)data;
3129 	int err, mask;
3130 
3131 	err = 0;
3132 	switch (command) {
3133 	case SIOCSIFADDR:
3134 	case SIOCGIFADDR:
3135 		err = ether_ioctl(ifp, command, data);
3136 		break;
3137 
3138 	case SIOCSIFMTU:
3139 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3140 		break;
3141 
3142 	case SIOCSIFFLAGS:
3143 		mtx_lock(&sc->driver_mtx);
3144 		if (ifp->if_flags & IFF_UP) {
3145 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3146 				err = mxge_open(sc);
3147 				callout_reset(&sc->co_hdl, mxge_ticks,
3148 					      mxge_tick, sc);
3149 			} else {
3150 				/* take care of promisc and allmulti
3151 				   flag changes */
3152 				mxge_change_promisc(sc,
3153 						    ifp->if_flags & IFF_PROMISC);
3154 				mxge_set_multicast_list(sc);
3155 			}
3156 		} else {
3157 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3158 				mxge_close(sc);
3159 				callout_stop(&sc->co_hdl);
3160 			}
3161 		}
3162 		mtx_unlock(&sc->driver_mtx);
3163 		break;
3164 
3165 	case SIOCADDMULTI:
3166 	case SIOCDELMULTI:
3167 		mtx_lock(&sc->driver_mtx);
3168 		mxge_set_multicast_list(sc);
3169 		mtx_unlock(&sc->driver_mtx);
3170 		break;
3171 
3172 	case SIOCSIFCAP:
3173 		mtx_lock(&sc->driver_mtx);
3174 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3175 		if (mask & IFCAP_TXCSUM) {
3176 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3177 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3178 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3179 						      | CSUM_TSO);
3180 			} else {
3181 				ifp->if_capenable |= IFCAP_TXCSUM;
3182 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3183 			}
3184 		} else if (mask & IFCAP_RXCSUM) {
3185 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3186 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3187 				sc->csum_flag = 0;
3188 			} else {
3189 				ifp->if_capenable |= IFCAP_RXCSUM;
3190 				sc->csum_flag = 1;
3191 			}
3192 		}
3193 		if (mask & IFCAP_TSO4) {
3194 			if (IFCAP_TSO4 & ifp->if_capenable) {
3195 				ifp->if_capenable &= ~IFCAP_TSO4;
3196 				ifp->if_hwassist &= ~CSUM_TSO;
3197 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3198 				ifp->if_capenable |= IFCAP_TSO4;
3199 				ifp->if_hwassist |= CSUM_TSO;
3200 			} else {
3201 				printf("mxge requires tx checksum offload"
3202 				       " be enabled to use TSO\n");
3203 				err = EINVAL;
3204 			}
3205 		}
3206 		if (mask & IFCAP_LRO) {
3207 			if (IFCAP_LRO & ifp->if_capenable)
3208 				err = mxge_change_lro_locked(sc, 0);
3209 			else
3210 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3211 		}
3212 		if (mask & IFCAP_VLAN_HWTAGGING)
3213 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3214 		mtx_unlock(&sc->driver_mtx);
3215 		VLAN_CAPABILITIES(ifp);
3216 
3217 		break;
3218 
3219 	case SIOCGIFMEDIA:
3220 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3221 				    &sc->media, command);
3222 		break;
3223 
3224 	default:
3225 		err = ENOTTY;
3226 	}
3227 	return err;
3228 }
3229 
3230 static void
3231 mxge_fetch_tunables(mxge_softc_t *sc)
3232 {
3233 
3234 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3235 			  &mxge_flow_control);
3236 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3237 			  &mxge_intr_coal_delay);
3238 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3239 			  &mxge_nvidia_ecrc_enable);
3240 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3241 			  &mxge_force_firmware);
3242 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3243 			  &mxge_deassert_wait);
3244 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3245 			  &mxge_verbose);
3246 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3247 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3248 	if (sc->lro_cnt != 0)
3249 		mxge_lro_cnt = sc->lro_cnt;
3250 
3251 	if (bootverbose)
3252 		mxge_verbose = 1;
3253 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3254 		mxge_intr_coal_delay = 30;
3255 	if (mxge_ticks == 0)
3256 		mxge_ticks = hz;
3257 	sc->pause = mxge_flow_control;
3258 
3259 }
3260 
3261 static int
3262 mxge_attach(device_t dev)
3263 {
3264 	mxge_softc_t *sc = device_get_softc(dev);
3265 	struct ifnet *ifp;
3266 	int count, rid, err;
3267 
3268 	sc->dev = dev;
3269 	mxge_fetch_tunables(sc);
3270 
3271 	err = bus_dma_tag_create(NULL,			/* parent */
3272 				 1,			/* alignment */
3273 				 4096,			/* boundary */
3274 				 BUS_SPACE_MAXADDR,	/* low */
3275 				 BUS_SPACE_MAXADDR,	/* high */
3276 				 NULL, NULL,		/* filter */
3277 				 65536 + 256,		/* maxsize */
3278 				 MXGE_MAX_SEND_DESC, 	/* num segs */
3279 				 4096,			/* maxsegsize */
3280 				 0,			/* flags */
3281 				 NULL, NULL,		/* lock */
3282 				 &sc->parent_dmat);	/* tag */
3283 
3284 	if (err != 0) {
3285 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
3286 			      err);
3287 		goto abort_with_nothing;
3288 	}
3289 
3290 	ifp = sc->ifp = if_alloc(IFT_ETHER);
3291 	if (ifp == NULL) {
3292 		device_printf(dev, "can not if_alloc()\n");
3293 		err = ENOSPC;
3294 		goto abort_with_parent_dmat;
3295 	}
3296 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
3297 		 device_get_nameunit(dev));
3298 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
3299 	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
3300 		 device_get_nameunit(dev));
3301 	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
3302 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
3303 		 "%s:drv", device_get_nameunit(dev));
3304 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
3305 		 MTX_NETWORK_LOCK, MTX_DEF);
3306 
3307 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
3308 
3309 	mxge_setup_cfg_space(sc);
3310 
3311 	/* Map the board into the kernel */
3312 	rid = PCIR_BARS;
3313 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
3314 					 ~0, 1, RF_ACTIVE);
3315 	if (sc->mem_res == NULL) {
3316 		device_printf(dev, "could not map memory\n");
3317 		err = ENXIO;
3318 		goto abort_with_lock;
3319 	}
3320 	sc->sram = rman_get_virtual(sc->mem_res);
3321 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
3322 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
3323 		device_printf(dev, "impossible memory region size %ld\n",
3324 			      rman_get_size(sc->mem_res));
3325 		err = ENXIO;
3326 		goto abort_with_mem_res;
3327 	}
3328 
3329 	/* make a NULL-terminated copy of the EEPROM strings section
3330 	   of the LANai SRAM */
3331 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
3332 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
3333 				rman_get_bushandle(sc->mem_res),
3334 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
3335 				sc->eeprom_strings,
3336 				MXGE_EEPROM_STRINGS_SIZE - 2);
3337 	err = mxge_parse_strings(sc);
3338 	if (err != 0)
3339 		goto abort_with_mem_res;
3340 
3341 	/* Enable write combining for efficient use of PCIe bus */
3342 	mxge_enable_wc(sc);
3343 
3344 	/* Allocate the out of band dma memory */
3345 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
3346 			     sizeof (mxge_cmd_t), 64);
3347 	if (err != 0)
3348 		goto abort_with_mem_res;
3349 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
3350 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
3351 	if (err != 0)
3352 		goto abort_with_cmd_dma;
3353 
3354 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
3355 			     sizeof (*sc->fw_stats), 64);
3356 	if (err != 0)
3357 		goto abort_with_zeropad_dma;
3358 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
3359 
3360 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
3361 	if (err != 0)
3362 		goto abort_with_fw_stats;
3363 
3364 	/* Add our ithread  */
3365 	count = pci_msi_count(dev);
3366 	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
3367 		rid = 1;
3368 		sc->msi_enabled = 1;
3369 	} else {
3370 		rid = 0;
3371 	}
3372 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3373 					 1, RF_SHAREABLE | RF_ACTIVE);
3374 	if (sc->irq_res == NULL) {
3375 		device_printf(dev, "could not alloc interrupt\n");
3376 		goto abort_with_dmabench;
3377 	}
3378 	if (mxge_verbose)
3379 		device_printf(dev, "using %s irq %ld\n",
3380 			      sc->msi_enabled ? "MSI" : "INTx",
3381 			      rman_get_start(sc->irq_res));
3382 	/* select & load the firmware */
3383 	err = mxge_select_firmware(sc);
3384 	if (err != 0)
3385 		goto abort_with_irq_res;
3386 	sc->intr_coal_delay = mxge_intr_coal_delay;
3387 	err = mxge_reset(sc, 0);
3388 	if (err != 0)
3389 		goto abort_with_irq_res;
3390 
3391 	err = mxge_alloc_rings(sc);
3392 	if (err != 0) {
3393 		device_printf(sc->dev, "failed to allocate rings\n");
3394 		goto abort_with_irq_res;
3395 	}
3396 
3397 	err = bus_setup_intr(sc->dev, sc->irq_res,
3398 			     INTR_TYPE_NET | INTR_MPSAFE,
3399 			     NULL, mxge_intr, sc, &sc->ih);
3400 	if (err != 0) {
3401 		goto abort_with_rings;
3402 	}
3403 	/* hook into the network stack */
3404 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3405 	ifp->if_baudrate = 100000000;
3406 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3407 		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING |
3408 		IFCAP_VLAN_HWCSUM | IFCAP_LRO;
3409 
3410 	sc->max_mtu = mxge_max_mtu(sc);
3411 	if (sc->max_mtu >= 9000)
3412 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
3413 	else
3414 		device_printf(dev, "MTU limited to %d.  Install "
3415 			      "latest firmware for 9000 byte jumbo support\n",
3416 			      sc->max_mtu - ETHER_HDR_LEN);
3417 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3418 	ifp->if_capenable = ifp->if_capabilities;
3419 	if (sc->lro_cnt == 0)
3420 		ifp->if_capenable &= ~IFCAP_LRO;
3421 	sc->csum_flag = 1;
3422 	ifp->if_init = mxge_init;
3423 	ifp->if_softc = sc;
3424 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3425 	ifp->if_ioctl = mxge_ioctl;
3426 	ifp->if_start = mxge_start;
3427 	ether_ifattach(ifp, sc->mac_addr);
3428 	/* ether_ifattach sets mtu to 1500 */
3429 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
3430 		ifp->if_mtu = 9000;
3431 
3432 	/* Initialise the ifmedia structure */
3433 	ifmedia_init(&sc->media, 0, mxge_media_change,
3434 		     mxge_media_status);
3435 	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3436 	mxge_add_sysctls(sc);
3437 	return 0;
3438 
3439 abort_with_rings:
3440 	mxge_free_rings(sc);
3441 abort_with_irq_res:
3442 	bus_release_resource(dev, SYS_RES_IRQ,
3443 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3444 	if (sc->msi_enabled)
3445 		pci_release_msi(dev);
3446 abort_with_dmabench:
3447 	mxge_dma_free(&sc->dmabench_dma);
3448 abort_with_fw_stats:
3449 	mxge_dma_free(&sc->fw_stats_dma);
3450 abort_with_zeropad_dma:
3451 	mxge_dma_free(&sc->zeropad_dma);
3452 abort_with_cmd_dma:
3453 	mxge_dma_free(&sc->cmd_dma);
3454 abort_with_mem_res:
3455 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3456 abort_with_lock:
3457 	pci_disable_busmaster(dev);
3458 	mtx_destroy(&sc->cmd_mtx);
3459 	mtx_destroy(&sc->tx_mtx);
3460 	mtx_destroy(&sc->driver_mtx);
3461 	if_free(ifp);
3462 abort_with_parent_dmat:
3463 	bus_dma_tag_destroy(sc->parent_dmat);
3464 
3465 abort_with_nothing:
3466 	return err;
3467 }
3468 
3469 static int
3470 mxge_detach(device_t dev)
3471 {
3472 	mxge_softc_t *sc = device_get_softc(dev);
3473 
3474 	if (sc->ifp->if_vlantrunk != NULL) {
3475 		device_printf(sc->dev,
3476 			      "Detach vlans before removing module\n");
3477 		return EBUSY;
3478 	}
3479 	mtx_lock(&sc->driver_mtx);
3480 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3481 		mxge_close(sc);
3482 	callout_stop(&sc->co_hdl);
3483 	mtx_unlock(&sc->driver_mtx);
3484 	ether_ifdetach(sc->ifp);
3485 	ifmedia_removeall(&sc->media);
3486 	mxge_dummy_rdma(sc, 0);
3487 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
3488 	mxge_free_rings(sc);
3489 	bus_release_resource(dev, SYS_RES_IRQ,
3490 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3491 	if (sc->msi_enabled)
3492 		pci_release_msi(dev);
3493 
3494 	sc->rx_done.entry = NULL;
3495 	mxge_dma_free(&sc->fw_stats_dma);
3496 	mxge_dma_free(&sc->dmabench_dma);
3497 	mxge_dma_free(&sc->zeropad_dma);
3498 	mxge_dma_free(&sc->cmd_dma);
3499 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3500 	pci_disable_busmaster(dev);
3501 	mtx_destroy(&sc->cmd_mtx);
3502 	mtx_destroy(&sc->tx_mtx);
3503 	mtx_destroy(&sc->driver_mtx);
3504 	if_free(sc->ifp);
3505 	bus_dma_tag_destroy(sc->parent_dmat);
3506 	return 0;
3507 }
3508 
3509 static int
3510 mxge_shutdown(device_t dev)
3511 {
3512 	return 0;
3513 }
3514 
3515 /*
3516   This file uses Myri10GE driver indentation.
3517 
3518   Local Variables:
3519   c-file-style:"linux"
3520   tab-width:8
3521   End:
3522 */
3523