/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

/* count xmits ourselves, rather than via drbr */
#define NO_SLOW_STATS
#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;


	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
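
/*
 * A hypothetical caller sketch (mxge_dma_example is not part of the
 * driver; it only illustrates the two helpers above): allocate a
 * 4KB, 4KB-aligned block, use dma.addr (kernel VA) and dma.bus_addr
 * (device-visible address), then release it.
 */
#if 0
static void
mxge_dma_example(mxge_softc_t *sc)
{
	mxge_dma_t dma;

	if (mxge_dma_alloc(sc, &dma, 4096, 4096) == 0) {
		/* ... hand dma.bus_addr to the NIC, use dma.addr ... */
		mxge_dma_free(&dma);
	}
}
#endif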

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
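
/*
 * For illustration only (values invented, not from any real board):
 * a strings blob of
 *
 *   "SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=M3F2-PCIXE-2\0\0"
 *
 * parses below to serial_number_string "123456", mac_addr
 * {0x00,0x60,0xdd,0x47,0xab,0xcd} and product_code_string
 * "M3F2-PCIXE-2".
 */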

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* the loop below advances ptr by 3 before each
			   parse, so step one char past the 'M'; the
			   first hex pair is then found at ptr + 3 */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, which means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);
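	/*
	 * For instance (made-up topology): base = 0xe0000000, bus = 5,
	 * slot = 0, func = 0 gives off = 0xe0500000.  The layout is 4KB
	 * of config space per function, 8 functions per slot and
	 * 0x100000 bytes (256 functions) per bus.
	 */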

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
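	/*
	 * A sketch of the arithmetic, with an invented result: for
	 * len = 4096 and cmd.data0 = 0x0fa00c35, the firmware moved
	 * 0x0fa0 = 4000 chunks of len bytes in 0xc35 = 3125 ticks of
	 * 0.5us, so ((cmd.data0 >> 16) * len * 2) / (cmd.data0 & 0xffff)
	 * = (4000 * 4096 * 2) / 3125 ~= 10485 MB/s (bytes per us).
	 */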

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
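
/*
 * In short, the choice made below (firmware image / tx_boundary):
 *
 *	aligned completions verified or forced	-> eth_z8e,  4096
 *	unaligned or unverifiable		-> ethp_z8e, 2048
 */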

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL)
		goto abort_with_zs;
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
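	/*
	 * (A worked example with an invented address: if buf_bytes
	 * starts at 0x...1003, then +7 gives 0x...100a and & ~7UL
	 * gives 0x...1008, the first 8-byte boundary inside
	 * buf_bytes; the + 8 in its declaration leaves room for
	 * this shift.)
	 */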
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if the adopted firmware has a bug where
	 * adopting it will cause broadcasts to be filtered unless
	 * the NIC is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
        if (err != 0) {
                return err;
        }

	if (throttle == sc->throttle)
		return 0;

        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc, 0);
		err = mxge_open(sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}
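
/*
 * Illustration (invented value): a firmware counter whose host value
 * is 1 sits in the shared stats block as 0x01000000 when read on a
 * little-endian machine; the be32toh() above recovers 1 before the
 * value is handed to sysctl_handle_int().
 */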

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control (pause frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */
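
/*
 * For example (made-up numbers): with a 512-entry ring (mask 511),
 * tx->req = 510 and cnt = 4, the loop below writes slots 1, 0 and
 * 511, in that order.  Slot 510 -- the first request, whose valid
 * flags the NIC polls -- is deliberately left for mxge_submit_req()
 * to write last.
 */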

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                wmb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
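
/*
 * Put another way: the copy loop streams the chain with the first
 * descriptor's flags forced to zero, and only the final 4-byte store
 * of the saved flags makes the chain valid, so the NIC can never
 * start parsing a half-written request.
 */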

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        wmb();
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */
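	/*
	 * A worked example with typical sizes: for an untagged
	 * Ethernet frame with minimal IP and TCP headers, ip_off = 14,
	 * ip_hl = 5 and th_off = 5, so cum_len starts at
	 * -(14 + 20 + 20) = -54.  The first 54 bytes of busdma
	 * segments become header descriptors; once cum_len crosses
	 * zero the payload begins, and each multiple of mss after
	 * that forces a TSO_CHOP/FIRST boundary in the loop below.
	 */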

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  ss->scratch);
1850 		ip = (struct ip *)(ss->scratch + ip_off);
1851 	}
1852 
1853 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1854 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1855 
1856 	/* TSO implies checksum offload on this hardware */
1857 	cksum_offset = ip_off + (ip->ip_hl << 2);
1858 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1859 
1860 
1861 	/* for TSO, pseudo_hdr_offset holds mss.
1862 	 * The firmware figures out where to put
1863 	 * the checksum by parsing the header. */
1864 	pseudo_hdr_offset = htobe16(mss);
1865 
1866 	tx = &ss->tx;
1867 	req = tx->req_list;
1868 	seg = tx->seg_list;
1869 	cnt = 0;
1870 	rdma_count = 0;
1871 	/* "rdma_count" is the number of RDMAs belonging to the
1872 	 * current packet BEFORE the current send request. For
1873 	 * non-TSO packets, this is equal to "count".
1874 	 * For TSO packets, rdma_count needs to be reset
1875 	 * to 0 after a segment cut.
1876 	 *
1877 	 * The rdma_count field of the send request is
1878 	 * the number of RDMAs of the packet starting at
1879 	 * that request. For TSO send requests with one or more cuts
1880 	 * in the middle, this is the number of RDMAs starting
1881 	 * after the last cut in the request. All previous
1882 	 * segments before the last cut implicitly have 1 RDMA.
1883 	 *
1884 	 * Since the number of RDMAs is not known beforehand,
1885 	 * it must be filled-in retroactively - after each
1886 	 * segmentation cut or at the end of the entire packet.
1887 	 */
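
	/*
	 * Worked example (illustrative): a run of 3 descriptors with
	 * no segmentation cut ends with rdma_count == 3, so the
	 * retroactive store "(req - rdma_count)->rdma_count =
	 * rdma_count" after the loop patches the run's first
	 * descriptor with 3.
	 */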
1888 
1889 	while (busdma_seg_cnt) {
1890 		/* Break the busdma segment up into pieces */
1891 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1892 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1893 		len = seg->ds_len;
1894 
1895 		while (len) {
1896 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1897 			seglen = len;
1898 			cum_len_next = cum_len + seglen;
1899 			(req-rdma_count)->rdma_count = rdma_count + 1;
1900 			if (__predict_true(cum_len >= 0)) {
1901 				/* payload */
1902 				chop = (cum_len_next > mss);
1903 				cum_len_next = cum_len_next % mss;
1904 				next_is_first = (cum_len_next == 0);
1905 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1906 				flags_next |= next_is_first *
1907 					MXGEFW_FLAGS_FIRST;
1908 				rdma_count |= -(chop | next_is_first);
1909 				rdma_count += chop & !next_is_first;
1910 			} else if (cum_len_next >= 0) {
1911 				/* header ends */
1912 				rdma_count = -1;
1913 				cum_len_next = 0;
1914 				seglen = -cum_len;
1915 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1916 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1917 					MXGEFW_FLAGS_FIRST |
1918 					(small * MXGEFW_FLAGS_SMALL);
1919 			}
1920 
1921 			req->addr_high = high_swapped;
1922 			req->addr_low = htobe32(low);
1923 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1924 			req->pad = 0;
1925 			req->rdma_count = 1;
1926 			req->length = htobe16(seglen);
1927 			req->cksum_offset = cksum_offset;
1928 			req->flags = flags | ((cum_len & 1) *
1929 					      MXGEFW_FLAGS_ALIGN_ODD);
1930 			low += seglen;
1931 			len -= seglen;
1932 			cum_len = cum_len_next;
1933 			flags = flags_next;
1934 			req++;
1935 			cnt++;
1936 			rdma_count++;
1937 			if (__predict_false(cksum_offset > seglen))
1938 				cksum_offset -= seglen;
1939 			else
1940 				cksum_offset = 0;
1941 			if (__predict_false(cnt > tx->max_desc))
1942 				goto drop;
1943 		}
1944 		busdma_seg_cnt--;
1945 		seg++;
1946 	}
1947 	(req-rdma_count)->rdma_count = rdma_count;
1948 
1949 	do {
1950 		req--;
1951 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1952 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1953 
1954 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1955 	mxge_submit_req(tx, tx->req_list, cnt);
1956 #ifdef IFNET_BUF_RING
1957 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1958 		/* tell the NIC to start polling this slice */
1959 		*tx->send_go = 1;
1960 		tx->queue_active = 1;
1961 		tx->activate++;
1962 		wmb();
1963 	}
1964 #endif
1965 	return;
1966 
1967 drop:
1968 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1969 	m_freem(m);
1970 	ss->oerrors++;
1971 	if (!once) {
1972 		printf("tx->max_desc exceeded via TSO!\n");
1973 		printf("mss = %d, %ld, %d!\n", mss,
1974 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1975 		once = 1;
1976 	}
1977 	return;
1978 
1979 }
1980 
1981 #endif /* IFCAP_TSO4 */
1982 
1983 #ifdef MXGE_NEW_VLAN_API
1984 /*
1985  * We reproduce the software vlan tag insertion from
1986  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1987  * vlan tag insertion. We need to advertise this in order to have the
1988  * vlan interface respect our csum offload flags.
1989  */
1990 static struct mbuf *
1991 mxge_vlan_tag_insert(struct mbuf *m)
1992 {
1993 	struct ether_vlan_header *evl;
1994 
1995 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1996 	if (__predict_false(m == NULL))
1997 		return NULL;
1998 	if (m->m_len < sizeof(*evl)) {
1999 		m = m_pullup(m, sizeof(*evl));
2000 		if (__predict_false(m == NULL))
2001 			return NULL;
2002 	}
2003 	/*
2004 	 * Transform the Ethernet header into an Ethernet header
2005 	 * with 802.1Q encapsulation.
2006 	 */
2007 	evl = mtod(m, struct ether_vlan_header *);
2008 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2009 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2010 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2011 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2012 	m->m_flags &= ~M_VLANTAG;
2013 	return m;
2014 }
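
/*
 * Resulting frame layout (illustrative, assuming an IPv4 payload):
 *   before: [dst 6][src 6][0x0800][payload ...]
 *   after:  [dst 6][src 6][0x8100][tag 2][0x0800][payload ...]
 */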
2015 #endif /* MXGE_NEW_VLAN_API */
2016 
2017 static void
2018 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2019 {
2020 	mxge_softc_t *sc;
2021 	mcp_kreq_ether_send_t *req;
2022 	bus_dma_segment_t *seg;
2023 	struct mbuf *m_tmp;
2024 	struct ifnet *ifp;
2025 	mxge_tx_ring_t *tx;
2026 	struct ip *ip;
2027 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2028 	uint16_t pseudo_hdr_offset;
2029         uint8_t flags, cksum_offset;
2030 
2031 
2032 	sc = ss->sc;
2033 	ifp = sc->ifp;
2034 	tx = &ss->tx;
2035 
2036 	ip_off = sizeof (struct ether_header);
2037 #ifdef MXGE_NEW_VLAN_API
2038 	if (m->m_flags & M_VLANTAG) {
2039 		m = mxge_vlan_tag_insert(m);
2040 		if (__predict_false(m == NULL))
2041 			goto drop;
2042 		ip_off += ETHER_VLAN_ENCAP_LEN;
2043 	}
2044 #endif
2045 	/* (try to) map the frame for DMA */
2046 	idx = tx->req & tx->mask;
2047 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2048 				      m, tx->seg_list, &cnt,
2049 				      BUS_DMA_NOWAIT);
2050 	if (__predict_false(err == EFBIG)) {
2051 		/* Too many segments in the chain.  Try
2052 		   to defrag */
2053 		m_tmp = m_defrag(m, M_NOWAIT);
2054 		if (m_tmp == NULL) {
2055 			goto drop;
2056 		}
2057 		ss->tx.defrag++;
2058 		m = m_tmp;
2059 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2060 					      tx->info[idx].map,
2061 					      m, tx->seg_list, &cnt,
2062 					      BUS_DMA_NOWAIT);
2063 	}
2064 	if (__predict_false(err != 0)) {
2065 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2066 			      " packet len = %d\n", err, m->m_pkthdr.len);
2067 		goto drop;
2068 	}
2069 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2070 			BUS_DMASYNC_PREWRITE);
2071 	tx->info[idx].m = m;
2072 
2073 #if IFCAP_TSO4
2074 	/* TSO is different enough, we handle it in another routine */
2075 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2076 		mxge_encap_tso(ss, m, cnt, ip_off);
2077 		return;
2078 	}
2079 #endif
2080 
2081 	req = tx->req_list;
2082 	cksum_offset = 0;
2083 	pseudo_hdr_offset = 0;
2084 	flags = MXGEFW_FLAGS_NO_TSO;
2085 
2086 	/* checksum offloading? */
2087 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2088 		/* ensure ip header is in first mbuf, copy
2089 		   it to a scratch buffer if not */
2090 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2091 			m_copydata(m, 0, ip_off + sizeof (*ip),
2092 				   ss->scratch);
2093 			ip = (struct ip *)(ss->scratch + ip_off);
2094 		} else {
2095 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2096 		}
2097 		cksum_offset = ip_off + (ip->ip_hl << 2);
2098 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2099 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2100 		req->cksum_offset = cksum_offset;
2101 		flags |= MXGEFW_FLAGS_CKSUM;
2102 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2103 	} else {
2104 		odd_flag = 0;
2105 	}
2106 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2107 		flags |= MXGEFW_FLAGS_SMALL;
2108 
2109 	/* convert segments into a request list */
2110 	cum_len = 0;
2111 	seg = tx->seg_list;
2112 	req->flags = MXGEFW_FLAGS_FIRST;
2113 	for (i = 0; i < cnt; i++) {
2114 		req->addr_low =
2115 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2116 		req->addr_high =
2117 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2118 		req->length = htobe16(seg->ds_len);
2119 		req->cksum_offset = cksum_offset;
2120 		if (cksum_offset > seg->ds_len)
2121 			cksum_offset -= seg->ds_len;
2122 		else
2123 			cksum_offset = 0;
2124 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2125 		req->pad = 0; /* complete solid 16-byte block */
2126 		req->rdma_count = 1;
2127 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2128 		cum_len += seg->ds_len;
2129 		seg++;
2130 		req++;
2131 		req->flags = 0;
2132 	}
2133 	req--;
2134 	/* pad runts to 60 bytes */
2135 	if (cum_len < 60) {
2136 		req++;
2137 		req->addr_low =
2138 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2139 		req->addr_high =
2140 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2141 		req->length = htobe16(60 - cum_len);
2142 		req->cksum_offset = 0;
2143 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2144 		req->pad = 0; /* complete solid 16-byte block */
2145 		req->rdma_count = 1;
2146 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2147 		cnt++;
2148 	}
2149 
2150 	tx->req_list[0].rdma_count = cnt;
2151 #if 0
2152 	/* print what the firmware will see */
2153 	for (i = 0; i < cnt; i++) {
2154 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2155 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2156 		    i, (int)ntohl(tx->req_list[i].addr_high),
2157 		    (int)ntohl(tx->req_list[i].addr_low),
2158 		    (int)ntohs(tx->req_list[i].length),
2159 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2160 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2161 		    tx->req_list[i].rdma_count);
2162 	}
2163 	printf("--------------\n");
2164 #endif
2165 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2166 	mxge_submit_req(tx, tx->req_list, cnt);
2167 #ifdef IFNET_BUF_RING
2168 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2169 		/* tell the NIC to start polling this slice */
2170 		*tx->send_go = 1;
2171 		tx->queue_active = 1;
2172 		tx->activate++;
2173 		wmb();
2174 	}
2175 #endif
2176 	return;
2177 
2178 drop:
2179 	m_freem(m);
2180 	ss->oerrors++;
2181 	return;
2182 }
2183 
2184 #ifdef IFNET_BUF_RING
2185 static void
2186 mxge_qflush(struct ifnet *ifp)
2187 {
2188 	mxge_softc_t *sc = ifp->if_softc;
2189 	mxge_tx_ring_t *tx;
2190 	struct mbuf *m;
2191 	int slice;
2192 
2193 	for (slice = 0; slice < sc->num_slices; slice++) {
2194 		tx = &sc->ss[slice].tx;
2195 		mtx_lock(&tx->mtx);
2196 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2197 			m_freem(m);
2198 		mtx_unlock(&tx->mtx);
2199 	}
2200 	if_qflush(ifp);
2201 }
2202 
2203 static inline void
2204 mxge_start_locked(struct mxge_slice_state *ss)
2205 {
2206 	mxge_softc_t *sc;
2207 	struct mbuf *m;
2208 	struct ifnet *ifp;
2209 	mxge_tx_ring_t *tx;
2210 
2211 	sc = ss->sc;
2212 	ifp = sc->ifp;
2213 	tx = &ss->tx;
2214 
2215 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2216 		m = drbr_dequeue(ifp, tx->br);
2217 		if (m == NULL) {
2218 			return;
2219 		}
2220 		/* let BPF see it */
2221 		BPF_MTAP(ifp, m);
2222 
2223 		/* give it to the nic */
2224 		mxge_encap(ss, m);
2225 	}
2226 	/* ran out of transmit slots */
2227 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2228 	    && (!drbr_empty(ifp, tx->br))) {
2229 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2230 		tx->stall++;
2231 	}
2232 }
2233 
2234 static int
2235 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2236 {
2237 	mxge_softc_t *sc;
2238 	struct ifnet *ifp;
2239 	mxge_tx_ring_t *tx;
2240 	int err;
2241 
2242 	sc = ss->sc;
2243 	ifp = sc->ifp;
2244 	tx = &ss->tx;
2245 
2246 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2247 	    IFF_DRV_RUNNING) {
2248 		err = drbr_enqueue(ifp, tx->br, m);
2249 		return (err);
2250 	}
2251 
2252 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2253 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2254 		/* let BPF see it */
2255 		BPF_MTAP(ifp, m);
2256 		/* give it to the nic */
2257 		mxge_encap(ss, m);
2258 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2259 		return (err);
2260 	}
2261 	if (!drbr_empty(ifp, tx->br))
2262 		mxge_start_locked(ss);
2263 	return (0);
2264 }
2265 
2266 static int
2267 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2268 {
2269 	mxge_softc_t *sc = ifp->if_softc;
2270 	struct mxge_slice_state *ss;
2271 	mxge_tx_ring_t *tx;
2272 	int err = 0;
2273 	int slice;
2274 
2275 	slice = m->m_pkthdr.flowid;
2276 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2277 
2278 	ss = &sc->ss[slice];
2279 	tx = &ss->tx;
2280 
2281 	if (mtx_trylock(&tx->mtx)) {
2282 		err = mxge_transmit_locked(ss, m);
2283 		mtx_unlock(&tx->mtx);
2284 	} else {
2285 		err = drbr_enqueue(ifp, tx->br, m);
2286 	}
2287 
2288 	return (err);
2289 }
2290 
2291 #else
2292 
2293 static inline void
2294 mxge_start_locked(struct mxge_slice_state *ss)
2295 {
2296 	mxge_softc_t *sc;
2297 	struct mbuf *m;
2298 	struct ifnet *ifp;
2299 	mxge_tx_ring_t *tx;
2300 
2301 	sc = ss->sc;
2302 	ifp = sc->ifp;
2303 	tx = &ss->tx;
2304 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2305 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2306 		if (m == NULL) {
2307 			return;
2308 		}
2309 		/* let BPF see it */
2310 		BPF_MTAP(ifp, m);
2311 
2312 		/* give it to the nic */
2313 		mxge_encap(ss, m);
2314 	}
2315 	/* ran out of transmit slots */
2316 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2317 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2318 		tx->stall++;
2319 	}
2320 }
2321 #endif
2322 static void
2323 mxge_start(struct ifnet *ifp)
2324 {
2325 	mxge_softc_t *sc = ifp->if_softc;
2326 	struct mxge_slice_state *ss;
2327 
2328 	/* only use the first slice for now */
2329 	ss = &sc->ss[0];
2330 	mtx_lock(&ss->tx.mtx);
2331 	mxge_start_locked(ss);
2332 	mtx_unlock(&ss->tx.mtx);
2333 }
2334 
2335 /*
2336  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2337  * at most 32 bytes at a time, so as to avoid involving the software
2338  * pio handler in the nic.   We re-write the first segment's low
2339  * DMA address to mark it valid only after we write the entire chunk
2340  * in a burst
2341  */
2342 static inline void
2343 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2344 		mcp_kreq_ether_recv_t *src)
2345 {
2346 	uint32_t low;
2347 
2348 	low = src->addr_low;
2349 	src->addr_low = 0xffffffff;
2350 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2351 	wmb();
2352 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2353 	wmb();
2354 	src->addr_low = low;
2355 	dst->addr_low = low;
2356 	wmb();
2357 }
2358 
2359 static int
2360 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2361 {
2362 	bus_dma_segment_t seg;
2363 	struct mbuf *m;
2364 	mxge_rx_ring_t *rx = &ss->rx_small;
2365 	int cnt, err;
2366 
2367 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2368 	if (m == NULL) {
2369 		rx->alloc_fail++;
2370 		err = ENOBUFS;
2371 		goto done;
2372 	}
2373 	m->m_len = MHLEN;
2374 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2375 				      &seg, &cnt, BUS_DMA_NOWAIT);
2376 	if (err != 0) {
2377 		m_free(m);
2378 		goto done;
2379 	}
2380 	rx->info[idx].m = m;
2381 	rx->shadow[idx].addr_low =
2382 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2383 	rx->shadow[idx].addr_high =
2384 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2385 
2386 done:
2387 	if ((idx & 7) == 7)
2388 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2389 	return err;
2390 }
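
/*
 * Note: receive buffers are published to the NIC only in aligned
 * groups of 8, so indices 7, 15, 23, ... burst out the chunk of
 * shadow entries they complete.
 */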
2391 
2392 static int
2393 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2394 {
2395 	bus_dma_segment_t seg[3];
2396 	struct mbuf *m;
2397 	mxge_rx_ring_t *rx = &ss->rx_big;
2398 	int cnt, err, i;
2399 
2400 	if (rx->cl_size == MCLBYTES)
2401 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2402 	else
2403 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2404 	if (m == NULL) {
2405 		rx->alloc_fail++;
2406 		err = ENOBUFS;
2407 		goto done;
2408 	}
2409 	m->m_len = rx->mlen;
2410 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2411 				      seg, &cnt, BUS_DMA_NOWAIT);
2412 	if (err != 0) {
2413 		m_free(m);
2414 		goto done;
2415 	}
2416 	rx->info[idx].m = m;
2417 	rx->shadow[idx].addr_low =
2418 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2419 	rx->shadow[idx].addr_high =
2420 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2421 
2422 #if MXGE_VIRT_JUMBOS
2423 	for (i = 1; i < cnt; i++) {
2424 		rx->shadow[idx + i].addr_low =
2425 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2426 		rx->shadow[idx + i].addr_high =
2427 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2428 	}
2429 #endif
2430 
2431 done:
2432 	for (i = 0; i < rx->nbufs; i++) {
2433 		if ((idx & 7) == 7) {
2434 			mxge_submit_8rx(&rx->lanai[idx - 7],
2435 					&rx->shadow[idx - 7]);
2436 		}
2437 		idx++;
2438 	}
2439 	return err;
2440 }
2441 
2442 /*
2443  *  Myri10GE hardware checksums are not valid if the sender
2444  *  padded the frame with non-zero padding.  This is because
2445  *  the firmware just does a simple 16-bit 1s complement
2446  *  checksum across the entire frame, excluding the first 14
2447  *  bytes.  It is best to simply to check the checksum and
2448  *  bytes.  It is best to simply check the checksum and
2449  */
2450 
2451 static inline uint16_t
2452 mxge_rx_csum(struct mbuf *m, int csum)
2453 {
2454 	struct ether_header *eh;
2455 	struct ip *ip;
2456 	uint16_t c;
2457 
2458 	eh = mtod(m, struct ether_header *);
2459 
2460 	/* only deal with IPv4 TCP & UDP for now */
2461 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2462 		return 1;
2463 	ip = (struct ip *)(eh + 1);
2464 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2465 			    ip->ip_p != IPPROTO_UDP))
2466 		return 1;
2467 #ifdef INET
2468 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2469 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2470 			    (ip->ip_hl << 2) + ip->ip_p));
2471 #else
2472 	c = 1;
2473 #endif
2474 	c ^= 0xffff;
2475 	return (c);
2476 }
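
/*
 * Illustrative check of the math above: a valid IPv4 header sums to
 * 0xffff (negative zero in 1s complement), so the firmware's
 * whole-frame sum reduces to the TCP/UDP segment sum.  in_pseudo()
 * folds in the missing pseudo-header pieces, and this function then
 * returns 0 exactly when the transport checksum is valid.
 */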
2477 
2478 static void
2479 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2480 {
2481 	struct ether_vlan_header *evl;
2482 	struct ether_header *eh;
2483 	uint32_t partial;
2484 
2485 	evl = mtod(m, struct ether_vlan_header *);
2486 	eh = mtod(m, struct ether_header *);
2487 
2488 	/*
2489 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2490 	 * after what the firmware thought was the end of the ethernet
2491 	 * header.
2492 	 */
2493 
2494 	/* put checksum into host byte order */
2495 	*csum = ntohs(*csum);
2496 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2497 	(*csum) += ~partial;
2498 	(*csum) +=  ((*csum) < ~partial);
2499 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2500 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2501 
2502 	/* restore checksum to network byte order;
2503 	   later consumers expect this */
2504 	*csum = htons(*csum);
2505 
2506 	/* save the tag */
2507 #ifdef MXGE_NEW_VLAN_API
2508 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2509 #else
2510 	{
2511 		struct m_tag *mtag;
2512 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2513 				   M_NOWAIT);
2514 		if (mtag == NULL)
2515 			return;
2516 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2517 		m_tag_prepend(m, mtag);
2518 	}
2519 
2520 #endif
2521 	m->m_flags |= M_VLANTAG;
2522 
2523 	/*
2524 	 * Remove the 802.1q header by copying the Ethernet
2525 	 * addresses over it and adjusting the beginning of
2526 	 * the data in the mbuf.  The encapsulated Ethernet
2527 	 * type field is already in place.
2528 	 */
2529 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2530 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2531 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2532 }
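
/*
 * Illustrative arithmetic for the fixup above: in 1s-complement
 * math, "subtracting" the 4 tag bytes is done by adding ~partial
 * with an end-around carry; the two fold steps then reduce the
 * 32-bit accumulator back to 16 bits.
 */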
2533 
2534 
2535 static inline void
2536 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2537 {
2538 	mxge_softc_t *sc;
2539 	struct ifnet *ifp;
2540 	struct mbuf *m;
2541 	struct ether_header *eh;
2542 	mxge_rx_ring_t *rx;
2543 	bus_dmamap_t old_map;
2544 	int idx;
2545 	uint16_t tcpudp_csum;
2546 
2547 	sc = ss->sc;
2548 	ifp = sc->ifp;
2549 	rx = &ss->rx_big;
2550 	idx = rx->cnt & rx->mask;
2551 	rx->cnt += rx->nbufs;
2552 	/* save a pointer to the received mbuf */
2553 	m = rx->info[idx].m;
2554 	/* try to replace the received mbuf */
2555 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2556 		/* drop the frame -- the old mbuf is re-cycled */
2557 		ifp->if_ierrors++;
2558 		return;
2559 	}
2560 
2561 	/* unmap the received buffer */
2562 	old_map = rx->info[idx].map;
2563 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2564 	bus_dmamap_unload(rx->dmat, old_map);
2565 
2566 	/* swap the bus_dmamap_t's */
2567 	rx->info[idx].map = rx->extra_map;
2568 	rx->extra_map = old_map;
2569 
2570 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2571 	 * aligned */
2572 	m->m_data += MXGEFW_PAD;
2573 
2574 	m->m_pkthdr.rcvif = ifp;
2575 	m->m_len = m->m_pkthdr.len = len;
2576 	ss->ipackets++;
2577 	eh = mtod(m, struct ether_header *);
2578 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2579 		mxge_vlan_tag_remove(m, &csum);
2580 	}
2581 	/* if the checksum is valid, mark it in the mbuf header */
2582 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2583 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2584 			return;
2585 		/* otherwise, it was a UDP frame, or a TCP frame which
2586 		   we could not do LRO on.  Tell the stack that the
2587 		   checksum is good */
2588 		m->m_pkthdr.csum_data = 0xffff;
2589 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2590 	}
2591 	/* flowid only valid if RSS hashing is enabled */
2592 	if (sc->num_slices > 1) {
2593 		m->m_pkthdr.flowid = (ss - sc->ss);
2594 		m->m_flags |= M_FLOWID;
2595 	}
2596 	/* pass the frame up the stack */
2597 	(*ifp->if_input)(ifp, m);
2598 }
2599 
2600 static inline void
2601 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2602 {
2603 	mxge_softc_t *sc;
2604 	struct ifnet *ifp;
2605 	struct ether_header *eh;
2606 	struct mbuf *m;
2607 	mxge_rx_ring_t *rx;
2608 	bus_dmamap_t old_map;
2609 	int idx;
2610 	uint16_t tcpudp_csum;
2611 
2612 	sc = ss->sc;
2613 	ifp = sc->ifp;
2614 	rx = &ss->rx_small;
2615 	idx = rx->cnt & rx->mask;
2616 	rx->cnt++;
2617 	/* save a pointer to the received mbuf */
2618 	m = rx->info[idx].m;
2619 	/* try to replace the received mbuf */
2620 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621 		/* drop the frame -- the old mbuf is re-cycled */
2622 		ifp->if_ierrors++;
2623 		return;
2624 	}
2625 
2626 	/* unmap the received buffer */
2627 	old_map = rx->info[idx].map;
2628 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629 	bus_dmamap_unload(rx->dmat, old_map);
2630 
2631 	/* swap the bus_dmamap_t's */
2632 	rx->info[idx].map = rx->extra_map;
2633 	rx->extra_map = old_map;
2634 
2635 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2636 	 * aligned */
2637 	m->m_data += MXGEFW_PAD;
2638 
2639 	m->m_pkthdr.rcvif = ifp;
2640 	m->m_len = m->m_pkthdr.len = len;
2641 	ss->ipackets++;
2642 	eh = mtod(m, struct ether_header *);
2643 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644 		mxge_vlan_tag_remove(m, &csum);
2645 	}
2646 	/* if the checksum is valid, mark it in the mbuf header */
2647 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2649 			return;
2650 		/* otherwise, it was a UDP frame, or a TCP frame which
2651 		   we could not do LRO on.  Tell the stack that the
2652 		   checksum is good */
2653 		m->m_pkthdr.csum_data = 0xffff;
2654 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2655 	}
2656 	/* flowid only valid if RSS hashing is enabled */
2657 	if (sc->num_slices > 1) {
2658 		m->m_pkthdr.flowid = (ss - sc->ss);
2659 		m->m_flags |= M_FLOWID;
2660 	}
2661 	/* pass the frame up the stack */
2662 	(*ifp->if_input)(ifp, m);
2663 }
2664 
2665 static inline void
2666 mxge_clean_rx_done(struct mxge_slice_state *ss)
2667 {
2668 	mxge_rx_done_t *rx_done = &ss->rx_done;
2669 	int limit = 0;
2670 	uint16_t length;
2671 	uint16_t checksum;
2672 
2673 
2674 	while (rx_done->entry[rx_done->idx].length != 0) {
2675 		length = ntohs(rx_done->entry[rx_done->idx].length);
2676 		rx_done->entry[rx_done->idx].length = 0;
2677 		checksum = rx_done->entry[rx_done->idx].checksum;
2678 		if (length <= (MHLEN - MXGEFW_PAD))
2679 			mxge_rx_done_small(ss, length, checksum);
2680 		else
2681 			mxge_rx_done_big(ss, length, checksum);
2682 		rx_done->cnt++;
2683 		rx_done->idx = rx_done->cnt & rx_done->mask;
2684 
2685 		/* limit potential for livelock */
2686 		if (__predict_false(++limit > rx_done->mask / 2))
2687 			break;
2688 	}
2689 #ifdef INET
2690 	while (!SLIST_EMPTY(&ss->lro_active)) {
2691 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2692 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2693 		mxge_lro_flush(ss, lro);
2694 	}
2695 #endif
2696 }
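
/*
 * Note: the livelock limit above drains at most about half the
 * completion ring per call; any remainder is picked up by the
 * caller's loop on the next pass.
 */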
2697 
2698 
2699 static inline void
2700 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2701 {
2702 	struct ifnet *ifp;
2703 	mxge_tx_ring_t *tx;
2704 	struct mbuf *m;
2705 	bus_dmamap_t map;
2706 	int idx;
2707 	int *flags;
2708 
2709 	tx = &ss->tx;
2710 	ifp = ss->sc->ifp;
2711 	while (tx->pkt_done != mcp_idx) {
2712 		idx = tx->done & tx->mask;
2713 		tx->done++;
2714 		m = tx->info[idx].m;
2715 		/* mbuf and DMA map only attached to the first
2716 		   segment per-mbuf */
2717 		if (m != NULL) {
2718 			ss->obytes += m->m_pkthdr.len;
2719 			if (m->m_flags & M_MCAST)
2720 				ss->omcasts++;
2721 			ss->opackets++;
2722 			tx->info[idx].m = NULL;
2723 			map = tx->info[idx].map;
2724 			bus_dmamap_unload(tx->dmat, map);
2725 			m_freem(m);
2726 		}
2727 		if (tx->info[idx].flag) {
2728 			tx->info[idx].flag = 0;
2729 			tx->pkt_done++;
2730 		}
2731 	}
2732 
2733 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2734 	   it's OK to send packets */
2735 #ifdef IFNET_BUF_RING
2736 	flags = &ss->if_drv_flags;
2737 #else
2738 	flags = &ifp->if_drv_flags;
2739 #endif
2740 	mtx_lock(&ss->tx.mtx);
2741 	if ((*flags) & IFF_DRV_OACTIVE &&
2742 	    tx->req - tx->done < (tx->mask + 1)/4) {
2743 		*(flags) &= ~IFF_DRV_OACTIVE;
2744 		ss->tx.wake++;
2745 		mxge_start_locked(ss);
2746 	}
2747 #ifdef IFNET_BUF_RING
2748 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2749 		/* let the NIC stop polling this queue, since there
2750 		 * are no more transmits pending */
2751 		*tx->send_stop = 1;
2752 		tx->queue_active = 0;
2753 		tx->deactivate++;
2754 		wmb();
2755 	}
2758 #endif
2759 	mtx_unlock(&ss->tx.mtx);
2760 
2761 }
2762 
2763 static struct mxge_media_type mxge_xfp_media_types[] =
2764 {
2765 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2766 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2767 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2768 	{0,		(1 << 5),	"10GBASE-ER"},
2769 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2770 	{0,		(1 << 3),	"10GBASE-SW"},
2771 	{0,		(1 << 2),	"10GBASE-LW"},
2772 	{0,		(1 << 1),	"10GBASE-EW"},
2773 	{0,		(1 << 0),	"Reserved"}
2774 };
2775 static struct mxge_media_type mxge_sfp_media_types[] =
2776 {
2777 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2778 	{0,		(1 << 7),	"Reserved"},
2779 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2780 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2781 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2782 };
2783 
2784 static void
2785 mxge_set_media(mxge_softc_t *sc, int type)
2786 {
2787 	sc->media_flags |= type;
2788 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2789 	ifmedia_set(&sc->media, sc->media_flags);
2790 }
2791 
2792 
2793 /*
2794  * Determine the media type for a NIC.  Some XFPs will identify
2795  * themselves only when their link is up, so this is initiated via a
2796  * link up interrupt.  However, this can potentially take up to
2797  * several milliseconds, so it is run via the watchdog routine, rather
2798  * than in the interrupt handler itself.   This need only be done
2799  * once, not each time the link is up.
2800  */
2801 static void
2802 mxge_media_probe(mxge_softc_t *sc)
2803 {
2804 	mxge_cmd_t cmd;
2805 	char *cage_type;
2806 	char *ptr;
2807 	struct mxge_media_type *mxge_media_types = NULL;
2808 	int i, err, ms, mxge_media_type_entries;
2809 	uint32_t byte;
2810 
2811 	sc->need_media_probe = 0;
2812 
2813 	/* if we've already set a media type, we're done */
2814 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2815 		return;
2816 
2817 	/*
2818 	 * parse the product code to determine the interface type
2819 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2820 	 * after the 3rd dash in the driver's cached copy of the
2821 	 * EEPROM's product code string.
2822 	 */
2823 	ptr = sc->product_code_string;
2824 	if (ptr == NULL) {
2825 		device_printf(sc->dev, "Missing product code\n");
		return;	/* cannot parse a missing product code */
2826 	}
2827 
2828 	for (i = 0; i < 3; i++, ptr++) {
2829 		ptr = index(ptr, '-');
2830 		if (ptr == NULL) {
2831 			device_printf(sc->dev,
2832 				      "only %d dashes in PC?!?\n", i);
2833 			return;
2834 		}
2835 	}
2836 	if (*ptr == 'C') {
2837 		/* -C is CX4 */
2838 		mxge_set_media(sc, IFM_10G_CX4);
2839 		return;
2840 	}
2841 	else if (*ptr == 'Q') {
2842 		/* -Q is Quad Ribbon Fiber */
2843 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2844 		/* FreeBSD has no media type for Quad ribbon fiber */
2845 		return;
2846 	}
2847 
2848 	if (*ptr == 'R') {
2849 		/* -R is XFP */
2850 		mxge_media_types = mxge_xfp_media_types;
2851 		mxge_media_type_entries =
2852 			sizeof (mxge_xfp_media_types) /
2853 			sizeof (mxge_xfp_media_types[0]);
2854 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2855 		cage_type = "XFP";
2856 	}
2857 
2858 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2859 		/* -S or -2S is SFP+ */
2860 		mxge_media_types = mxge_sfp_media_types;
2861 		mxge_media_type_entries =
2862 			sizeof (mxge_sfp_media_types) /
2863 			sizeof (mxge_sfp_media_types[0]);
2864 		cage_type = "SFP+";
2865 		byte = 3;
2866 	}
2867 
2868 	if (mxge_media_types == NULL) {
2869 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2870 		return;
2871 	}
2872 
2873 	/*
2874 	 * At this point we know the NIC has a module cage (XFP or
2875 	 * SFP+), so now we try to determine what is in the cage by
2876 	 * using the firmware's I2C commands to read the module's
2877 	 * 10GbE compliance register.  We read just one byte, which
2878 	 * may take over a millisecond.
2879 	 */
2880 
2881 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2882 	cmd.data1 = byte;
2883 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2884 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2885 		device_printf(sc->dev, "failed to read XFP\n");
2886 	}
2887 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2888 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2889 	}
2890 	if (err != MXGEFW_CMD_OK) {
2891 		return;
2892 	}
2893 
2894 	/* now we wait for the data to be cached */
2895 	cmd.data0 = byte;
2896 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2897 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2898 		DELAY(1000);
2899 		cmd.data0 = byte;
2900 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2901 	}
2902 	if (err != MXGEFW_CMD_OK) {
2903 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2904 			      cage_type, err, ms);
2905 		return;
2906 	}
2907 
2908 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2909 		if (mxge_verbose)
2910 			device_printf(sc->dev, "%s:%s\n", cage_type,
2911 				      mxge_media_types[0].name);
2912 		mxge_set_media(sc, mxge_media_types[0].flag);
2913 		return;
2914 	}
2915 	for (i = 1; i < mxge_media_type_entries; i++) {
2916 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2917 			if (mxge_verbose)
2918 				device_printf(sc->dev, "%s:%s\n",
2919 					      cage_type,
2920 					      mxge_media_types[i].name);
2921 
2922 			mxge_set_media(sc, mxge_media_types[i].flag);
2923 			return;
2924 		}
2925 	}
2926 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2927 		      cmd.data0);
2928 
2929 	return;
2930 }
2931 
2932 static void
2933 mxge_intr(void *arg)
2934 {
2935 	struct mxge_slice_state *ss = arg;
2936 	mxge_softc_t *sc = ss->sc;
2937 	mcp_irq_data_t *stats = ss->fw_stats;
2938 	mxge_tx_ring_t *tx = &ss->tx;
2939 	mxge_rx_done_t *rx_done = &ss->rx_done;
2940 	uint32_t send_done_count;
2941 	uint8_t valid;
2942 
2943 
2944 #ifndef IFNET_BUF_RING
2945 	/* an interrupt on a non-zero slice is implicitly valid
2946 	   since MSI-X irqs are not shared */
2947 	if (ss != sc->ss) {
2948 		mxge_clean_rx_done(ss);
2949 		*ss->irq_claim = be32toh(3);
2950 		return;
2951 	}
2952 #endif
2953 
2954 	/* make sure the DMA has finished */
2955 	if (!stats->valid) {
2956 		return;
2957 	}
2958 	valid = stats->valid;
2959 
2960 	if (sc->legacy_irq) {
2961 		/* lower legacy IRQ  */
2962 		*sc->irq_deassert = 0;
2963 		if (!mxge_deassert_wait)
2964 			/* don't wait for conf. that irq is low */
2965 			stats->valid = 0;
2966 	} else {
2967 		stats->valid = 0;
2968 	}
2969 
2970 	/* loop while waiting for legacy irq deassertion */
2971 	do {
2972 		/* check for transmit completes and receives */
2973 		send_done_count = be32toh(stats->send_done_count);
2974 		while ((send_done_count != tx->pkt_done) ||
2975 		       (rx_done->entry[rx_done->idx].length != 0)) {
2976 			if (send_done_count != tx->pkt_done)
2977 				mxge_tx_done(ss, (int)send_done_count);
2978 			mxge_clean_rx_done(ss);
2979 			send_done_count = be32toh(stats->send_done_count);
2980 		}
2981 		if (sc->legacy_irq && mxge_deassert_wait)
2982 			wmb();
2983 	} while (*((volatile uint8_t *) &stats->valid));
2984 
2985 	/* fw link & error stats meaningful only on the first slice */
2986 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2987 		if (sc->link_state != stats->link_up) {
2988 			sc->link_state = stats->link_up;
2989 			if (sc->link_state) {
2990 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2991 				if (mxge_verbose)
2992 					device_printf(sc->dev, "link up\n");
2993 			} else {
2994 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2995 				if (mxge_verbose)
2996 					device_printf(sc->dev, "link down\n");
2997 			}
2998 			sc->need_media_probe = 1;
2999 		}
3000 		if (sc->rdma_tags_available !=
3001 		    be32toh(stats->rdma_tags_available)) {
3002 			sc->rdma_tags_available =
3003 				be32toh(stats->rdma_tags_available);
3004 			device_printf(sc->dev, "RDMA timed out! %d tags "
3005 				      "left\n", sc->rdma_tags_available);
3006 		}
3007 
3008 		if (stats->link_down) {
3009 			sc->down_cnt += stats->link_down;
3010 			sc->link_state = 0;
3011 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3012 		}
3013 	}
3014 
3015 	/* check to see if we have rx token to pass back */
3016 	if (valid & 0x1)
3017 	    *ss->irq_claim = be32toh(3);
3018 	*(ss->irq_claim + 1) = be32toh(3);
3019 }
3020 
3021 static void
3022 mxge_init(void *arg)
3023 {
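	/* nothing to do here; mxge_open() performs the real initialization */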
3024 }
3025 
3026 
3027 
3028 static void
3029 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3030 {
3031 	struct lro_entry *lro_entry;
3032 	int i;
3033 
3034 	while (!SLIST_EMPTY(&ss->lro_free)) {
3035 		lro_entry = SLIST_FIRST(&ss->lro_free);
3036 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3037 		free(lro_entry, M_DEVBUF);
3038 	}
3039 
3040 	for (i = 0; i <= ss->rx_big.mask; i++) {
3041 		if (ss->rx_big.info[i].m == NULL)
3042 			continue;
3043 		bus_dmamap_unload(ss->rx_big.dmat,
3044 				  ss->rx_big.info[i].map);
3045 		m_freem(ss->rx_big.info[i].m);
3046 		ss->rx_big.info[i].m = NULL;
3047 	}
3048 
3049 	for (i = 0; i <= ss->rx_small.mask; i++) {
3050 		if (ss->rx_small.info[i].m == NULL)
3051 			continue;
3052 		bus_dmamap_unload(ss->rx_small.dmat,
3053 				  ss->rx_small.info[i].map);
3054 		m_freem(ss->rx_small.info[i].m);
3055 		ss->rx_small.info[i].m = NULL;
3056 	}
3057 
3058 	/* transmit ring used only on the first slice */
3059 	if (ss->tx.info == NULL)
3060 		return;
3061 
3062 	for (i = 0; i <= ss->tx.mask; i++) {
3063 		ss->tx.info[i].flag = 0;
3064 		if (ss->tx.info[i].m == NULL)
3065 			continue;
3066 		bus_dmamap_unload(ss->tx.dmat,
3067 				  ss->tx.info[i].map);
3068 		m_freem(ss->tx.info[i].m);
3069 		ss->tx.info[i].m = NULL;
3070 	}
3071 }
3072 
3073 static void
3074 mxge_free_mbufs(mxge_softc_t *sc)
3075 {
3076 	int slice;
3077 
3078 	for (slice = 0; slice < sc->num_slices; slice++)
3079 		mxge_free_slice_mbufs(&sc->ss[slice]);
3080 }
3081 
3082 static void
3083 mxge_free_slice_rings(struct mxge_slice_state *ss)
3084 {
3085 	int i;
3086 
3087 
3088 	if (ss->rx_done.entry != NULL)
3089 		mxge_dma_free(&ss->rx_done.dma);
3090 	ss->rx_done.entry = NULL;
3091 
3092 	if (ss->tx.req_bytes != NULL)
3093 		free(ss->tx.req_bytes, M_DEVBUF);
3094 	ss->tx.req_bytes = NULL;
3095 
3096 	if (ss->tx.seg_list != NULL)
3097 		free(ss->tx.seg_list, M_DEVBUF);
3098 	ss->tx.seg_list = NULL;
3099 
3100 	if (ss->rx_small.shadow != NULL)
3101 		free(ss->rx_small.shadow, M_DEVBUF);
3102 	ss->rx_small.shadow = NULL;
3103 
3104 	if (ss->rx_big.shadow != NULL)
3105 		free(ss->rx_big.shadow, M_DEVBUF);
3106 	ss->rx_big.shadow = NULL;
3107 
3108 	if (ss->tx.info != NULL) {
3109 		if (ss->tx.dmat != NULL) {
3110 			for (i = 0; i <= ss->tx.mask; i++) {
3111 				bus_dmamap_destroy(ss->tx.dmat,
3112 						   ss->tx.info[i].map);
3113 			}
3114 			bus_dma_tag_destroy(ss->tx.dmat);
3115 		}
3116 		free(ss->tx.info, M_DEVBUF);
3117 	}
3118 	ss->tx.info = NULL;
3119 
3120 	if (ss->rx_small.info != NULL) {
3121 		if (ss->rx_small.dmat != NULL) {
3122 			for (i = 0; i <= ss->rx_small.mask; i++) {
3123 				bus_dmamap_destroy(ss->rx_small.dmat,
3124 						   ss->rx_small.info[i].map);
3125 			}
3126 			bus_dmamap_destroy(ss->rx_small.dmat,
3127 					   ss->rx_small.extra_map);
3128 			bus_dma_tag_destroy(ss->rx_small.dmat);
3129 		}
3130 		free(ss->rx_small.info, M_DEVBUF);
3131 	}
3132 	ss->rx_small.info = NULL;
3133 
3134 	if (ss->rx_big.info != NULL) {
3135 		if (ss->rx_big.dmat != NULL) {
3136 			for (i = 0; i <= ss->rx_big.mask; i++) {
3137 				bus_dmamap_destroy(ss->rx_big.dmat,
3138 						   ss->rx_big.info[i].map);
3139 			}
3140 			bus_dmamap_destroy(ss->rx_big.dmat,
3141 					   ss->rx_big.extra_map);
3142 			bus_dma_tag_destroy(ss->rx_big.dmat);
3143 		}
3144 		free(ss->rx_big.info, M_DEVBUF);
3145 	}
3146 	ss->rx_big.info = NULL;
3147 }
3148 
3149 static void
3150 mxge_free_rings(mxge_softc_t *sc)
3151 {
3152 	int slice;
3153 
3154 	for (slice = 0; slice < sc->num_slices; slice++)
3155 		mxge_free_slice_rings(&sc->ss[slice]);
3156 }
3157 
3158 static int
3159 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3160 		       int tx_ring_entries)
3161 {
3162 	mxge_softc_t *sc = ss->sc;
3163 	size_t bytes;
3164 	int err, i;
3165 
3166 	err = ENOMEM;
3167 
3168 	/* allocate per-slice receive resources */
3169 
3170 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3171 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3172 
3173 	/* allocate the rx shadow rings */
3174 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3175 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3176 	if (ss->rx_small.shadow == NULL)
3177 		return err;
3178 
3179 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3180 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3181 	if (ss->rx_big.shadow == NULL)
3182 		return err;
3183 
3184 	/* allocate the rx host info rings */
3185 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3186 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3187 	if (ss->rx_small.info == NULL)
3188 		return err;
3189 
3190 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3191 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3192 	if (ss->rx_big.info == NULL)
3193 		return err;
3194 
3195 	/* allocate the rx busdma resources */
3196 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3197 				 1,			/* alignment */
3198 				 4096,			/* boundary */
3199 				 BUS_SPACE_MAXADDR,	/* low */
3200 				 BUS_SPACE_MAXADDR,	/* high */
3201 				 NULL, NULL,		/* filter */
3202 				 MHLEN,			/* maxsize */
3203 				 1,			/* num segs */
3204 				 MHLEN,			/* maxsegsize */
3205 				 BUS_DMA_ALLOCNOW,	/* flags */
3206 				 NULL, NULL,		/* lock */
3207 				 &ss->rx_small.dmat);	/* tag */
3208 	if (err != 0) {
3209 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3210 			      err);
3211 		return err;
3212 	}
3213 
3214 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3215 				 1,			/* alignment */
3216 #if MXGE_VIRT_JUMBOS
3217 				 4096,			/* boundary */
3218 #else
3219 				 0,			/* boundary */
3220 #endif
3221 				 BUS_SPACE_MAXADDR,	/* low */
3222 				 BUS_SPACE_MAXADDR,	/* high */
3223 				 NULL, NULL,		/* filter */
3224 				 3*4096,		/* maxsize */
3225 #if MXGE_VIRT_JUMBOS
3226 				 3,			/* num segs */
3227 				 4096,			/* maxsegsize*/
3228 #else
3229 				 1,			/* num segs */
3230 				 MJUM9BYTES,		/* maxsegsize*/
3231 #endif
3232 				 BUS_DMA_ALLOCNOW,	/* flags */
3233 				 NULL, NULL,		/* lock */
3234 				 &ss->rx_big.dmat);	/* tag */
3235 	if (err != 0) {
3236 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3237 			      err);
3238 		return err;
3239 	}
3240 	for (i = 0; i <= ss->rx_small.mask; i++) {
3241 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3242 					&ss->rx_small.info[i].map);
3243 		if (err != 0) {
3244 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3245 				      err);
3246 			return err;
3247 		}
3248 	}
3249 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3250 				&ss->rx_small.extra_map);
3251 	if (err != 0) {
3252 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3253 			      err);
3254 		return err;
3255 	}
3256 
3257 	for (i = 0; i <= ss->rx_big.mask; i++) {
3258 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3259 					&ss->rx_big.info[i].map);
3260 		if (err != 0) {
3261 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3262 				      err);
3263 			return err;
3264 		}
3265 	}
3266 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3267 				&ss->rx_big.extra_map);
3268 	if (err != 0) {
3269 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3270 			      err);
3271 		return err;
3272 	}
3273 
3274 	/* now allocate TX resources */
3275 
3276 #ifndef IFNET_BUF_RING
3277 	/* only use a single TX ring for now */
3278 	if (ss != ss->sc->ss)
3279 		return 0;
3280 #endif
3281 
3282 	ss->tx.mask = tx_ring_entries - 1;
3283 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3284 
3285 
3286 	/* allocate the tx request copy block */
3287 	bytes = 8 +
3288 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3289 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3290 	if (ss->tx.req_bytes == NULL)
3291 		return err;
3292 	/* ensure req_list entries are aligned to 8 bytes */
3293 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3294 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3295 
3296 	/* allocate the tx busdma segment list */
3297 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3298 	ss->tx.seg_list = (bus_dma_segment_t *)
3299 		malloc(bytes, M_DEVBUF, M_WAITOK);
3300 	if (ss->tx.seg_list == NULL)
3301 		return err;
3302 
3303 	/* allocate the tx host info ring */
3304 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3305 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3306 	if (ss->tx.info == NULL)
3307 		return err;
3308 
3309 	/* allocate the tx busdma resources */
3310 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3311 				 1,			/* alignment */
3312 				 sc->tx_boundary,	/* boundary */
3313 				 BUS_SPACE_MAXADDR,	/* low */
3314 				 BUS_SPACE_MAXADDR,	/* high */
3315 				 NULL, NULL,		/* filter */
3316 				 65536 + 256,		/* maxsize */
3317 				 ss->tx.max_desc - 2,	/* num segs */
3318 				 sc->tx_boundary,	/* maxsegsz */
3319 				 BUS_DMA_ALLOCNOW,	/* flags */
3320 				 NULL, NULL,		/* lock */
3321 				 &ss->tx.dmat);		/* tag */
3322 
3323 	if (err != 0) {
3324 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3325 			      err);
3326 		return err;
3327 	}
3328 
3329 	/* now use these tags to set up dmamaps for each slot
3330 	   in the ring */
3331 	for (i = 0; i <= ss->tx.mask; i++) {
3332 		err = bus_dmamap_create(ss->tx.dmat, 0,
3333 					&ss->tx.info[i].map);
3334 		if (err != 0) {
3335 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3336 				      err);
3337 			return err;
3338 		}
3339 	}
3340 	return 0;
3341 
3342 }
3343 
3344 static int
3345 mxge_alloc_rings(mxge_softc_t *sc)
3346 {
3347 	mxge_cmd_t cmd;
3348 	int tx_ring_size;
3349 	int tx_ring_entries, rx_ring_entries;
3350 	int err, slice;
3351 
3352 	/* get ring sizes */
3353 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3354 	tx_ring_size = cmd.data0;
3355 	if (err != 0) {
3356 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3357 		goto abort;
3358 	}
3359 
3360 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3361 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3362 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3363 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3364 	IFQ_SET_READY(&sc->ifp->if_snd);
3365 
3366 	for (slice = 0; slice < sc->num_slices; slice++) {
3367 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3368 					     rx_ring_entries,
3369 					     tx_ring_entries);
3370 		if (err != 0)
3371 			goto abort;
3372 	}
3373 	return 0;
3374 
3375 abort:
3376 	mxge_free_rings(sc);
3377 	return err;
3378 
3379 }
3380 
3381 
3382 static void
3383 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3384 {
3385 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3386 
3387 	if (bufsize < MCLBYTES) {
3388 		/* easy, everything fits in a single buffer */
3389 		*big_buf_size = MCLBYTES;
3390 		*cl_size = MCLBYTES;
3391 		*nbufs = 1;
3392 		return;
3393 	}
3394 
3395 	if (bufsize < MJUMPAGESIZE) {
3396 		/* still easy, everything still fits in a single buffer */
3397 		*big_buf_size = MJUMPAGESIZE;
3398 		*cl_size = MJUMPAGESIZE;
3399 		*nbufs = 1;
3400 		return;
3401 	}
3402 #if MXGE_VIRT_JUMBOS
3403 	/* now we need to use virtually contiguous buffers */
3404 	*cl_size = MJUM9BYTES;
3405 	*big_buf_size = 4096;
3406 	*nbufs = mtu / 4096 + 1;
3407 	/* needs to be a power of two, so round up */
3408 	if (*nbufs == 3)
3409 		*nbufs = 4;
3410 #else
3411 	*cl_size = MJUM9BYTES;
3412 	*big_buf_size = MJUM9BYTES;
3413 	*nbufs = 1;
3414 #endif
3415 }
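
/*
 * Worked example (illustrative): for a 9000-byte MTU, bufsize is
 * 9000 + 14 + 4 + 2 = 9020 bytes, which exceeds MJUMPAGESIZE on
 * 4KB-page machines.  With MXGE_VIRT_JUMBOS this advertises
 * 4096-byte buffers carved from one MJUM9BYTES cluster, with
 * nbufs = 9000/4096 + 1 = 3, rounded up to 4; otherwise a single
 * MJUM9BYTES cluster is used.
 */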
3416 
3417 static int
3418 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3419 {
3420 	mxge_softc_t *sc;
3421 	mxge_cmd_t cmd;
3422 	bus_dmamap_t map;
3423 	struct lro_entry *lro_entry;
3424 	int err, i, slice;
3425 
3426 
3427 	sc = ss->sc;
3428 	slice = ss - sc->ss;
3429 
3430 	SLIST_INIT(&ss->lro_free);
3431 	SLIST_INIT(&ss->lro_active);
3432 
3433 	for (i = 0; i < sc->lro_cnt; i++) {
3434 		lro_entry = (struct lro_entry *)
3435 			malloc(sizeof (*lro_entry), M_DEVBUF,
3436 			       M_NOWAIT | M_ZERO);
3437 		if (lro_entry == NULL) {
3438 			sc->lro_cnt = i;
3439 			break;
3440 		}
3441 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3442 	}
3443 	/* get the lanai pointers to the send and receive rings */
3444 
3445 	err = 0;
3446 #ifndef IFNET_BUF_RING
3447 	/* We currently only send from the first slice */
3448 	if (slice == 0) {
3449 #endif
3450 		cmd.data0 = slice;
3451 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3452 		ss->tx.lanai =
3453 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3454 		ss->tx.send_go = (volatile uint32_t *)
3455 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3456 		ss->tx.send_stop = (volatile uint32_t *)
3457 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3458 #ifndef IFNET_BUF_RING
3459 	}
3460 #endif
3461 	cmd.data0 = slice;
3462 	err |= mxge_send_cmd(sc,
3463 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3464 	ss->rx_small.lanai =
3465 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3466 	cmd.data0 = slice;
3467 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3468 	ss->rx_big.lanai =
3469 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3470 
3471 	if (err != 0) {
3472 		device_printf(sc->dev,
3473 			      "failed to get ring sizes or locations\n");
3474 		return EIO;
3475 	}
3476 
3477 	/* stock receive rings */
3478 	for (i = 0; i <= ss->rx_small.mask; i++) {
3479 		map = ss->rx_small.info[i].map;
3480 		err = mxge_get_buf_small(ss, map, i);
3481 		if (err) {
3482 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3483 				      i, ss->rx_small.mask + 1);
3484 			return ENOMEM;
3485 		}
3486 	}
3487 	for (i = 0; i <= ss->rx_big.mask; i++) {
3488 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3489 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3490 	}
3491 	ss->rx_big.nbufs = nbufs;
3492 	ss->rx_big.cl_size = cl_size;
3493 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3494 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3495 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3496 		map = ss->rx_big.info[i].map;
3497 		err = mxge_get_buf_big(ss, map, i);
3498 		if (err) {
3499 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3500 				      i, ss->rx_big.mask + 1);
3501 			return ENOMEM;
3502 		}
3503 	}
3504 	return 0;
3505 }
3506 
3507 static int
3508 mxge_open(mxge_softc_t *sc)
3509 {
3510 	mxge_cmd_t cmd;
3511 	int err, big_bytes, nbufs, slice, cl_size, i;
3512 	bus_addr_t bus;
3513 	volatile uint8_t *itable;
3514 	struct mxge_slice_state *ss;
3515 
3516 	/* Copy the MAC address in case it was overridden */
3517 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3518 
3519 	err = mxge_reset(sc, 1);
3520 	if (err != 0) {
3521 		device_printf(sc->dev, "failed to reset\n");
3522 		return EIO;
3523 	}
3524 
3525 	if (sc->num_slices > 1) {
3526 		/* setup the indirection table */
3527 		cmd.data0 = sc->num_slices;
3528 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3529 				    &cmd);
3530 
3531 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3532 				     &cmd);
3533 		if (err != 0) {
3534 			device_printf(sc->dev,
3535 				      "failed to setup rss tables\n");
3536 			return err;
3537 		}
3538 
3539 		/* just enable an identity mapping */
3540 		itable = sc->sram + cmd.data0;
3541 		for (i = 0; i < sc->num_slices; i++)
3542 			itable[i] = (uint8_t)i;
3543 
3544 		cmd.data0 = 1;
3545 		cmd.data1 = mxge_rss_hash_type;
3546 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3547 		if (err != 0) {
3548 			device_printf(sc->dev, "failed to enable slices\n");
3549 			return err;
3550 		}
3551 	}
3552 
3553 
3554 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3555 
3556 	cmd.data0 = nbufs;
3557 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3558 			    &cmd);
3559 	/* error is only meaningful if we're trying to set
3560 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3561 	if (err && nbufs > 1) {
3562 		device_printf(sc->dev,
3563 			      "Failed to set always-use-n to %d\n",
3564 			      nbufs);
3565 		return EIO;
3566 	}
3567 	/* Give the firmware the mtu and the big and small buffer
3568 	   sizes.  The firmware wants the big buf size to be a power
3569 	   of two. Luckily, FreeBSD's clusters are powers of two */
3570 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3571 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3572 	cmd.data0 = MHLEN - MXGEFW_PAD;
3573 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3574 			     &cmd);
3575 	cmd.data0 = big_bytes;
3576 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3577 
3578 	if (err != 0) {
3579 		device_printf(sc->dev, "failed to setup params\n");
3580 		goto abort;
3581 	}
3582 
3583 	/* Now give him the pointer to the stats block */
3584 	for (slice = 0;
3585 #ifdef IFNET_BUF_RING
3586 	     slice < sc->num_slices;
3587 #else
3588 	     slice < 1;
3589 #endif
3590 	     slice++) {
3591 		ss = &sc->ss[slice];
3592 		cmd.data0 =
3593 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3594 		cmd.data1 =
3595 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3596 		cmd.data2 = sizeof(struct mcp_irq_data);
3597 		cmd.data2 |= (slice << 16);
3598 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3599 	}
3600 
3601 	if (err != 0) {
3602 		bus = sc->ss->fw_stats_dma.bus_addr;
3603 		bus += offsetof(struct mcp_irq_data, send_done_count);
3604 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3605 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3606 		err = mxge_send_cmd(sc,
3607 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3608 				    &cmd);
3609 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3610 		sc->fw_multicast_support = 0;
3611 	} else {
3612 		sc->fw_multicast_support = 1;
3613 	}
3614 
3615 	if (err != 0) {
3616 		device_printf(sc->dev, "failed to setup params\n");
3617 		goto abort;
3618 	}
3619 
3620 	for (slice = 0; slice < sc->num_slices; slice++) {
3621 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3622 		if (err != 0) {
3623 			device_printf(sc->dev, "couldn't open slice %d\n",
3624 				      slice);
3625 			goto abort;
3626 		}
3627 	}
3628 
3629 	/* Finally, start the firmware running */
3630 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3631 	if (err) {
3632 		device_printf(sc->dev, "Couldn't bring up link\n");
3633 		goto abort;
3634 	}
3635 #ifdef IFNET_BUF_RING
3636 	for (slice = 0; slice < sc->num_slices; slice++) {
3637 		ss = &sc->ss[slice];
3638 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3639 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3640 	}
3641 #endif
3642 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3643 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3644 
3645 	return 0;
3646 
3647 
3648 abort:
3649 	mxge_free_mbufs(sc);
3650 
3651 	return err;
3652 }
3653 
3654 static int
3655 mxge_close(mxge_softc_t *sc, int down)
3656 {
3657 	mxge_cmd_t cmd;
3658 	int err, old_down_cnt;
3659 #ifdef IFNET_BUF_RING
3660 	struct mxge_slice_state *ss;
3661 	int slice;
3662 #endif
3663 
3664 #ifdef IFNET_BUF_RING
3665 	for (slice = 0; slice < sc->num_slices; slice++) {
3666 		ss = &sc->ss[slice];
3667 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3668 	}
3669 #endif
3670 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3671 	if (!down) {
3672 		old_down_cnt = sc->down_cnt;
3673 		wmb();
3674 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3675 		if (err) {
3676 			device_printf(sc->dev,
3677 				      "Couldn't bring down link\n");
3678 		}
3679 		if (old_down_cnt == sc->down_cnt) {
3680 			/* wait for down irq */
3681 			DELAY(10 * sc->intr_coal_delay);
3682 		}
3683 		wmb();
3684 		if (old_down_cnt == sc->down_cnt) {
3685 			device_printf(sc->dev, "never got down irq\n");
3686 		}
3687 	}
3688 	mxge_free_mbufs(sc);
3689 
3690 	return 0;
3691 }
3692 
3693 static void
3694 mxge_setup_cfg_space(mxge_softc_t *sc)
3695 {
3696 	device_t dev = sc->dev;
3697 	int reg;
3698 	uint16_t cmd, lnk, pectl;
3699 
3700 	/* find the PCIe link width and set max read request to 4KB */
3701 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3702 		lnk = pci_read_config(dev, reg + 0x12, 2);
3703 		sc->link_width = (lnk >> 4) & 0x3f;
3704 
3705 		if (sc->pectl == 0) {
3706 			pectl = pci_read_config(dev, reg + 0x8, 2);
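			/* devctl bits 14:12 select max read request; 5 => 4096 bytes */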
3707 			pectl = (pectl & ~0x7000) | (5 << 12);
3708 			pci_write_config(dev, reg + 0x8, pectl, 2);
3709 			sc->pectl = pectl;
3710 		} else {
3711 			/* restore saved pectl after watchdog reset */
3712 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3713 		}
3714 	}
3715 
3716 	/* Enable DMA and Memory space access */
3717 	pci_enable_busmaster(dev);
3718 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3719 	cmd |= PCIM_CMD_MEMEN;
3720 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3721 }
3722 
3723 static uint32_t
3724 mxge_read_reboot(mxge_softc_t *sc)
3725 {
3726 	device_t dev = sc->dev;
3727 	uint32_t vs;
3728 
3729 	/* find the vendor specific offset */
3730 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3731 		device_printf(sc->dev,
3732 			      "could not find vendor specific offset\n");
3733 		return (uint32_t)-1;
3734 	}
3735 	/* enable read32 mode */
3736 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3737 	/* tell NIC which register to read */
3738 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3739 	return (pci_read_config(dev, vs + 0x14, 4));
3740 }
3741 
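/*
 * Recover from a firmware reboot caught by the watchdog: a PCI command
 * register reading 0xffff means the device has vanished; a cleared
 * busmaster bit means config space was reset by a NIC reboot, in which
 * case TX is quiesced under the per-slice locks, config space is
 * restored, and the firmware is reloaded before reopening.
 */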
3742 static void
3743 mxge_watchdog_reset(mxge_softc_t *sc)
3744 {
3745 	struct pci_devinfo *dinfo;
3746 	struct mxge_slice_state *ss;
3747 	int err, running, s, num_tx_slices = 1;
3748 	uint32_t reboot;
3749 	uint16_t cmd;
3750 
3751 	err = ENXIO;
3752 
3753 	device_printf(sc->dev, "Watchdog reset!\n");
3754 
3755 	/*
3756 	 * check to see if the NIC rebooted.  If it did, then all of
3757 	 * PCI config space has been reset, and things like the
3758 	 * busmaster bit will be zero.  If this is the case, then we
3759 	 * must restore PCI config space before the NIC can be used
3760 	 * again
3761 	 */
3762 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3763 	if (cmd == 0xffff) {
3764 		/*
3765 		 * maybe the watchdog caught the NIC rebooting; wait
3766 		 * up to 100ms for it to finish.  If it does not come
3767 		 * back, then give up
3768 		 */
3769 		DELAY(1000*100);
3770 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3771 		if (cmd == 0xffff) {
3772 			device_printf(sc->dev, "NIC disappeared!\n");
3773 		}
3774 	}
3775 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3776 		/* print the reboot status */
3777 		reboot = mxge_read_reboot(sc);
3778 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3779 			      reboot);
3780 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3781 		if (running) {
3783 			/*
3784 			 * quiesce NIC so that TX routines will not try to
3785 			 * xmit after restoration of BAR
3786 			 */
3787 
3788 			/* Mark the link as down */
3789 			if (sc->link_state) {
3790 				sc->link_state = 0;
3791 				if_link_state_change(sc->ifp,
3792 						     LINK_STATE_DOWN);
3793 			}
3794 #ifdef IFNET_BUF_RING
3795 			num_tx_slices = sc->num_slices;
3796 #endif
3797 			/* grab all TX locks to ensure no tx  */
3798 			for (s = 0; s < num_tx_slices; s++) {
3799 				ss = &sc->ss[s];
3800 				mtx_lock(&ss->tx.mtx);
3801 			}
3802 			mxge_close(sc, 1);
3803 		}
3804 		/* restore PCI configuration space */
3805 		dinfo = device_get_ivars(sc->dev);
3806 		pci_cfg_restore(sc->dev, dinfo);
3807 
3808 		/* and redo any changes we made to our config space */
3809 		mxge_setup_cfg_space(sc);
3810 
3811 		/* reload f/w */
3812 		err = mxge_load_firmware(sc, 0);
3813 		if (err) {
3814 			device_printf(sc->dev,
3815 				      "Unable to re-load f/w\n");
3816 		}
3817 		if (running) {
3818 			if (!err)
3819 				err = mxge_open(sc);
3820 			/* release all TX locks */
3821 			for (s = 0; s < num_tx_slices; s++) {
3822 				ss = &sc->ss[s];
3823 #ifdef IFNET_BUF_RING
3824 				mxge_start_locked(ss);
3825 #endif
3826 				mtx_unlock(&ss->tx.mtx);
3827 			}
3828 		}
3829 		sc->watchdog_resets++;
3830 	} else {
3831 		device_printf(sc->dev,
3832 			      "NIC did not reboot, not resetting\n");
3833 		err = 0;
3834 	}
3835 	if (err) {
3836 		device_printf(sc->dev, "watchdog reset failed\n");
3837 	} else {
3838 		if (sc->dying == 2)
3839 			sc->dying = 0;
3840 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3841 	}
3842 }
3843 
3844 static void
3845 mxge_watchdog_task(void *arg, int pending)
3846 {
3847 	mxge_softc_t *sc = arg;
3848 
3850 	mtx_lock(&sc->driver_mtx);
3851 	mxge_watchdog_reset(sc);
3852 	mtx_unlock(&sc->driver_mtx);
3853 }
3854 
3855 static void
3856 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3857 {
3858 	tx = &sc->ss[slice].tx;
3859 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3860 	device_printf(sc->dev,
3861 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3862 		      tx->req, tx->done, tx->queue_active);
3863 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3864 			      tx->activate, tx->deactivate);
3865 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3866 		      tx->pkt_done,
3867 		      be32toh(sc->ss->fw_stats->send_done_count));
3868 }
3869 
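/*
 * A slice's transmit ring is considered stuck when requests are
 * outstanding (req != done), the ring was already behind on the
 * previous tick (watchdog_req != watchdog_done), and no completions
 * have arrived since (done == watchdog_done).  Received pause frames
 * are checked first, since flow control can legitimately stall
 * transmits without the NIC being hung.
 */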
3870 static int
3871 mxge_watchdog(mxge_softc_t *sc)
3872 {
3873 	mxge_tx_ring_t *tx;
3874 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3875 	int i, err = 0;
3876 
3877 	/* see if we have outstanding transmits, which
3878 	   have been pending for more than mxge_ticks */
3879 	for (i = 0;
3880 #ifdef IFNET_BUF_RING
3881 	     (i < sc->num_slices) && (err == 0);
3882 #else
3883 	     (i < 1) && (err == 0);
3884 #endif
3885 	     i++) {
3886 		tx = &sc->ss[i].tx;
3887 		if (tx->req != tx->done &&
3888 		    tx->watchdog_req != tx->watchdog_done &&
3889 		    tx->done == tx->watchdog_done) {
3890 			/* check for pause blocking before resetting */
3891 			if (tx->watchdog_rx_pause == rx_pause) {
3892 				mxge_warn_stuck(sc, tx, i);
3893 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3894 				return (ENXIO);
3895 			} else
3897 				device_printf(sc->dev, "Flow control blocking "
3898 					      "xmits, check link partner\n");
3899 		}
3900 
3901 		tx->watchdog_req = tx->req;
3902 		tx->watchdog_done = tx->done;
3903 		tx->watchdog_rx_pause = rx_pause;
3904 	}
3905 
3906 	if (sc->need_media_probe)
3907 		mxge_media_probe(sc);
3908 	return (err);
3909 }
3910 
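/*
 * Fold the per-slice counters into the single ifnet view and return
 * the number of packets moved since the last call; mxge_tick() treats
 * a zero return as an idle NIC and polls less often.
 */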
3911 static u_long
3912 mxge_update_stats(mxge_softc_t *sc)
3913 {
3914 	struct mxge_slice_state *ss;
3915 	u_long pkts = 0;
3916 	u_long ipackets = 0;
3917 	u_long opackets = 0;
3918 #ifdef IFNET_BUF_RING
3919 	u_long obytes = 0;
3920 	u_long omcasts = 0;
3921 	u_long odrops = 0;
3922 #endif
3923 	u_long oerrors = 0;
3924 	int slice;
3925 
3926 	for (slice = 0; slice < sc->num_slices; slice++) {
3927 		ss = &sc->ss[slice];
3928 		ipackets += ss->ipackets;
3929 		opackets += ss->opackets;
3930 #ifdef IFNET_BUF_RING
3931 		obytes += ss->obytes;
3932 		omcasts += ss->omcasts;
3933 		odrops += ss->tx.br->br_drops;
3934 #endif
3935 		oerrors += ss->oerrors;
3936 	}
3937 	pkts = (ipackets - sc->ifp->if_ipackets);
3938 	pkts += (opackets - sc->ifp->if_opackets);
3939 	sc->ifp->if_ipackets = ipackets;
3940 	sc->ifp->if_opackets = opackets;
3941 #ifdef IFNET_BUF_RING
3942 	sc->ifp->if_obytes = obytes;
3943 	sc->ifp->if_omcasts = omcasts;
3944 	sc->ifp->if_snd.ifq_drops = odrops;
3945 #endif
3946 	sc->ifp->if_oerrors = oerrors;
3947 	return pkts;
3948 }
3949 
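/*
 * Periodic housekeeping: the watchdog check itself runs only on every
 * 4th tick (watchdog_countdown), and when no packets have moved the
 * timer is re-armed at 4x the usual interval, after first verifying
 * that the idle NIC has not lost its busmaster bit to a h/w fault.
 */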
3950 static void
3951 mxge_tick(void *arg)
3952 {
3953 	mxge_softc_t *sc = arg;
3954 	u_long pkts = 0;
3955 	int err = 0;
3956 	int running, ticks;
3957 	uint16_t cmd;
3958 
3959 	ticks = mxge_ticks;
3960 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3961 	if (running) {
3962 		/* aggregate stats from different slices */
3963 		pkts = mxge_update_stats(sc);
3964 		if (!sc->watchdog_countdown) {
3965 			err = mxge_watchdog(sc);
3966 			sc->watchdog_countdown = 4;
3967 		}
3968 		sc->watchdog_countdown--;
3969 	}
3970 	if (pkts == 0) {
3971 		/* ensure NIC did not suffer h/w fault while idle */
3972 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3973 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3974 			sc->dying = 2;
3975 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3976 			err = ENXIO;
3977 		}
3978 		/* look less often if NIC is idle */
3979 		ticks *= 4;
3980 	}
3981 
3982 	if (err == 0)
3983 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3985 }
3986 
3987 static int
3988 mxge_media_change(struct ifnet *ifp)
3989 {
3990 	return EINVAL;
3991 }
3992 
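/*
 * The wire MTU is the requested payload size plus the Ethernet header
 * and one 802.1Q tag.  Receive buffers are sized when the interface is
 * opened, so a running interface must be closed and reopened for the
 * new MTU to take effect; on failure the old MTU is restored the same
 * way.
 */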
3993 static int
3994 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3995 {
3996 	struct ifnet *ifp = sc->ifp;
3997 	int real_mtu, old_mtu;
3998 	int err = 0;
3999 
4001 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4002 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4003 		return EINVAL;
4004 	mtx_lock(&sc->driver_mtx);
4005 	old_mtu = ifp->if_mtu;
4006 	ifp->if_mtu = mtu;
4007 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4008 		mxge_close(sc, 0);
4009 		err = mxge_open(sc);
4010 		if (err != 0) {
4011 			ifp->if_mtu = old_mtu;
4012 			mxge_close(sc, 0);
4013 			(void) mxge_open(sc);
4014 		}
4015 	}
4016 	mtx_unlock(&sc->driver_mtx);
4017 	return err;
4018 }
4019 
4020 static void
4021 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4022 {
4023 	mxge_softc_t *sc = ifp->if_softc;
4024 
4026 	if (sc == NULL)
4027 		return;
4028 	ifmr->ifm_status = IFM_AVALID;
4029 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4030 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
4031 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
4032 }
4033 
4034 static int
4035 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4036 {
4037 	mxge_softc_t *sc = ifp->if_softc;
4038 	struct ifreq *ifr = (struct ifreq *)data;
4039 	int err, mask;
4040 
4041 	err = 0;
4042 	switch (command) {
4043 	case SIOCSIFADDR:
4044 	case SIOCGIFADDR:
4045 		err = ether_ioctl(ifp, command, data);
4046 		break;
4047 
4048 	case SIOCSIFMTU:
4049 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4050 		break;
4051 
4052 	case SIOCSIFFLAGS:
4053 		mtx_lock(&sc->driver_mtx);
4054 		if (sc->dying) {
4055 			mtx_unlock(&sc->driver_mtx);
4056 			return EINVAL;
4057 		}
4058 		if (ifp->if_flags & IFF_UP) {
4059 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4060 				err = mxge_open(sc);
4061 			} else {
4062 				/* take care of promisc and allmulti
4063 				   flag changes */
4064 				mxge_change_promisc(sc,
4065 						    ifp->if_flags & IFF_PROMISC);
4066 				mxge_set_multicast_list(sc);
4067 			}
4068 		} else {
4069 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4070 				mxge_close(sc, 0);
4071 			}
4072 		}
4073 		mtx_unlock(&sc->driver_mtx);
4074 		break;
4075 
4076 	case SIOCADDMULTI:
4077 	case SIOCDELMULTI:
4078 		mtx_lock(&sc->driver_mtx);
4079 		mxge_set_multicast_list(sc);
4080 		mtx_unlock(&sc->driver_mtx);
4081 		break;
4082 
4083 	case SIOCSIFCAP:
4084 		mtx_lock(&sc->driver_mtx);
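		/* the XOR leaves exactly the capability bits to be toggled */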
4085 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4086 		if (mask & IFCAP_TXCSUM) {
4087 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4088 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4089 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4090 						      | CSUM_TSO);
4091 			} else {
4092 				ifp->if_capenable |= IFCAP_TXCSUM;
4093 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4094 			}
4095 		} else if (mask & IFCAP_RXCSUM) {
4096 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4097 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4098 				sc->csum_flag = 0;
4099 			} else {
4100 				ifp->if_capenable |= IFCAP_RXCSUM;
4101 				sc->csum_flag = 1;
4102 			}
4103 		}
4104 		if (mask & IFCAP_TSO4) {
4105 			if (IFCAP_TSO4 & ifp->if_capenable) {
4106 				ifp->if_capenable &= ~IFCAP_TSO4;
4107 				ifp->if_hwassist &= ~CSUM_TSO;
4108 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4109 				ifp->if_capenable |= IFCAP_TSO4;
4110 				ifp->if_hwassist |= CSUM_TSO;
4111 			} else {
4112 				printf("mxge requires tx checksum offload"
4113 				       " be enabled to use TSO\n");
4114 				err = EINVAL;
4115 			}
4116 		}
4117 		if (mask & IFCAP_LRO) {
4118 			if (IFCAP_LRO & ifp->if_capenable)
4119 				err = mxge_change_lro_locked(sc, 0);
4120 			else
4121 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4122 		}
4123 		if (mask & IFCAP_VLAN_HWTAGGING)
4124 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4125 		mtx_unlock(&sc->driver_mtx);
4126 		VLAN_CAPABILITIES(ifp);
4127 
4128 		break;
4129 
4130 	case SIOCGIFMEDIA:
4131 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4132 				    &sc->media, command);
4133 		break;
4134 
4135 	default:
4136 		err = ENOTTY;
4137 	}
4138 	return err;
4139 }
4140 
4141 static void
4142 mxge_fetch_tunables(mxge_softc_t *sc)
4143 {
4144 
4145 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4146 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4147 			  &mxge_flow_control);
4148 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4149 			  &mxge_intr_coal_delay);
4150 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4151 			  &mxge_nvidia_ecrc_enable);
4152 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4153 			  &mxge_force_firmware);
4154 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4155 			  &mxge_deassert_wait);
4156 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4157 			  &mxge_verbose);
4158 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4159 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4160 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4161 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4163 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4164 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4165 	if (sc->lro_cnt != 0)
4166 		mxge_lro_cnt = sc->lro_cnt;
4167 
4168 	if (bootverbose)
4169 		mxge_verbose = 1;
4170 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4171 		mxge_intr_coal_delay = 30;
4172 	if (mxge_ticks == 0)
4173 		mxge_ticks = hz / 2;
4174 	sc->pause = mxge_flow_control;
4175 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4176 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4177 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4178 	}
4179 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4180 	    mxge_initial_mtu < ETHER_MIN_LEN)
4181 		mxge_initial_mtu = ETHERMTU_JUMBO;
4182 
4183 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4184 		mxge_throttle = MXGE_MAX_THROTTLE;
4185 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4186 		mxge_throttle = MXGE_MIN_THROTTLE;
4187 	sc->throttle = mxge_throttle;
4188 }
4189 
4191 static void
4192 mxge_free_slices(mxge_softc_t *sc)
4193 {
4194 	struct mxge_slice_state *ss;
4195 	int i;
4196 
4198 	if (sc->ss == NULL)
4199 		return;
4200 
4201 	for (i = 0; i < sc->num_slices; i++) {
4202 		ss = &sc->ss[i];
4203 		if (ss->fw_stats != NULL) {
4204 			mxge_dma_free(&ss->fw_stats_dma);
4205 			ss->fw_stats = NULL;
4206 #ifdef IFNET_BUF_RING
4207 			if (ss->tx.br != NULL) {
4208 				drbr_free(ss->tx.br, M_DEVBUF);
4209 				ss->tx.br = NULL;
4210 			}
4211 #endif
4212 			mtx_destroy(&ss->tx.mtx);
4213 		}
4214 		if (ss->rx_done.entry != NULL) {
4215 			mxge_dma_free(&ss->rx_done.dma);
4216 			ss->rx_done.entry = NULL;
4217 		}
4218 	}
4219 	free(sc->ss, M_DEVBUF);
4220 	sc->ss = NULL;
4221 }
4222 
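/*
 * Allocate the per-slice state: an rx completion ("interrupt") queue
 * sized at twice the number of receive descriptors (presumably one
 * slot each for the small and big rx rings), plus, for the slices
 * that carry tx state (all of them with IFNET_BUF_RING, only slice 0
 * otherwise), a DMA-able firmware stats block, the tx mutex, and a
 * 2048-entry buf_ring.
 */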
4223 static int
4224 mxge_alloc_slices(mxge_softc_t *sc)
4225 {
4226 	mxge_cmd_t cmd;
4227 	struct mxge_slice_state *ss;
4228 	size_t bytes;
4229 	int err, i, max_intr_slots;
4230 
4231 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4232 	if (err != 0) {
4233 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4234 		return err;
4235 	}
4236 	sc->rx_ring_size = cmd.data0;
4237 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4238 
4239 	bytes = sizeof (*sc->ss) * sc->num_slices;
4240 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4241 	if (sc->ss == NULL)
4242 		return (ENOMEM);
4243 	for (i = 0; i < sc->num_slices; i++) {
4244 		ss = &sc->ss[i];
4245 
4246 		ss->sc = sc;
4247 
4248 		/* allocate per-slice rx interrupt queues */
4249 
4250 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4251 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4252 		if (err != 0)
4253 			goto abort;
4254 		ss->rx_done.entry = ss->rx_done.dma.addr;
4255 		bzero(ss->rx_done.entry, bytes);
4256 
4257 		/*
4258 		 * allocate the per-slice firmware stats; stats
4259 		 * (including tx) are used only on the first
4260 		 * slice for now
4261 		 */
4262 #ifndef IFNET_BUF_RING
4263 		if (i > 0)
4264 			continue;
4265 #endif
4266 
4267 		bytes = sizeof (*ss->fw_stats);
4268 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4269 				     sizeof (*ss->fw_stats), 64);
4270 		if (err != 0)
4271 			goto abort;
4272 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4273 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4274 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4275 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4276 #ifdef IFNET_BUF_RING
4277 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4278 					   &ss->tx.mtx);
4279 #endif
4280 	}
4281 
4282 	return (0);
4283 
4284 abort:
4285 	mxge_free_slices(sc);
4286 	return (ENOMEM);
4287 }
4288 
4289 static void
4290 mxge_slice_probe(mxge_softc_t *sc)
4291 {
4292 	mxge_cmd_t cmd;
4293 	char *old_fw;
4294 	int msix_cnt, status, max_intr_slots;
4295 
4296 	sc->num_slices = 1;
4297 	/*
4298 	 *  don't enable multiple slices if they have been disabled by
4299 	 *  the tunable, or if this is not an SMP system
4300 	 */
4301 
4302 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4303 		return;
4304 
4305 	/* see how many MSI-X interrupts are available */
4306 	msix_cnt = pci_msix_count(sc->dev);
4307 	if (msix_cnt < 2)
4308 		return;
4309 
4310 	/* now load the slice aware firmware and see what it supports */
4311 	old_fw = sc->fw_name;
4312 	if (old_fw == mxge_fw_aligned)
4313 		sc->fw_name = mxge_fw_rss_aligned;
4314 	else
4315 		sc->fw_name = mxge_fw_rss_unaligned;
4316 	status = mxge_load_firmware(sc, 0);
4317 	if (status != 0) {
4318 		device_printf(sc->dev, "Falling back to a single slice\n");
4319 		return;
4320 	}
4321 
4322 	/* try to send a reset command to the card to see if it
4323 	   is alive */
4324 	memset(&cmd, 0, sizeof (cmd));
4325 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4326 	if (status != 0) {
4327 		device_printf(sc->dev, "failed reset\n");
4328 		goto abort_with_fw;
4329 	}
4330 
4331 	/* get rx ring size */
4332 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4333 	if (status != 0) {
4334 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4335 		goto abort_with_fw;
4336 	}
4337 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4338 
4339 	/* tell it the size of the interrupt queues */
4340 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4341 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4342 	if (status != 0) {
4343 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4344 		goto abort_with_fw;
4345 	}
4346 
4347 	/* ask for the maximum number of slices it supports */
4348 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4349 	if (status != 0) {
4350 		device_printf(sc->dev,
4351 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4352 		goto abort_with_fw;
4353 	}
4354 	sc->num_slices = cmd.data0;
4355 	if (sc->num_slices > msix_cnt)
4356 		sc->num_slices = msix_cnt;
4357 
4358 	if (mxge_max_slices == -1) {
4359 		/* cap to number of CPUs in system */
4360 		if (sc->num_slices > mp_ncpus)
4361 			sc->num_slices = mp_ncpus;
4362 	} else {
4363 		if (sc->num_slices > mxge_max_slices)
4364 			sc->num_slices = mxge_max_slices;
4365 	}
4366 	/* make sure it is a power of two */
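	/* (n & (n - 1)) == 0 only for powers of two, so decrementing
	   here rounds down to the largest power of two <= num_slices,
	   e.g. 6 -> 5 -> 4 */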
4367 	while (sc->num_slices & (sc->num_slices - 1))
4368 		sc->num_slices--;
4369 
4370 	if (mxge_verbose)
4371 		device_printf(sc->dev, "using %d slices\n",
4372 			      sc->num_slices);
4373 
4374 	return;
4375 
4376 abort_with_fw:
4377 	sc->fw_name = old_fw;
4378 	(void) mxge_load_firmware(sc, 0);
4379 }
4380 
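/*
 * Set up one MSI-X vector per slice.  The MSI-X table lives in the
 * memory region behind BAR 2, which must remain allocated for as long
 * as the vectors are in use; after pci_alloc_msix() the individual
 * messages appear as SYS_RES_IRQ resources with rids 1..num_slices.
 */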
4381 static int
4382 mxge_add_msix_irqs(mxge_softc_t *sc)
4383 {
4384 	size_t bytes;
4385 	int count, err, i, rid;
4386 
4387 	rid = PCIR_BAR(2);
4388 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4389 						    &rid, RF_ACTIVE);
4390 
4391 	if (sc->msix_table_res == NULL) {
4392 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4393 		return ENXIO;
4394 	}
4395 
4396 	count = sc->num_slices;
4397 	err = pci_alloc_msix(sc->dev, &count);
4398 	if (err != 0) {
4399 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4400 			      "err = %d\n", sc->num_slices, err);
4401 		goto abort_with_msix_table;
4402 	}
4403 	if (count < sc->num_slices) {
4404 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4405 			      sc->num_slices, count);
4406 		device_printf(sc->dev,
4407 			      "Try setting hw.mxge.max_slices to %d\n",
4408 			      count);
4409 		err = ENOSPC;
4410 		goto abort_with_msix;
4411 	}
4412 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4413 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4414 	if (sc->msix_irq_res == NULL) {
4415 		err = ENOMEM;
4416 		goto abort_with_msix;
4417 	}
4418 
4419 	for (i = 0; i < sc->num_slices; i++) {
4420 		rid = i + 1;
4421 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4422 							  SYS_RES_IRQ,
4423 							  &rid, RF_ACTIVE);
4424 		if (sc->msix_irq_res[i] == NULL) {
4425 			device_printf(sc->dev, "couldn't allocate IRQ res"
4426 				      " for message %d\n", i);
4427 			err = ENXIO;
4428 			goto abort_with_res;
4429 		}
4430 	}
4431 
4432 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4433 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4434 
4435 	for (i = 0; i < sc->num_slices; i++) {
4436 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4437 				     INTR_TYPE_NET | INTR_MPSAFE,
4438 #if __FreeBSD_version > 700030
4439 				     NULL,
4440 #endif
4441 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4442 		if (err != 0) {
4443 			device_printf(sc->dev, "couldn't setup intr for "
4444 				      "message %d\n", i);
4445 			goto abort_with_intr;
4446 		}
4447 	}
4448 
4449 	if (mxge_verbose) {
4450 		device_printf(sc->dev, "using %d msix IRQs:",
4451 			      sc->num_slices);
4452 		for (i = 0; i < sc->num_slices; i++)
4453 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4454 		printf("\n");
4455 	}
4456 	return (0);
4457 
4458 abort_with_intr:
4459 	for (i = 0; i < sc->num_slices; i++) {
4460 		if (sc->msix_ih[i] != NULL) {
4461 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4462 					  sc->msix_ih[i]);
4463 			sc->msix_ih[i] = NULL;
4464 		}
4465 	}
4466 	free(sc->msix_ih, M_DEVBUF);
4467 
4469 abort_with_res:
4470 	for (i = 0; i < sc->num_slices; i++) {
4471 		rid = i + 1;
4472 		if (sc->msix_irq_res[i] != NULL)
4473 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4474 					     sc->msix_irq_res[i]);
4475 		sc->msix_irq_res[i] = NULL;
4476 	}
4477 	free(sc->msix_irq_res, M_DEVBUF);
4478 
4480 abort_with_msix:
4481 	pci_release_msi(sc->dev);
4482 
4483 abort_with_msix_table:
4484 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4485 			     sc->msix_table_res);
4486 
4487 	return err;
4488 }
4489 
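/*
 * Single-vector fallback: prefer MSI when exactly one message is
 * available, otherwise share a legacy INTx line.  MSI uses rid 1 and
 * the legacy interrupt rid 0, which is why the error and teardown
 * paths select the rid from sc->legacy_irq.
 */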
4490 static int
4491 mxge_add_single_irq(mxge_softc_t *sc)
4492 {
4493 	int count, err, rid;
4494 
4495 	count = pci_msi_count(sc->dev);
4496 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4497 		rid = 1;
4498 	} else {
4499 		rid = 0;
4500 		sc->legacy_irq = 1;
4501 	}
4502 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4503 					 1, RF_SHAREABLE | RF_ACTIVE);
4504 	if (sc->irq_res == NULL) {
4505 		device_printf(sc->dev, "could not alloc interrupt\n");
4506 		return ENXIO;
4507 	}
4508 	if (mxge_verbose)
4509 		device_printf(sc->dev, "using %s irq %ld\n",
4510 			      sc->legacy_irq ? "INTx" : "MSI",
4511 			      rman_get_start(sc->irq_res));
4512 	err = bus_setup_intr(sc->dev, sc->irq_res,
4513 			     INTR_TYPE_NET | INTR_MPSAFE,
4514 #if __FreeBSD_version > 700030
4515 			     NULL,
4516 #endif
4517 			     mxge_intr, &sc->ss[0], &sc->ih);
4518 	if (err != 0) {
4519 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4520 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4521 		if (!sc->legacy_irq)
4522 			pci_release_msi(sc->dev);
4523 	}
4524 	return err;
4525 }
4526 
4527 static void
4528 mxge_rem_msix_irqs(mxge_softc_t *sc)
4529 {
4530 	int i, rid;
4531 
4532 	for (i = 0; i < sc->num_slices; i++) {
4533 		if (sc->msix_ih[i] != NULL) {
4534 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4535 					  sc->msix_ih[i]);
4536 			sc->msix_ih[i] = NULL;
4537 		}
4538 	}
4539 	free(sc->msix_ih, M_DEVBUF);
4540 
4541 	for (i = 0; i < sc->num_slices; i++) {
4542 		rid = i + 1;
4543 		if (sc->msix_irq_res[i] != NULL)
4544 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4545 					     sc->msix_irq_res[i]);
4546 		sc->msix_irq_res[i] = NULL;
4547 	}
4548 	free(sc->msix_irq_res, M_DEVBUF);
4549 
4550 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4551 			     sc->msix_table_res);
4552 
4553 	pci_release_msi(sc->dev);
4554 	return;
4555 }
4556 
4557 static void
4558 mxge_rem_single_irq(mxge_softc_t *sc)
4559 {
4560 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4561 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4562 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4563 	if (!sc->legacy_irq)
4564 		pci_release_msi(sc->dev);
4565 }
4566 
4567 static void
4568 mxge_rem_irq(mxge_softc_t *sc)
4569 {
4570 	if (sc->num_slices > 1)
4571 		mxge_rem_msix_irqs(sc);
4572 	else
4573 		mxge_rem_single_irq(sc);
4574 }
4575 
4576 static int
4577 mxge_add_irq(mxge_softc_t *sc)
4578 {
4579 	int err;
4580 
4581 	if (sc->num_slices > 1)
4582 		err = mxge_add_msix_irqs(sc);
4583 	else
4584 		err = mxge_add_single_irq(sc);
4585 
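	/* dead code: the "0 &&" disables this re-probe of the MSI-X
	   vectors; it appears to be a leftover debugging hook */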
4586 	if (0 && err == 0 && sc->num_slices > 1) {
4587 		mxge_rem_msix_irqs(sc);
4588 		err = mxge_add_msix_irqs(sc);
4589 	}
4590 	return err;
4591 }
4592 
4594 static int
4595 mxge_attach(device_t dev)
4596 {
4597 	mxge_softc_t *sc = device_get_softc(dev);
4598 	struct ifnet *ifp;
4599 	int err, rid;
4600 
4601 	sc->dev = dev;
4602 	mxge_fetch_tunables(sc);
4603 
4604 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4605 	sc->tq = taskqueue_create_fast("mxge_taskq", M_WAITOK,
4606 				       taskqueue_thread_enqueue,
4607 				       &sc->tq);
4608 	if (sc->tq == NULL) {
4609 		err = ENOMEM;
4610 		goto abort_with_nothing;
4611 	}
4612 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4613 				device_get_nameunit(sc->dev));
4614 
4615 	err = bus_dma_tag_create(NULL,			/* parent */
4616 				 1,			/* alignment */
4617 				 0,			/* boundary */
4618 				 BUS_SPACE_MAXADDR,	/* low */
4619 				 BUS_SPACE_MAXADDR,	/* high */
4620 				 NULL, NULL,		/* filter */
4621 				 65536 + 256,		/* maxsize */
4622 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4623 				 65536,			/* maxsegsize */
4624 				 0,			/* flags */
4625 				 NULL, NULL,		/* lock */
4626 				 &sc->parent_dmat);	/* tag */
4627 
4628 	if (err != 0) {
4629 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4630 			      err);
4631 		goto abort_with_tq;
4632 	}
4633 
4634 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4635 	if (ifp == NULL) {
4636 		device_printf(dev, "can not if_alloc()\n");
4637 		err = ENOSPC;
4638 		goto abort_with_parent_dmat;
4639 	}
4640 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4641 
4642 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4643 		 device_get_nameunit(dev));
4644 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4645 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4646 		 "%s:drv", device_get_nameunit(dev));
4647 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4648 		 MTX_NETWORK_LOCK, MTX_DEF);
4649 
4650 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4651 
4652 	mxge_setup_cfg_space(sc);
4653 
4654 	/* Map the board into the kernel */
4655 	rid = PCIR_BARS;
4656 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4657 					 ~0, 1, RF_ACTIVE);
4658 	if (sc->mem_res == NULL) {
4659 		device_printf(dev, "could not map memory\n");
4660 		err = ENXIO;
4661 		goto abort_with_lock;
4662 	}
4663 	sc->sram = rman_get_virtual(sc->mem_res);
4664 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4665 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4666 		device_printf(dev, "impossible memory region size %ld\n",
4667 			      rman_get_size(sc->mem_res));
4668 		err = ENXIO;
4669 		goto abort_with_mem_res;
4670 	}
4671 
4672 	/* make a NULL-terminated copy of the EEPROM strings section of
4673 	   lanai SRAM */
4674 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4675 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4676 				rman_get_bushandle(sc->mem_res),
4677 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4678 				sc->eeprom_strings,
4679 				MXGE_EEPROM_STRINGS_SIZE - 2);
4680 	err = mxge_parse_strings(sc);
4681 	if (err != 0)
4682 		goto abort_with_mem_res;
4683 
4684 	/* Enable write combining for efficient use of PCIe bus */
4685 	mxge_enable_wc(sc);
4686 
4687 	/* Allocate the out of band dma memory */
4688 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4689 			     sizeof (mxge_cmd_t), 64);
4690 	if (err != 0)
4691 		goto abort_with_mem_res;
4692 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4693 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4694 	if (err != 0)
4695 		goto abort_with_cmd_dma;
4696 
4697 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4698 	if (err != 0)
4699 		goto abort_with_zeropad_dma;
4700 
4701 	/* select & load the firmware */
4702 	err = mxge_select_firmware(sc);
4703 	if (err != 0)
4704 		goto abort_with_dmabench;
4705 	sc->intr_coal_delay = mxge_intr_coal_delay;
4706 
4707 	mxge_slice_probe(sc);
4708 	err = mxge_alloc_slices(sc);
4709 	if (err != 0)
4710 		goto abort_with_dmabench;
4711 
4712 	err = mxge_reset(sc, 0);
4713 	if (err != 0)
4714 		goto abort_with_slices;
4715 
4716 	err = mxge_alloc_rings(sc);
4717 	if (err != 0) {
4718 		device_printf(sc->dev, "failed to allocate rings\n");
4719 		goto abort_with_dmabench;
4720 	}
4721 
4722 	err = mxge_add_irq(sc);
4723 	if (err != 0) {
4724 		device_printf(sc->dev, "failed to add irq\n");
4725 		goto abort_with_rings;
4726 	}
4727 
4728 	ifp->if_baudrate = IF_Gbps(10UL);
4729 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4730 		IFCAP_VLAN_MTU;
4731 #ifdef INET
4732 	ifp->if_capabilities |= IFCAP_LRO;
4733 #endif
4734 
4735 #ifdef MXGE_NEW_VLAN_API
4736 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4737 #endif
4738 
4739 	sc->max_mtu = mxge_max_mtu(sc);
4740 	if (sc->max_mtu >= 9000)
4741 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4742 	else
4743 		device_printf(dev, "MTU limited to %d.  Install "
4744 			      "latest firmware for 9000 byte jumbo support\n",
4745 			      sc->max_mtu - ETHER_HDR_LEN);
4746 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4747 	ifp->if_capenable = ifp->if_capabilities;
4748 	if (sc->lro_cnt == 0)
4749 		ifp->if_capenable &= ~IFCAP_LRO;
4750 	sc->csum_flag = 1;
4751 	ifp->if_init = mxge_init;
4752 	ifp->if_softc = sc;
4753 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4754 	ifp->if_ioctl = mxge_ioctl;
4755 	ifp->if_start = mxge_start;
4756 	/* Initialise the ifmedia structure */
4757 	ifmedia_init(&sc->media, 0, mxge_media_change,
4758 		     mxge_media_status);
4759 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4760 	mxge_media_probe(sc);
4761 	sc->dying = 0;
4762 	ether_ifattach(ifp, sc->mac_addr);
4763 	/* ether_ifattach sets mtu to ETHERMTU */
4764 	if (mxge_initial_mtu != ETHERMTU)
4765 		mxge_change_mtu(sc, mxge_initial_mtu);
4766 
4767 	mxge_add_sysctls(sc);
4768 #ifdef IFNET_BUF_RING
4769 	ifp->if_transmit = mxge_transmit;
4770 	ifp->if_qflush = mxge_qflush;
4771 #endif
4772 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4773 	return 0;
4774 
4775 abort_with_rings:
4776 	mxge_free_rings(sc);
4777 abort_with_slices:
4778 	mxge_free_slices(sc);
4779 abort_with_dmabench:
4780 	mxge_dma_free(&sc->dmabench_dma);
4781 abort_with_zeropad_dma:
4782 	mxge_dma_free(&sc->zeropad_dma);
4783 abort_with_cmd_dma:
4784 	mxge_dma_free(&sc->cmd_dma);
4785 abort_with_mem_res:
4786 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4787 abort_with_lock:
4788 	pci_disable_busmaster(dev);
4789 	mtx_destroy(&sc->cmd_mtx);
4790 	mtx_destroy(&sc->driver_mtx);
4791 	if_free(ifp);
4792 abort_with_parent_dmat:
4793 	bus_dma_tag_destroy(sc->parent_dmat);
4794 abort_with_tq:
4795 	if (sc->tq != NULL) {
4796 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4797 		taskqueue_free(sc->tq);
4798 		sc->tq = NULL;
4799 	}
4800 abort_with_nothing:
4801 	return err;
4802 }
4803 
4804 static int
4805 mxge_detach(device_t dev)
4806 {
4807 	mxge_softc_t *sc = device_get_softc(dev);
4808 
4809 	if (mxge_vlans_active(sc)) {
4810 		device_printf(sc->dev,
4811 			      "Detach vlans before removing module\n");
4812 		return EBUSY;
4813 	}
4814 	mtx_lock(&sc->driver_mtx);
4815 	sc->dying = 1;
4816 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4817 		mxge_close(sc, 0);
4818 	mtx_unlock(&sc->driver_mtx);
4819 	ether_ifdetach(sc->ifp);
4820 	if (sc->tq != NULL) {
4821 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4822 		taskqueue_free(sc->tq);
4823 		sc->tq = NULL;
4824 	}
4825 	callout_drain(&sc->co_hdl);
4826 	ifmedia_removeall(&sc->media);
4827 	mxge_dummy_rdma(sc, 0);
4828 	mxge_rem_sysctls(sc);
4829 	mxge_rem_irq(sc);
4830 	mxge_free_rings(sc);
4831 	mxge_free_slices(sc);
4832 	mxge_dma_free(&sc->dmabench_dma);
4833 	mxge_dma_free(&sc->zeropad_dma);
4834 	mxge_dma_free(&sc->cmd_dma);
4835 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4836 	pci_disable_busmaster(dev);
4837 	mtx_destroy(&sc->cmd_mtx);
4838 	mtx_destroy(&sc->driver_mtx);
4839 	if_free(sc->ifp);
4840 	bus_dma_tag_destroy(sc->parent_dmat);
4841 	return 0;
4842 }
4843 
4844 static int
4845 mxge_shutdown(device_t dev)
4846 {
4847 	return 0;
4848 }
4849 
4850 /*
4851   This file uses Myri10GE driver indentation.
4852 
4853   Local Variables:
4854   c-file-style:"linux"
4855   tab-width:8
4856   End:
4857 */
4858