xref: /freebsd/sys/dev/mxge/if_mxge.c (revision eb6d21b4ca6d668cf89afd99eef7baeafa712197)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 
50 /* count xmits ourselves, rather than via drbr */
51 #define NO_SLOW_STATS
52 #include <net/if.h>
53 #include <net/if_arp.h>
54 #include <net/ethernet.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57 
58 #include <net/bpf.h>
59 
60 #include <net/if_types.h>
61 #include <net/if_vlan_var.h>
62 #include <net/zlib.h>
63 
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67 #include <netinet/tcp.h>
68 
69 #include <machine/bus.h>
70 #include <machine/in_cksum.h>
71 #include <machine/resource.h>
72 #include <sys/bus.h>
73 #include <sys/rman.h>
74 #include <sys/smp.h>
75 
76 #include <dev/pci/pcireg.h>
77 #include <dev/pci/pcivar.h>
78 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
79 
80 #include <vm/vm.h>		/* for pmap_mapdev() */
81 #include <vm/pmap.h>
82 
83 #if defined(__i386) || defined(__amd64)
84 #include <machine/specialreg.h>
85 #endif
86 
87 #include <dev/mxge/mxge_mcp.h>
88 #include <dev/mxge/mcp_gen_header.h>
89 /*#define MXGE_FAKE_IFP*/
90 #include <dev/mxge/if_mxge_var.h>
91 #ifdef IFNET_BUF_RING
92 #include <sys/buf_ring.h>
93 #endif
94 
95 #include "opt_inet.h"
96 
97 /* tunable params */
98 static int mxge_nvidia_ecrc_enable = 1;
99 static int mxge_force_firmware = 0;
100 static int mxge_intr_coal_delay = 30;
101 static int mxge_deassert_wait = 1;
102 static int mxge_flow_control = 1;
103 static int mxge_verbose = 0;
104 static int mxge_lro_cnt = 8;
105 static int mxge_ticks;
106 static int mxge_max_slices = 1;
107 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
108 static int mxge_always_promisc = 0;
109 static int mxge_initial_mtu = ETHERMTU_JUMBO;
110 static int mxge_throttle = 0;
111 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
112 static char *mxge_fw_aligned = "mxge_eth_z8e";
113 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
114 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
115 
116 static int mxge_probe(device_t dev);
117 static int mxge_attach(device_t dev);
118 static int mxge_detach(device_t dev);
119 static int mxge_shutdown(device_t dev);
120 static void mxge_intr(void *arg);
121 
122 static device_method_t mxge_methods[] =
123 {
124   /* Device interface */
125   DEVMETHOD(device_probe, mxge_probe),
126   DEVMETHOD(device_attach, mxge_attach),
127   DEVMETHOD(device_detach, mxge_detach),
128   DEVMETHOD(device_shutdown, mxge_shutdown),
129   {0, 0}
130 };
131 
132 static driver_t mxge_driver =
133 {
134   "mxge",
135   mxge_methods,
136   sizeof(mxge_softc_t),
137 };
138 
139 static devclass_t mxge_devclass;
140 
141 /* Declare ourselves to be a child of the PCI bus. */
142 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
143 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
144 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
145 
146 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
147 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
148 static int mxge_close(mxge_softc_t *sc, int down);
149 static int mxge_open(mxge_softc_t *sc);
150 static void mxge_tick(void *arg);
151 
152 static int
153 mxge_probe(device_t dev)
154 {
155 	int rev;
156 
157 
158 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
159 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
160 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
161 		rev = pci_get_revid(dev);
162 		switch (rev) {
163 		case MXGE_PCI_REV_Z8E:
164 			device_set_desc(dev, "Myri10G-PCIE-8A");
165 			break;
166 		case MXGE_PCI_REV_Z8ES:
167 			device_set_desc(dev, "Myri10G-PCIE-8B");
168 			break;
169 		default:
170 			device_set_desc(dev, "Myri10G-PCIE-8??");
171 			device_printf(dev, "Unrecognized rev %d NIC\n",
172 				      rev);
173 			break;
174 		}
175 		return 0;
176 	}
177 	return ENXIO;
178 }
179 
180 static void
181 mxge_enable_wc(mxge_softc_t *sc)
182 {
183 #if defined(__i386) || defined(__amd64)
184 	vm_offset_t len;
185 	int err;
186 
187 	sc->wc = 1;
188 	len = rman_get_size(sc->mem_res);
189 	err = pmap_change_attr((vm_offset_t) sc->sram,
190 			       len, PAT_WRITE_COMBINING);
191 	if (err != 0) {
192 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 			      err);
194 		sc->wc = 0;
195 	}
196 #endif
197 }
198 
199 
200 /* callback to get our DMA address */
201 static void
202 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
203 			 int error)
204 {
205 	if (error == 0) {
206 		*(bus_addr_t *) arg = segs->ds_addr;
207 	}
208 }
209 
210 static int
211 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
212 		   bus_size_t alignment)
213 {
214 	int err;
215 	device_t dev = sc->dev;
216 	bus_size_t boundary, maxsegsize;
217 
218 	if (bytes > 4096 && alignment == 4096) {
219 		boundary = 0;
220 		maxsegsize = bytes;
221 	} else {
222 		boundary = 4096;
223 		maxsegsize = 4096;
224 	}
225 
226 	/* allocate DMAable memory tags */
227 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
228 				 alignment,		/* alignment */
229 				 boundary,		/* boundary */
230 				 BUS_SPACE_MAXADDR,	/* low */
231 				 BUS_SPACE_MAXADDR,	/* high */
232 				 NULL, NULL,		/* filter */
233 				 bytes,			/* maxsize */
234 				 1,			/* num segs */
235 				 maxsegsize,		/* maxsegsize */
236 				 BUS_DMA_COHERENT,	/* flags */
237 				 NULL, NULL,		/* lock */
238 				 &dma->dmat);		/* tag */
239 	if (err != 0) {
240 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
241 		return err;
242 	}
243 
244 	/* allocate DMAable memory & map */
245 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
246 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
247 				| BUS_DMA_ZERO),  &dma->map);
248 	if (err != 0) {
249 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
250 		goto abort_with_dmat;
251 	}
252 
253 	/* load the memory */
254 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
255 			      mxge_dmamap_callback,
256 			      (void *)&dma->bus_addr, 0);
257 	if (err != 0) {
258 		device_printf(dev, "couldn't load map (err = %d)\n", err);
259 		goto abort_with_mem;
260 	}
261 	return 0;
262 
263 abort_with_mem:
264 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
265 abort_with_dmat:
266 	(void)bus_dma_tag_destroy(dma->dmat);
267 	return err;
268 }
269 
270 
271 static void
272 mxge_dma_free(mxge_dma_t *dma)
273 {
274 	bus_dmamap_unload(dma->dmat, dma->map);
275 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
276 	(void)bus_dma_tag_destroy(dma->dmat);
277 }
278 
279 /*
280  * The eeprom strings on the lanaiX have the format
281  * SN=x\0
282  * MAC=x:x:x:x:x:x\0
283  * PC=text\0
284  */
285 
286 static int
287 mxge_parse_strings(mxge_softc_t *sc)
288 {
289 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)
290 
291 	char *ptr, *limit;
292 	int i, found_mac;
293 
294 	ptr = sc->eeprom_strings;
295 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
296 	found_mac = 0;
297 	while (ptr < limit && *ptr != '\0') {
298 		if (memcmp(ptr, "MAC=", 4) == 0) {
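			/* Skip just one byte, not strlen("MAC="):
			   the loop below advances in 3-byte "xx:"
			   strides, so starting from the 'A' lands
			   ptr on each hex pair in turn */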
299 			ptr += 1;
300 			sc->mac_addr_string = ptr;
301 			for (i = 0; i < 6; i++) {
302 				ptr += 3;
303 				if ((ptr + 2) > limit)
304 					goto abort;
305 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
306 				found_mac = 1;
307 			}
308 		} else if (memcmp(ptr, "PC=", 3) == 0) {
309 			ptr += 3;
310 			strncpy(sc->product_code_string, ptr,
311 				sizeof (sc->product_code_string) - 1);
312 		} else if (memcmp(ptr, "SN=", 3) == 0) {
313 			ptr += 3;
314 			strncpy(sc->serial_number_string, ptr,
315 				sizeof (sc->serial_number_string) - 1);
316 		}
317 		MXGE_NEXT_STRING(ptr);
318 	}
319 
320 	if (found_mac)
321 		return 0;
322 
323  abort:
324 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
325 
326 	return ENXIO;
327 }
328 
329 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
330 static void
331 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
332 {
333 	uint32_t val;
334 	unsigned long base, off;
335 	char *va, *cfgptr;
336 	device_t pdev, mcp55;
337 	uint16_t vendor_id, device_id, word;
338 	uintptr_t bus, slot, func, ivend, idev;
339 	uint32_t *ptr32;
340 
341 
342 	if (!mxge_nvidia_ecrc_enable)
343 		return;
344 
345 	pdev = device_get_parent(device_get_parent(sc->dev));
346 	if (pdev == NULL) {
347 		device_printf(sc->dev, "could not find parent?\n");
348 		return;
349 	}
350 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
351 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
352 
353 	if (vendor_id != 0x10de)
354 		return;
355 
356 	base = 0;
357 
358 	if (device_id == 0x005d) {
359 		/* ck804, base address is magic */
360 		base = 0xe0000000UL;
361 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
362 		/* mcp55, base address stored in chipset */
363 		mcp55 = pci_find_bsf(0, 0, 0);
364 		if (mcp55 &&
365 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
366 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
367 			word = pci_read_config(mcp55, 0x90, 2);
368 			base = ((unsigned long)word & 0x7ffeU) << 25;
369 		}
370 	}
371 	if (!base)
372 		return;
373 
374 	/* XXX
375 	   The test below is commented out because it is believed that
376 	   doing a config read/write beyond 0xff will access the config
377 	   space of the next larger function.  Uncomment this and remove
378 	   the hacky pmap_mapdev() way of accessing config space when
379 	   FreeBSD grows support for extended PCIe config space access.
380 	*/
381 #if 0
382 	/* See if we can, by some miracle, access the extended
383 	   config space */
384 	val = pci_read_config(pdev, 0x178, 4);
385 	if (val != 0xffffffff) {
386 		val |= 0x40;
387 		pci_write_config(pdev, 0x178, val, 4);
388 		return;
389 	}
390 #endif
391 	/* Rather than using normal pci config space writes, we must
392 	 * map the Nvidia config space ourselves.  This is because on
393 	 * opteron/nvidia class machines the 0xe0000000 mapping is
394 	 * handled by the nvidia chipset, which means the internal PCI
395 	 * device (the on-chip northbridge), or the amd-8131 bridge
396 	 * and things behind them, are not visible via this method.
397 	 */
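	/*
	 * The offset computed below follows the standard ECAM-style
	 * layout of memory-mapped config space: 1MB per bus and 4KB
	 * per function, with the 8 functions of a slot contiguous:
	 *
	 *	off = base + bus * 0x100000 + (slot * 8 + func) * 0x1000
	 */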
398 
399 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 		      PCI_IVAR_BUS, &bus);
401 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 		      PCI_IVAR_SLOT, &slot);
403 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 		      PCI_IVAR_FUNCTION, &func);
405 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
406 		      PCI_IVAR_VENDOR, &ivend);
407 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
408 		      PCI_IVAR_DEVICE, &idev);
409 
410 	off =  base
411 		+ 0x00100000UL * (unsigned long)bus
412 		+ 0x00001000UL * (unsigned long)(func
413 						 + 8 * slot);
414 
415 	/* map it into the kernel */
416 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
417 
418 
419 	if (va == NULL) {
420 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
421 		return;
422 	}
423 	/* get a pointer to the config space mapped into the kernel */
424 	cfgptr = va + (off & PAGE_MASK);
425 
426 	/* make sure that we can really access it */
427 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
428 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
429 	if (! (vendor_id == ivend && device_id == idev)) {
430 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
431 			      vendor_id, device_id);
432 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
433 		return;
434 	}
435 
436 	ptr32 = (uint32_t*)(cfgptr + 0x178);
437 	val = *ptr32;
438 
439 	if (val == 0xffffffff) {
440 		device_printf(sc->dev, "extended mapping failed\n");
441 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
442 		return;
443 	}
444 	*ptr32 = val | 0x40;
445 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
446 	if (mxge_verbose)
447 		device_printf(sc->dev,
448 			      "Enabled ECRC on upstream Nvidia bridge "
449 			      "at %d:%d:%d\n",
450 			      (int)bus, (int)slot, (int)func);
451 	return;
452 }
453 #else
454 static void
455 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
456 {
457 	device_printf(sc->dev,
458 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
459 	return;
460 }
461 #endif
462 
463 
464 static int
465 mxge_dma_test(mxge_softc_t *sc, int test_type)
466 {
467 	mxge_cmd_t cmd;
468 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
469 	int status;
470 	uint32_t len;
471 	char *test = " ";
472 
473 
474 	/* Run a small DMA test.
475 	 * The magic multipliers to the length tell the firmware
476 	 * to do DMA read, write, or read+write tests.  The
477 	 * results are returned in cmd.data0.  The upper 16
478 	 * bits of the return is the number of transfers completed.
479 	 * The lower 16 bits is the time in 0.5us ticks that the
480 	 * transfers took to complete.
481 	 */
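	/*
	 * The bandwidth math below follows from that encoding:
	 * (data0 >> 16) transfers of len bytes each, finishing in
	 * (data0 & 0xffff) half-microsecond ticks, give
	 *
	 *	MB/s = bytes/usec = (transfers * len * 2) / ticks
	 *
	 * where the factor of 2 converts 0.5us ticks to usecs.  As a
	 * made-up example, 2000 transfers of 4096 bytes in 8192 ticks
	 * is 2000 * 4096 * 2 / 8192 = 2000 MB/s.  The read/write test
	 * moves len bytes each way per transfer, hence its extra
	 * factor of 2.
	 */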
482 
483 	len = sc->tx_boundary;
484 
485 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
486 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
487 	cmd.data2 = len * 0x10000;
488 	status = mxge_send_cmd(sc, test_type, &cmd);
489 	if (status != 0) {
490 		test = "read";
491 		goto abort;
492 	}
493 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
494 		(cmd.data0 & 0xffff);
495 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
496 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
497 	cmd.data2 = len * 0x1;
498 	status = mxge_send_cmd(sc, test_type, &cmd);
499 	if (status != 0) {
500 		test = "write";
501 		goto abort;
502 	}
503 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
504 		(cmd.data0 & 0xffff);
505 
506 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
507 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
508 	cmd.data2 = len * 0x10001;
509 	status = mxge_send_cmd(sc, test_type, &cmd);
510 	if (status != 0) {
511 		test = "read/write";
512 		goto abort;
513 	}
514 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
515 		(cmd.data0 & 0xffff);
516 
517 abort:
518 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
519 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
520 			      test, status);
521 
522 	return status;
523 }
524 
525 /*
526  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
527  * when the PCI-E Completion packets are aligned on an 8-byte
528  * boundary.  Some PCI-E chip sets always align Completion packets; on
529  * the ones that do not, the alignment can be enforced by enabling
530  * ECRC generation (if supported).
531  *
532  * When PCI-E Completion packets are not aligned, it is actually more
533  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
534  *
535  * If the driver can neither enable ECRC nor verify that it has
536  * already been enabled, then it must use a firmware image which works
537  * around unaligned completion packets (ethp_z8e.dat), and it should
538  * also ensure that it never gives the device a Read-DMA which is
539  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
540  * enabled, then the driver should use the aligned (eth_z8e.dat)
541  * firmware image, and set tx_boundary to 4KB.
542  */
543 
544 static int
545 mxge_firmware_probe(mxge_softc_t *sc)
546 {
547 	device_t dev = sc->dev;
548 	int reg, status;
549 	uint16_t pectl;
550 
551 	sc->tx_boundary = 4096;
552 	/*
553 	 * Verify the max read request size was set to 4KB
554 	 * before trying the test with 4KB.
555 	 */
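		/*
		 * Bits 14:12 of the PCIe Device Control register
		 * (offset 8 into the capability) encode the max read
		 * request size as 128 << n; the value 5 tested below
		 * corresponds to 4096 bytes.
		 */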
556 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
557 		pectl = pci_read_config(dev, reg + 0x8, 2);
558 		if ((pectl & (5 << 12)) != (5 << 12)) {
559 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
560 				      pectl);
561 			sc->tx_boundary = 2048;
562 		}
563 	}
564 
565 	/*
566 	 * load the optimized firmware (which assumes aligned PCIe
567 	 * completions) in order to see if it works on this host.
568 	 */
569 	sc->fw_name = mxge_fw_aligned;
570 	status = mxge_load_firmware(sc, 1);
571 	if (status != 0) {
572 		return status;
573 	}
574 
575 	/*
576 	 * Enable ECRC if possible
577 	 */
578 	mxge_enable_nvidia_ecrc(sc);
579 
580 	/*
581 	 * Run a DMA test which watches for unaligned completions and
582 	 * aborts on the first one seen.
583 	 */
584 
585 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
586 	if (status == 0)
587 		return 0; /* keep the aligned firmware */
588 
589 	if (status != E2BIG)
590 		device_printf(dev, "DMA test failed: %d\n", status);
591 	if (status == ENOSYS)
592 		device_printf(dev, "Falling back to ethp! "
593 			      "Please install up to date fw\n");
594 	return status;
595 }
596 
597 static int
598 mxge_select_firmware(mxge_softc_t *sc)
599 {
600 	int aligned = 0;
601 	int force_firmware = mxge_force_firmware;
602 
603 	if (sc->throttle)
604 		force_firmware = sc->throttle;
605 
606 	if (force_firmware != 0) {
607 		if (force_firmware == 1)
608 			aligned = 1;
609 		else
610 			aligned = 0;
611 		if (mxge_verbose)
612 			device_printf(sc->dev,
613 				      "Assuming %s completions (forced)\n",
614 				      aligned ? "aligned" : "unaligned");
615 		goto abort;
616 	}
617 
618 	/* if the PCIe link width is 4 or less, we can use the aligned
619 	   firmware and skip any checks */
620 	if (sc->link_width != 0 && sc->link_width <= 4) {
621 		device_printf(sc->dev,
622 			      "PCIe x%d Link, expect reduced performance\n",
623 			      sc->link_width);
624 		aligned = 1;
625 		goto abort;
626 	}
627 
628 	if (0 == mxge_firmware_probe(sc))
629 		return 0;
630 
631 abort:
632 	if (aligned) {
633 		sc->fw_name = mxge_fw_aligned;
634 		sc->tx_boundary = 4096;
635 	} else {
636 		sc->fw_name = mxge_fw_unaligned;
637 		sc->tx_boundary = 2048;
638 	}
639 	return (mxge_load_firmware(sc, 0));
640 }
641 
642 union qualhack
643 {
644         const char *ro_char;
645         char *rw_char;
646 };
647 
648 static int
649 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
650 {
651 
652 
653 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
654 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
655 			      be32toh(hdr->mcp_type));
656 		return EIO;
657 	}
658 
659 	/* save firmware version for sysctl */
660 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
661 	if (mxge_verbose)
662 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
663 
664 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
665 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
666 
667 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
668 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
669 		device_printf(sc->dev, "Found firmware version %s\n",
670 			      sc->fw_version);
671 		device_printf(sc->dev, "Driver needs %d.%d\n",
672 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
673 		return EINVAL;
674 	}
675 	return 0;
676 
677 }
678 
679 static void *
680 z_alloc(void *nil, u_int items, u_int size)
681 {
682         void *ptr;
683 
684         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
685         return ptr;
686 }
687 
688 static void
689 z_free(void *nil, void *ptr)
690 {
691         free(ptr, M_TEMP);
692 }
693 
694 
695 static int
696 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
697 {
698 	z_stream zs;
699 	char *inflate_buffer;
700 	const struct firmware *fw;
701 	const mcp_gen_header_t *hdr;
702 	unsigned hdr_offset;
703 	int status;
704 	unsigned int i;
705 	char dummy;
706 	size_t fw_len;
707 
708 	fw = firmware_get(sc->fw_name);
709 	if (fw == NULL) {
710 		device_printf(sc->dev, "Could not find firmware image %s\n",
711 			      sc->fw_name);
712 		return ENOENT;
713 	}
714 
715 
716 
717 	/* setup zlib and decompress f/w */
718 	bzero(&zs, sizeof (zs));
719 	zs.zalloc = z_alloc;
720 	zs.zfree = z_free;
721 	status = inflateInit(&zs);
722 	if (status != Z_OK) {
723 		status = EIO;
724 		goto abort_with_fw;
725 	}
726 
727 	/* the uncompressed size is stored as the firmware version,
728 	   which would otherwise go unused */
729 	fw_len = (size_t) fw->version;
730 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
731 	if (inflate_buffer == NULL) {
		status = ENOMEM;
732 		goto abort_with_zs;
	}
733 	zs.avail_in = fw->datasize;
734 	zs.next_in = __DECONST(char *, fw->data);
735 	zs.avail_out = fw_len;
736 	zs.next_out = inflate_buffer;
737 	status = inflate(&zs, Z_FINISH);
738 	if (status != Z_STREAM_END) {
739 		device_printf(sc->dev, "zlib %d\n", status);
740 		status = EIO;
741 		goto abort_with_buffer;
742 	}
743 
744 	/* check id */
745 	hdr_offset = be32toh(*(const uint32_t *)
746 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
747 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
748 		device_printf(sc->dev, "Bad firmware file");
749 		status = EIO;
750 		goto abort_with_buffer;
751 	}
752 	hdr = (const void*)(inflate_buffer + hdr_offset);
753 
754 	status = mxge_validate_firmware(sc, hdr);
755 	if (status != 0)
756 		goto abort_with_buffer;
757 
758 	/* Copy the inflated firmware to NIC SRAM. */
759 	for (i = 0; i < fw_len; i += 256) {
760 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
761 			      inflate_buffer + i,
762 			      min(256U, (unsigned)(fw_len - i)));
763 		wmb();
764 		dummy = *sc->sram;
765 		wmb();
766 	}
767 
768 	*limit = fw_len;
769 	status = 0;
770 abort_with_buffer:
771 	free(inflate_buffer, M_TEMP);
772 abort_with_zs:
773 	inflateEnd(&zs);
774 abort_with_fw:
775 	firmware_put(fw, FIRMWARE_UNLOAD);
776 	return status;
777 }
778 
779 /*
780  * Enable or disable periodic RDMAs from the host to make certain
781  * chipsets resend dropped PCIe messages
782  */
783 
784 static void
785 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
786 {
787 	char buf_bytes[72];
788 	volatile uint32_t *confirm;
789 	volatile char *submit;
790 	uint32_t *buf, dma_low, dma_high;
791 	int i;
792 
793 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
794 
795 	/* clear confirmation addr */
796 	confirm = (volatile uint32_t *)sc->cmd;
797 	*confirm = 0;
798 	wmb();
799 
800 	/* send an rdma command to the PCIe engine, and wait for the
801 	   response in the confirmation address.  The firmware should
802 	   write a -1 there to indicate it is alive and well
803 	*/
804 
805 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
806 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
807 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
808 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
809 	buf[2] = htobe32(0xffffffff);		/* confirm data */
810 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
811 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
812 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
813 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
814 	buf[5] = htobe32(enable);			/* enable? */
815 
816 
817 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
818 
819 	mxge_pio_copy(submit, buf, 64);
820 	wmb();
821 	DELAY(1000);
822 	wmb();
823 	i = 0;
824 	while (*confirm != 0xffffffff && i < 20) {
825 		DELAY(1000);
826 		i++;
827 	}
828 	if (*confirm != 0xffffffff) {
829 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
830 			      (enable ? "enable" : "disable"), confirm,
831 			      *confirm);
832 	}
833 	return;
834 }
835 
836 static int
837 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
838 {
839 	mcp_cmd_t *buf;
840 	char buf_bytes[sizeof(*buf) + 8];
841 	volatile mcp_cmd_response_t *response = sc->cmd;
842 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
843 	uint32_t dma_low, dma_high;
844 	int err, sleep_total = 0;
845 
846 	/* ensure buf is aligned to 8 bytes */
847 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
848 
849 	buf->data0 = htobe32(data->data0);
850 	buf->data1 = htobe32(data->data1);
851 	buf->data2 = htobe32(data->data2);
852 	buf->cmd = htobe32(cmd);
853 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
854 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
855 
856 	buf->response_addr.low = htobe32(dma_low);
857 	buf->response_addr.high = htobe32(dma_high);
858 	mtx_lock(&sc->cmd_mtx);
859 	response->result = 0xffffffff;
860 	wmb();
861 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
862 
863 	/* wait up to 20ms */
864 	err = EAGAIN;
865 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
866 		bus_dmamap_sync(sc->cmd_dma.dmat,
867 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
868 		wmb();
869 		switch (be32toh(response->result)) {
870 		case 0:
871 			data->data0 = be32toh(response->data);
872 			err = 0;
873 			break;
874 		case 0xffffffff:
875 			DELAY(1000);
876 			break;
877 		case MXGEFW_CMD_UNKNOWN:
878 			err = ENOSYS;
879 			break;
880 		case MXGEFW_CMD_ERROR_UNALIGNED:
881 			err = E2BIG;
882 			break;
883 		case MXGEFW_CMD_ERROR_BUSY:
884 			err = EBUSY;
885 			break;
886 		default:
887 			device_printf(sc->dev,
888 				      "mxge: command %d "
889 				      "failed, result = %d\n",
890 				      cmd, be32toh(response->result));
891 			err = ENXIO;
892 			break;
893 		}
894 		if (err != EAGAIN)
895 			break;
896 	}
897 	if (err == EAGAIN)
898 		device_printf(sc->dev, "mxge: command %d timed out"
899 			      "result = %d\n",
900 			      cmd, be32toh(response->result));
901 	mtx_unlock(&sc->cmd_mtx);
902 	return err;
903 }
904 
905 static int
906 mxge_adopt_running_firmware(mxge_softc_t *sc)
907 {
908 	struct mcp_gen_header *hdr;
909 	const size_t bytes = sizeof (struct mcp_gen_header);
910 	size_t hdr_offset;
911 	int status;
912 
913 	/* find running firmware header */
914 	hdr_offset = be32toh(*(volatile uint32_t *)
915 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
916 
917 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
918 		device_printf(sc->dev,
919 			      "Running firmware has bad header offset (%d)\n",
920 			      (int)hdr_offset);
921 		return EIO;
922 	}
923 
924 	/* copy header of running firmware from SRAM to host memory to
925 	 * validate firmware */
926 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
927 	if (hdr == NULL) {
928 		device_printf(sc->dev, "could not malloc firmware hdr\n");
929 		return ENOMEM;
930 	}
931 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
932 				rman_get_bushandle(sc->mem_res),
933 				hdr_offset, (char *)hdr, bytes);
934 	status = mxge_validate_firmware(sc, hdr);
935 	free(hdr, M_DEVBUF);
936 
937 	/*
938 	 * check to see if adopted firmware has bug where adopting
939 	 * it will cause broadcasts to be filtered unless the NIC
940 	 * is kept in ALLMULTI mode
941 	 */
942 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
943 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
944 		sc->adopted_rx_filter_bug = 1;
945 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
946 			      "working around rx filter bug\n",
947 			      sc->fw_ver_major, sc->fw_ver_minor,
948 			      sc->fw_ver_tiny);
949 	}
950 
951 	return status;
952 }
953 
954 
955 static int
956 mxge_load_firmware(mxge_softc_t *sc, int adopt)
957 {
958 	volatile uint32_t *confirm;
959 	volatile char *submit;
960 	char buf_bytes[72];
961 	uint32_t *buf, size, dma_low, dma_high;
962 	int status, i;
963 
964 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
965 
966 	size = sc->sram_size;
967 	status = mxge_load_firmware_helper(sc, &size);
968 	if (status) {
969 		if (!adopt)
970 			return status;
971 		/* Try to use the currently running firmware, if
972 		   it is new enough */
973 		status = mxge_adopt_running_firmware(sc);
974 		if (status) {
975 			device_printf(sc->dev,
976 				      "failed to adopt running firmware\n");
977 			return status;
978 		}
979 		device_printf(sc->dev,
980 			      "Successfully adopted running firmware\n");
981 		if (sc->tx_boundary == 4096) {
982 			device_printf(sc->dev,
983 				"Using firmware currently running on NIC"
984 				 ".  For optimal\n");
985 			device_printf(sc->dev,
986 				 "performance consider loading optimized "
987 				 "firmware\n");
988 		}
989 		sc->fw_name = mxge_fw_unaligned;
990 		sc->tx_boundary = 2048;
991 		return 0;
992 	}
993 	/* clear confirmation addr */
994 	confirm = (volatile uint32_t *)sc->cmd;
995 	*confirm = 0;
996 	wmb();
997 	/* send a reload command to the bootstrap MCP, and wait for the
998 	   response in the confirmation address.  The firmware should
999 	   write a -1 there to indicate it is alive and well
1000 	*/
1001 
1002 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1003 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1004 
1005 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1006 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1007 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1008 
1009 	/* FIX: All newest firmware should un-protect the bottom of
1010 	   the sram before handoff. However, the very first interfaces
1011 	   do not. Therefore the handoff copy must skip the first 8 bytes
1012 	*/
1013 					/* where the code starts*/
1014 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1015 	buf[4] = htobe32(size - 8); 	/* length of code */
1016 	buf[5] = htobe32(8);		/* where to copy to */
1017 	buf[6] = htobe32(0);		/* where to jump to */
1018 
1019 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1020 	mxge_pio_copy(submit, buf, 64);
1021 	wmb();
1022 	DELAY(1000);
1023 	wmb();
1024 	i = 0;
1025 	while (*confirm != 0xffffffff && i < 20) {
1026 		DELAY(1000*10);
1027 		i++;
1028 		bus_dmamap_sync(sc->cmd_dma.dmat,
1029 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1030 	}
1031 	if (*confirm != 0xffffffff) {
1032 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1033 			confirm, *confirm);
1034 
1035 		return ENXIO;
1036 	}
1037 	return 0;
1038 }
1039 
1040 static int
1041 mxge_update_mac_address(mxge_softc_t *sc)
1042 {
1043 	mxge_cmd_t cmd;
1044 	uint8_t *addr = sc->mac_addr;
1045 	int status;
1046 
1047 
1048 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1049 		     | (addr[2] << 8) | addr[3]);
1050 
1051 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1052 
1053 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1054 	return status;
1055 }
1056 
1057 static int
1058 mxge_change_pause(mxge_softc_t *sc, int pause)
1059 {
1060 	mxge_cmd_t cmd;
1061 	int status;
1062 
1063 	if (pause)
1064 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1065 				       &cmd);
1066 	else
1067 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1068 				       &cmd);
1069 
1070 	if (status) {
1071 		device_printf(sc->dev, "Failed to set flow control mode\n");
1072 		return ENXIO;
1073 	}
1074 	sc->pause = pause;
1075 	return 0;
1076 }
1077 
1078 static void
1079 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1080 {
1081 	mxge_cmd_t cmd;
1082 	int status;
1083 
1084 	if (mxge_always_promisc)
1085 		promisc = 1;
1086 
1087 	if (promisc)
1088 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1089 				       &cmd);
1090 	else
1091 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1092 				       &cmd);
1093 
1094 	if (status) {
1095 		device_printf(sc->dev, "Failed to set promisc mode\n");
1096 	}
1097 }
1098 
1099 static void
1100 mxge_set_multicast_list(mxge_softc_t *sc)
1101 {
1102 	mxge_cmd_t cmd;
1103 	struct ifmultiaddr *ifma;
1104 	struct ifnet *ifp = sc->ifp;
1105 	int err;
1106 
1107 	/* This firmware is known to not support multicast */
1108 	if (!sc->fw_multicast_support)
1109 		return;
1110 
1111 	/* Disable multicast filtering while we play with the lists*/
1112 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1113 	if (err != 0) {
1114 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1115 		       " error status: %d\n", err);
1116 		return;
1117 	}
1118 
1119 	if (sc->adopted_rx_filter_bug)
1120 		return;
1121 
1122 	if (ifp->if_flags & IFF_ALLMULTI)
1123 		/* request to disable multicast filtering, so quit here */
1124 		return;
1125 
1126 	/* Flush all the filters */
1127 
1128 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1129 	if (err != 0) {
1130 		device_printf(sc->dev,
1131 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1132 			      ", error status: %d\n", err);
1133 		return;
1134 	}
1135 
1136 	/* Walk the multicast list, and add each address */
1137 
1138 	if_maddr_rlock(ifp);
1139 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1140 		if (ifma->ifma_addr->sa_family != AF_LINK)
1141 			continue;
1142 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1143 		      &cmd.data0, 4);
1144 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1145 		      &cmd.data1, 2);
1146 		cmd.data0 = htonl(cmd.data0);
1147 		cmd.data1 = htonl(cmd.data1);
1148 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1149 		if (err != 0) {
1150 			device_printf(sc->dev, "Failed "
1151 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1152 			       "%d\t", err);
1153 			/* abort, leaving multicast filtering off */
1154 			if_maddr_runlock(ifp);
1155 			return;
1156 		}
1157 	}
1158 	if_maddr_runlock(ifp);
1159 	/* Enable multicast filtering */
1160 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1161 	if (err != 0) {
1162 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1163 		       ", error status: %d\n", err);
1164 	}
1165 }
1166 
1167 static int
1168 mxge_max_mtu(mxge_softc_t *sc)
1169 {
1170 	mxge_cmd_t cmd;
1171 	int status;
1172 
1173 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1174 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1175 
1176 	/* try to set nbufs to see if we can
1177 	   use virtually contiguous jumbos */
1178 	cmd.data0 = 0;
1179 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1180 			       &cmd);
1181 	if (status == 0)
1182 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1183 
1184 	/* otherwise, we're limited to MJUMPAGESIZE */
1185 	return MJUMPAGESIZE - MXGEFW_PAD;
1186 }
1187 
1188 static int
1189 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1190 {
1191 	struct mxge_slice_state *ss;
1192 	mxge_rx_done_t *rx_done;
1193 	volatile uint32_t *irq_claim;
1194 	mxge_cmd_t cmd;
1195 	int slice, status;
1196 
1197 	/* try to send a reset command to the card to see if it
1198 	   is alive */
1199 	memset(&cmd, 0, sizeof (cmd));
1200 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1201 	if (status != 0) {
1202 		device_printf(sc->dev, "failed reset\n");
1203 		return ENXIO;
1204 	}
1205 
1206 	mxge_dummy_rdma(sc, 1);
1207 
1208 
1209 	/* set the intrq size */
1210 	cmd.data0 = sc->rx_ring_size;
1211 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1212 
1213 	/*
1214 	 * Even though we already know how many slices are supported
1215 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1216 	 * has magic side effects, and must be called after a reset.
1217 	 * It must be called prior to calling any RSS related cmds,
1218 	 * including assigning an interrupt queue for anything but
1219 	 * slice 0.  It must also be called *after*
1220 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1221 	 * the firmware to compute offsets.
1222 	 */
1223 
1224 	if (sc->num_slices > 1) {
1225 		/* ask the maximum number of slices it supports */
1226 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1227 					   &cmd);
1228 		if (status != 0) {
1229 			device_printf(sc->dev,
1230 				      "failed to get number of slices\n");
1231 			return status;
1232 		}
1233 		/*
1234 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1235 		 * to setting up the interrupt queue DMA
1236 		 */
1237 		cmd.data0 = sc->num_slices;
1238 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1239 #ifdef IFNET_BUF_RING
1240 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1241 #endif
1242 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1243 					   &cmd);
1244 		if (status != 0) {
1245 			device_printf(sc->dev,
1246 				      "failed to set number of slices\n");
1247 			return status;
1248 		}
1249 	}
1250 
1251 
1252 	if (interrupts_setup) {
1253 		/* Now exchange information about interrupts  */
1254 		for (slice = 0; slice < sc->num_slices; slice++) {
1255 			rx_done = &sc->ss[slice].rx_done;
1256 			memset(rx_done->entry, 0, sc->rx_ring_size);
1257 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1258 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1259 			cmd.data2 = slice;
1260 			status |= mxge_send_cmd(sc,
1261 						MXGEFW_CMD_SET_INTRQ_DMA,
1262 						&cmd);
1263 		}
1264 	}
1265 
1266 	status |= mxge_send_cmd(sc,
1267 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1268 
1269 
1270 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1271 
1272 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1273 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1274 
1275 
1276 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1277 				&cmd);
1278 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1279 	if (status != 0) {
1280 		device_printf(sc->dev, "failed set interrupt parameters\n");
1281 		return status;
1282 	}
1283 
1284 
1285 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1286 
1287 
1288 	/* run a DMA benchmark */
1289 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1290 
1291 	for (slice = 0; slice < sc->num_slices; slice++) {
1292 		ss = &sc->ss[slice];
1293 
1294 		ss->irq_claim = irq_claim + (2 * slice);
1295 		/* reset mcp/driver shared state back to 0 */
1296 		ss->rx_done.idx = 0;
1297 		ss->rx_done.cnt = 0;
1298 		ss->tx.req = 0;
1299 		ss->tx.done = 0;
1300 		ss->tx.pkt_done = 0;
1301 		ss->tx.queue_active = 0;
1302 		ss->tx.activate = 0;
1303 		ss->tx.deactivate = 0;
1304 		ss->tx.wake = 0;
1305 		ss->tx.defrag = 0;
1306 		ss->tx.stall = 0;
1307 		ss->rx_big.cnt = 0;
1308 		ss->rx_small.cnt = 0;
1309 		ss->lro_bad_csum = 0;
1310 		ss->lro_queued = 0;
1311 		ss->lro_flushed = 0;
1312 		if (ss->fw_stats != NULL) {
1313 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1314 		}
1315 	}
1316 	sc->rdma_tags_available = 15;
1317 	status = mxge_update_mac_address(sc);
1318 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1319 	mxge_change_pause(sc, sc->pause);
1320 	mxge_set_multicast_list(sc);
1321 	if (sc->throttle) {
1322 		cmd.data0 = sc->throttle;
1323 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1324 				  &cmd)) {
1325 			device_printf(sc->dev,
1326 				      "can't enable throttle\n");
1327 		}
1328 	}
1329 	return status;
1330 }
1331 
1332 static int
1333 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1334 {
1335 	mxge_cmd_t cmd;
1336 	mxge_softc_t *sc;
1337 	int err;
1338 	unsigned int throttle;
1339 
1340 	sc = arg1;
1341 	throttle = sc->throttle;
1342 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1343         if (err != 0) {
1344                 return err;
1345         }
1346 
1347 	if (throttle == sc->throttle)
1348 		return 0;
1349 
1350         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1351                 return EINVAL;
1352 
1353 	mtx_lock(&sc->driver_mtx);
1354 	cmd.data0 = throttle;
1355 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1356 	if (err == 0)
1357 		sc->throttle = throttle;
1358 	mtx_unlock(&sc->driver_mtx);
1359 	return err;
1360 }
1361 
1362 static int
1363 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1364 {
1365         mxge_softc_t *sc;
1366         unsigned int intr_coal_delay;
1367         int err;
1368 
1369         sc = arg1;
1370         intr_coal_delay = sc->intr_coal_delay;
1371         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1372         if (err != 0) {
1373                 return err;
1374         }
1375         if (intr_coal_delay == sc->intr_coal_delay)
1376                 return 0;
1377 
1378         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1379                 return EINVAL;
1380 
1381 	mtx_lock(&sc->driver_mtx);
1382 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1383 	sc->intr_coal_delay = intr_coal_delay;
1384 
1385 	mtx_unlock(&sc->driver_mtx);
1386         return err;
1387 }
1388 
1389 static int
1390 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1391 {
1392         mxge_softc_t *sc;
1393         unsigned int enabled;
1394         int err;
1395 
1396         sc = arg1;
1397         enabled = sc->pause;
1398         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1399         if (err != 0) {
1400                 return err;
1401         }
1402         if (enabled == sc->pause)
1403                 return 0;
1404 
1405 	mtx_lock(&sc->driver_mtx);
1406 	err = mxge_change_pause(sc, enabled);
1407 	mtx_unlock(&sc->driver_mtx);
1408         return err;
1409 }
1410 
1411 static int
1412 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1413 {
1414 	struct ifnet *ifp;
1415 	int err = 0;
1416 
1417 	ifp = sc->ifp;
1418 	if (lro_cnt == 0)
1419 		ifp->if_capenable &= ~IFCAP_LRO;
1420 	else
1421 		ifp->if_capenable |= IFCAP_LRO;
1422 	sc->lro_cnt = lro_cnt;
1423 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1424 		mxge_close(sc, 0);
1425 		err = mxge_open(sc);
1426 	}
1427 	return err;
1428 }
1429 
1430 static int
1431 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1432 {
1433 	mxge_softc_t *sc;
1434 	unsigned int lro_cnt;
1435 	int err;
1436 
1437 	sc = arg1;
1438 	lro_cnt = sc->lro_cnt;
1439 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1440 	if (err != 0)
1441 		return err;
1442 
1443 	if (lro_cnt == sc->lro_cnt)
1444 		return 0;
1445 
1446 	if (lro_cnt > 128)
1447 		return EINVAL;
1448 
1449 	mtx_lock(&sc->driver_mtx);
1450 	err = mxge_change_lro_locked(sc, lro_cnt);
1451 	mtx_unlock(&sc->driver_mtx);
1452 	return err;
1453 }
1454 
1455 static int
1456 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1457 {
1458         int err;
1459 
1460         if (arg1 == NULL)
1461                 return EFAULT;
1462         arg2 = be32toh(*(int *)arg1);
1463         arg1 = NULL;
1464         err = sysctl_handle_int(oidp, arg1, arg2, req);
1465 
1466         return err;
1467 }
1468 
1469 static void
1470 mxge_rem_sysctls(mxge_softc_t *sc)
1471 {
1472 	struct mxge_slice_state *ss;
1473 	int slice;
1474 
1475 	if (sc->slice_sysctl_tree == NULL)
1476 		return;
1477 
1478 	for (slice = 0; slice < sc->num_slices; slice++) {
1479 		ss = &sc->ss[slice];
1480 		if (ss == NULL || ss->sysctl_tree == NULL)
1481 			continue;
1482 		sysctl_ctx_free(&ss->sysctl_ctx);
1483 		ss->sysctl_tree = NULL;
1484 	}
1485 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1486 	sc->slice_sysctl_tree = NULL;
1487 }
1488 
1489 static void
1490 mxge_add_sysctls(mxge_softc_t *sc)
1491 {
1492 	struct sysctl_ctx_list *ctx;
1493 	struct sysctl_oid_list *children;
1494 	mcp_irq_data_t *fw;
1495 	struct mxge_slice_state *ss;
1496 	int slice;
1497 	char slice_num[8];
1498 
1499 	ctx = device_get_sysctl_ctx(sc->dev);
1500 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1501 	fw = sc->ss[0].fw_stats;
1502 
1503 	/* random information */
1504 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1505 		       "firmware_version",
1506 		       CTLFLAG_RD, &sc->fw_version,
1507 		       0, "firmware version");
1508 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1509 		       "serial_number",
1510 		       CTLFLAG_RD, &sc->serial_number_string,
1511 		       0, "serial number");
1512 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1513 		       "product_code",
1514 		       CTLFLAG_RD, &sc->product_code_string,
1515 		       0, "product_code");
1516 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1517 		       "pcie_link_width",
1518 		       CTLFLAG_RD, &sc->link_width,
1519 		       0, "tx_boundary");
1520 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1521 		       "tx_boundary",
1522 		       CTLFLAG_RD, &sc->tx_boundary,
1523 		       0, "tx_boundary");
1524 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1525 		       "write_combine",
1526 		       CTLFLAG_RD, &sc->wc,
1527 		       0, "write combining PIO?");
1528 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1529 		       "read_dma_MBs",
1530 		       CTLFLAG_RD, &sc->read_dma,
1531 		       0, "DMA Read speed in MB/s");
1532 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1533 		       "write_dma_MBs",
1534 		       CTLFLAG_RD, &sc->write_dma,
1535 		       0, "DMA Write speed in MB/s");
1536 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1537 		       "read_write_dma_MBs",
1538 		       CTLFLAG_RD, &sc->read_write_dma,
1539 		       0, "DMA concurrent Read/Write speed in MB/s");
1540 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1541 		       "watchdog_resets",
1542 		       CTLFLAG_RD, &sc->watchdog_resets,
1543 		       0, "Number of times NIC was reset");
1544 
1545 
1546 	/* performance related tunables */
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"intr_coal_delay",
1549 			CTLTYPE_INT|CTLFLAG_RW, sc,
1550 			0, mxge_change_intr_coal,
1551 			"I", "interrupt coalescing delay in usecs");
1552 
1553 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 			"throttle",
1555 			CTLTYPE_INT|CTLFLAG_RW, sc,
1556 			0, mxge_change_throttle,
1557 			"I", "transmit throttling");
1558 
1559 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 			"flow_control_enabled",
1561 			CTLTYPE_INT|CTLFLAG_RW, sc,
1562 			0, mxge_change_flow_control,
1563 			"I", "interrupt coalescing delay in usecs");
1564 
1565 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1566 		       "deassert_wait",
1567 		       CTLFLAG_RW, &mxge_deassert_wait,
1568 		       0, "Wait for IRQ line to go low in ihandler");
1569 
1570 	/* stats block from firmware is in network byte order.
1571 	   Need to swap it */
1572 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1573 			"link_up",
1574 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1575 			0, mxge_handle_be32,
1576 			"I", "link up");
1577 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578 			"rdma_tags_available",
1579 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1580 			0, mxge_handle_be32,
1581 			"I", "rdma_tags_available");
1582 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 			"dropped_bad_crc32",
1584 			CTLTYPE_INT|CTLFLAG_RD,
1585 			&fw->dropped_bad_crc32,
1586 			0, mxge_handle_be32,
1587 			"I", "dropped_bad_crc32");
1588 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589 			"dropped_bad_phy",
1590 			CTLTYPE_INT|CTLFLAG_RD,
1591 			&fw->dropped_bad_phy,
1592 			0, mxge_handle_be32,
1593 			"I", "dropped_bad_phy");
1594 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1595 			"dropped_link_error_or_filtered",
1596 			CTLTYPE_INT|CTLFLAG_RD,
1597 			&fw->dropped_link_error_or_filtered,
1598 			0, mxge_handle_be32,
1599 			"I", "dropped_link_error_or_filtered");
1600 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1601 			"dropped_link_overflow",
1602 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1603 			0, mxge_handle_be32,
1604 			"I", "dropped_link_overflow");
1605 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1606 			"dropped_multicast_filtered",
1607 			CTLTYPE_INT|CTLFLAG_RD,
1608 			&fw->dropped_multicast_filtered,
1609 			0, mxge_handle_be32,
1610 			"I", "dropped_multicast_filtered");
1611 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1612 			"dropped_no_big_buffer",
1613 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1614 			0, mxge_handle_be32,
1615 			"I", "dropped_no_big_buffer");
1616 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1617 			"dropped_no_small_buffer",
1618 			CTLTYPE_INT|CTLFLAG_RD,
1619 			&fw->dropped_no_small_buffer,
1620 			0, mxge_handle_be32,
1621 			"I", "dropped_no_small_buffer");
1622 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1623 			"dropped_overrun",
1624 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1625 			0, mxge_handle_be32,
1626 			"I", "dropped_overrun");
1627 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1628 			"dropped_pause",
1629 			CTLTYPE_INT|CTLFLAG_RD,
1630 			&fw->dropped_pause,
1631 			0, mxge_handle_be32,
1632 			"I", "dropped_pause");
1633 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1634 			"dropped_runt",
1635 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1636 			0, mxge_handle_be32,
1637 			"I", "dropped_runt");
1638 
1639 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1640 			"dropped_unicast_filtered",
1641 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1642 			0, mxge_handle_be32,
1643 			"I", "dropped_unicast_filtered");
1644 
1645 	/* verbose printing? */
1646 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1647 		       "verbose",
1648 		       CTLFLAG_RW, &mxge_verbose,
1649 		       0, "verbose printing");
1650 
1651 	/* lro */
1652 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1653 			"lro_cnt",
1654 			CTLTYPE_INT|CTLFLAG_RW, sc,
1655 			0, mxge_change_lro,
1656 			"I", "number of lro merge queues");
1657 
1658 
1659 	/* add counters exported for debugging from all slices */
1660 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1661 	sc->slice_sysctl_tree =
1662 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1663 				"slice", CTLFLAG_RD, 0, "");
1664 
1665 	for (slice = 0; slice < sc->num_slices; slice++) {
1666 		ss = &sc->ss[slice];
1667 		sysctl_ctx_init(&ss->sysctl_ctx);
1668 		ctx = &ss->sysctl_ctx;
1669 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1670 		sprintf(slice_num, "%d", slice);
1671 		ss->sysctl_tree =
1672 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1673 					CTLFLAG_RD, 0, "");
1674 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "rx_small_cnt",
1677 			       CTLFLAG_RD, &ss->rx_small.cnt,
1678 			       0, "rx_small_cnt");
1679 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680 			       "rx_big_cnt",
1681 			       CTLFLAG_RD, &ss->rx_big.cnt,
1682 			       0, "rx_small_cnt");
1683 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1685 			       0, "number of lro merge queues flushed");
1686 
1687 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1689 			       0, "number of frames appended to lro merge"
1690 			       "queues");
1691 
1692 #ifndef IFNET_BUF_RING
1693 		/* only transmit from slice 0 for now */
1694 		if (slice > 0)
1695 			continue;
1696 #endif
1697 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1698 			       "tx_req",
1699 			       CTLFLAG_RD, &ss->tx.req,
1700 			       0, "tx_req");
1701 
1702 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1703 			       "tx_done",
1704 			       CTLFLAG_RD, &ss->tx.done,
1705 			       0, "tx_done");
1706 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1707 			       "tx_pkt_done",
1708 			       CTLFLAG_RD, &ss->tx.pkt_done,
1709 			       0, "tx_done");
1710 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1711 			       "tx_stall",
1712 			       CTLFLAG_RD, &ss->tx.stall,
1713 			       0, "tx_stall");
1714 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1715 			       "tx_wake",
1716 			       CTLFLAG_RD, &ss->tx.wake,
1717 			       0, "tx_wake");
1718 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1719 			       "tx_defrag",
1720 			       CTLFLAG_RD, &ss->tx.defrag,
1721 			       0, "tx_defrag");
1722 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1723 			       "tx_queue_active",
1724 			       CTLFLAG_RD, &ss->tx.queue_active,
1725 			       0, "tx_queue_active");
1726 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1727 			       "tx_activate",
1728 			       CTLFLAG_RD, &ss->tx.activate,
1729 			       0, "tx_activate");
1730 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1731 			       "tx_deactivate",
1732 			       CTLFLAG_RD, &ss->tx.deactivate,
1733 			       0, "tx_deactivate");
1734 	}
1735 }
1736 
1737 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1738    backwards one at a time and handle ring wraps */
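/*
 * Writing the later slots first matters: the NIC parses the group
 * starting at its first slot, so everything after that slot must be
 * in place before mxge_submit_req() writes the first slot and sets
 * its valid flags.
 */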
1739 
1740 static inline void
1741 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1742 			    mcp_kreq_ether_send_t *src, int cnt)
1743 {
1744         int idx, starting_slot;
1745         starting_slot = tx->req;
1746         while (cnt > 1) {
1747                 cnt--;
1748                 idx = (starting_slot + cnt) & tx->mask;
1749                 mxge_pio_copy(&tx->lanai[idx],
1750 			      &src[cnt], sizeof(*src));
1751                 wmb();
1752         }
1753 }
1754 
1755 /*
1756  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1757  * at most 32 bytes at a time, so as to avoid involving the software
1758  * pio handler in the nic.   We re-write the first segment's flags
1759  * to mark them valid only after writing the entire chain
1760  */
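/*
 * Each mcp_kreq_ether_send_t is 16 bytes (hence the 32-byte pairs
 * below, 2 * sizeof (*src), with a write barrier between pairs).
 * flags is the very last byte of a request, which is why clearing
 * src->flags up front and rewriting only the final 32-bit word
 * afterwards is enough to keep the NIC from parsing a partially
 * written chain.
 */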
1761 
1762 static inline void
1763 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1764                   int cnt)
1765 {
1766         int idx, i;
1767         uint32_t *src_ints;
1768 	volatile uint32_t *dst_ints;
1769         mcp_kreq_ether_send_t *srcp;
1770 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1771 	uint8_t last_flags;
1772 
1773         idx = tx->req & tx->mask;
1774 
1775 	last_flags = src->flags;
1776 	src->flags = 0;
1777         wmb();
1778         dst = dstp = &tx->lanai[idx];
1779         srcp = src;
1780 
1781         if ((idx + cnt) < tx->mask) {
1782                 for (i = 0; i < (cnt - 1); i += 2) {
1783                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1784                         wmb(); /* force write every 32 bytes */
1785                         srcp += 2;
1786                         dstp += 2;
1787                 }
1788         } else {
1789                 /* submit all but the first request, and ensure
1790                    that it is submitted below */
1791                 mxge_submit_req_backwards(tx, src, cnt);
1792                 i = 0;
1793         }
1794         if (i < cnt) {
1795                 /* submit the first request */
1796                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1797                 wmb(); /* barrier before setting valid flag */
1798         }
1799 
1800         /* re-write the last 32-bits with the valid flags */
1801         src->flags = last_flags;
1802         src_ints = (uint32_t *)src;
1803         src_ints+=3;
1804         dst_ints = (volatile uint32_t *)dst;
1805         dst_ints+=3;
1806         *dst_ints =  *src_ints;
1807         tx->req += cnt;
1808         wmb();
1809 }
1810 
1811 #if IFCAP_TSO4
1812 
1813 static void
1814 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1815 	       int busdma_seg_cnt, int ip_off)
1816 {
1817 	mxge_tx_ring_t *tx;
1818 	mcp_kreq_ether_send_t *req;
1819 	bus_dma_segment_t *seg;
1820 	struct ip *ip;
1821 	struct tcphdr *tcp;
1822 	uint32_t low, high_swapped;
1823 	int len, seglen, cum_len, cum_len_next;
1824 	int next_is_first, chop, cnt, rdma_count, small;
1825 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1826 	uint8_t flags, flags_next;
1827 	static int once;
1828 
1829 	mss = m->m_pkthdr.tso_segsz;
1830 
1831 	/* negative cum_len signifies to the
1832 	 * send loop that we are still in the
1833 	 * header portion of the TSO packet.
1834 	 */
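	/* e.g. with a 14-byte Ethernet header, a 20-byte IP header and
	   a 20-byte TCP header, cum_len starts at -54 and crosses zero
	   exactly at the first payload byte */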
1835 
1836 	/* ensure we have the ethernet, IP and TCP
1837 	   header together in the first mbuf, copy
1838 	   it to a scratch buffer if not */
1839 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1840 		m_copydata(m, 0, ip_off + sizeof (*ip),
1841 			   ss->scratch);
1842 		ip = (struct ip *)(ss->scratch + ip_off);
1843 	} else {
1844 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1845 	}
1846 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1847 			    + sizeof (*tcp))) {
1848 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1849 			   + sizeof (*tcp),  ss->scratch);
1850 		ip = (struct ip *)(ss->scratch + ip_off);
1851 	}
1852 
1853 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1854 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1855 
1856 	/* TSO implies checksum offload on this hardware */
1857 	cksum_offset = ip_off + (ip->ip_hl << 2);
1858 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1859 
1860 
1861 	/* for TSO, pseudo_hdr_offset holds mss.
1862 	 * The firmware figures out where to put
1863 	 * the checksum by parsing the header. */
1864 	pseudo_hdr_offset = htobe16(mss);
1865 
1866 	tx = &ss->tx;
1867 	req = tx->req_list;
1868 	seg = tx->seg_list;
1869 	cnt = 0;
1870 	rdma_count = 0;
1871 	/* "rdma_count" is the number of RDMAs belonging to the
1872 	 * current packet BEFORE the current send request. For
1873 	 * non-TSO packets, this is equal to "count".
1874 	 * For TSO packets, rdma_count needs to be reset
1875 	 * to 0 after a segment cut.
1876 	 *
1877 	 * The rdma_count field of the send request is
1878 	 * the number of RDMAs of the packet starting at
1879 	 * that request. For TSO send requests with one or more cuts
1880 	 * in the middle, this is the number of RDMAs starting
1881 	 * after the last cut in the request. All previous
1882 	 * segments before the last cut implicitly have 1 RDMA.
1883 	 *
1884 	 * Since the number of RDMAs is not known beforehand,
1885 	 * it must be filled-in retroactively - after each
1886 	 * segmentation cut or at the end of the entire packet.
1887 	 */
1888 
1889 	while (busdma_seg_cnt) {
1890 		/* Break the busdma segment up into pieces */
1891 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1892 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1893 		len = seg->ds_len;
1894 
1895 		while (len) {
1896 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1897 			seglen = len;
1898 			cum_len_next = cum_len + seglen;
1899 			(req-rdma_count)->rdma_count = rdma_count + 1;
1900 			if (__predict_true(cum_len >= 0)) {
1901 				/* payload */
1902 				chop = (cum_len_next > mss);
1903 				cum_len_next = cum_len_next % mss;
1904 				next_is_first = (cum_len_next == 0);
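				/* branch-free updates: chop and
				 * next_is_first are 0 or 1; the
				 * multiplies select flags and the
				 * -(chop | next_is_first) idiom
				 * resets rdma_count at a cut */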
1905 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1906 				flags_next |= next_is_first *
1907 					MXGEFW_FLAGS_FIRST;
1908 				rdma_count |= -(chop | next_is_first);
1909 				rdma_count += chop & !next_is_first;
1910 			} else if (cum_len_next >= 0) {
1911 				/* header ends */
1912 				rdma_count = -1;
1913 				cum_len_next = 0;
1914 				seglen = -cum_len;
1915 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1916 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1917 					MXGEFW_FLAGS_FIRST |
1918 					(small * MXGEFW_FLAGS_SMALL);
1919 			}
1920 
1921 			req->addr_high = high_swapped;
1922 			req->addr_low = htobe32(low);
1923 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1924 			req->pad = 0;
1925 			req->rdma_count = 1;
1926 			req->length = htobe16(seglen);
1927 			req->cksum_offset = cksum_offset;
1928 			req->flags = flags | ((cum_len & 1) *
1929 					      MXGEFW_FLAGS_ALIGN_ODD);
1930 			low += seglen;
1931 			len -= seglen;
1932 			cum_len = cum_len_next;
1933 			flags = flags_next;
1934 			req++;
1935 			cnt++;
1936 			rdma_count++;
1937 			if (__predict_false(cksum_offset > seglen))
1938 				cksum_offset -= seglen;
1939 			else
1940 				cksum_offset = 0;
1941 			if (__predict_false(cnt > tx->max_desc))
1942 				goto drop;
1943 		}
1944 		busdma_seg_cnt--;
1945 		seg++;
1946 	}
1947 	(req-rdma_count)->rdma_count = rdma_count;
1948 
1949 	do {
1950 		req--;
1951 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1952 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1953 
1954 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1955 	mxge_submit_req(tx, tx->req_list, cnt);
1956 #ifdef IFNET_BUF_RING
1957 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1958 		/* tell the NIC to start polling this slice */
1959 		*tx->send_go = 1;
1960 		tx->queue_active = 1;
1961 		tx->activate++;
1962 		wmb();
1963 	}
1964 #endif
1965 	return;
1966 
1967 drop:
1968 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1969 	m_freem(m);
1970 	ss->oerrors++;
1971 	if (!once) {
1972 		printf("tx->max_desc exceeded via TSO!\n");
1973 		printf("mss = %d, %ld, %d!\n", mss,
1974 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1975 		once = 1;
1976 	}
1977 	return;
1978 
1979 }
1980 
1981 #endif /* IFCAP_TSO4 */
1982 
1983 #ifdef MXGE_NEW_VLAN_API
1984 /*
1985  * We reproduce the software vlan tag insertion from
1986  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1987  * vlan tag insertion. We need to advertise this in order to have the
1988  * vlan interface respect our csum offload flags.
1989  */
1990 static struct mbuf *
1991 mxge_vlan_tag_insert(struct mbuf *m)
1992 {
1993 	struct ether_vlan_header *evl;
1994 
1995 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1996 	if (__predict_false(m == NULL))
1997 		return NULL;
1998 	if (m->m_len < sizeof(*evl)) {
1999 		m = m_pullup(m, sizeof(*evl));
2000 		if (__predict_false(m == NULL))
2001 			return NULL;
2002 	}
2003 	/*
2004 	 * Transform the Ethernet header into an Ethernet header
2005 	 * with 802.1Q encapsulation.
2006 	 */
2007 	evl = mtod(m, struct ether_vlan_header *);
2008 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2009 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2010 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2011 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2012 	m->m_flags &= ~M_VLANTAG;
2013 	return m;
2014 }
2015 #endif /* MXGE_NEW_VLAN_API */
2016 
2017 static void
2018 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2019 {
2020 	mxge_softc_t *sc;
2021 	mcp_kreq_ether_send_t *req;
2022 	bus_dma_segment_t *seg;
2023 	struct mbuf *m_tmp;
2024 	struct ifnet *ifp;
2025 	mxge_tx_ring_t *tx;
2026 	struct ip *ip;
2027 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2028 	uint16_t pseudo_hdr_offset;
2029 	uint8_t flags, cksum_offset;
2030 
2031 
2032 	sc = ss->sc;
2033 	ifp = sc->ifp;
2034 	tx = &ss->tx;
2035 
2036 	ip_off = sizeof (struct ether_header);
2037 #ifdef MXGE_NEW_VLAN_API
2038 	if (m->m_flags & M_VLANTAG) {
2039 		m = mxge_vlan_tag_insert(m);
2040 		if (__predict_false(m == NULL))
2041 			goto drop;
2042 		ip_off += ETHER_VLAN_ENCAP_LEN;
2043 	}
2044 #endif
2045 	/* (try to) map the frame for DMA */
2046 	idx = tx->req & tx->mask;
2047 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2048 				      m, tx->seg_list, &cnt,
2049 				      BUS_DMA_NOWAIT);
2050 	if (__predict_false(err == EFBIG)) {
2051 		/* Too many segments in the chain.  Try
2052 		   to defrag */
2053 		m_tmp = m_defrag(m, M_NOWAIT);
2054 		if (m_tmp == NULL) {
2055 			goto drop;
2056 		}
2057 		ss->tx.defrag++;
2058 		m = m_tmp;
2059 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2060 					      tx->info[idx].map,
2061 					      m, tx->seg_list, &cnt,
2062 					      BUS_DMA_NOWAIT);
2063 	}
2064 	if (__predict_false(err != 0)) {
2065 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2066 			      " packet len = %d\n", err, m->m_pkthdr.len);
2067 		goto drop;
2068 	}
2069 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2070 			BUS_DMASYNC_PREWRITE);
2071 	tx->info[idx].m = m;
2072 
2073 #if IFCAP_TSO4
2074 	/* TSO is different enough, we handle it in another routine */
2075 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2076 		mxge_encap_tso(ss, m, cnt, ip_off);
2077 		return;
2078 	}
2079 #endif
2080 
2081 	req = tx->req_list;
2082 	cksum_offset = 0;
2083 	pseudo_hdr_offset = 0;
2084 	flags = MXGEFW_FLAGS_NO_TSO;
2085 
2086 	/* checksum offloading? */
2087 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2088 		/* ensure ip header is in first mbuf, copy
2089 		   it to a scratch buffer if not */
2090 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2091 			m_copydata(m, 0, ip_off + sizeof (*ip),
2092 				   ss->scratch);
2093 			ip = (struct ip *)(ss->scratch + ip_off);
2094 		} else {
2095 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2096 		}
2097 		cksum_offset = ip_off + (ip->ip_hl << 2);
2098 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2099 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2100 		req->cksum_offset = cksum_offset;
2101 		flags |= MXGEFW_FLAGS_CKSUM;
2102 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2103 	} else {
2104 		odd_flag = 0;
2105 	}
2106 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2107 		flags |= MXGEFW_FLAGS_SMALL;
2108 
2109 	/* convert segments into a request list */
2110 	cum_len = 0;
2111 	seg = tx->seg_list;
2112 	req->flags = MXGEFW_FLAGS_FIRST;
2113 	for (i = 0; i < cnt; i++) {
2114 		req->addr_low =
2115 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2116 		req->addr_high =
2117 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2118 		req->length = htobe16(seg->ds_len);
2119 		req->cksum_offset = cksum_offset;
2120 		if (cksum_offset > seg->ds_len)
2121 			cksum_offset -= seg->ds_len;
2122 		else
2123 			cksum_offset = 0;
2124 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2125 		req->pad = 0; /* complete solid 16-byte block */
2126 		req->rdma_count = 1;
2127 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2128 		cum_len += seg->ds_len;
2129 		seg++;
2130 		req++;
2131 		req->flags = 0;
2132 	}
2133 	req--;
2134 	/* pad runts to 60 bytes */
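	/* (60 is ETHER_MIN_LEN less the 4-byte CRC appended by hw) */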
2135 	if (cum_len < 60) {
2136 		req++;
2137 		req->addr_low =
2138 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2139 		req->addr_high =
2140 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2141 		req->length = htobe16(60 - cum_len);
2142 		req->cksum_offset = 0;
2143 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2144 		req->pad = 0; /* complete solid 16-byte block */
2145 		req->rdma_count = 1;
2146 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2147 		cnt++;
2148 	}
2149 
2150 	tx->req_list[0].rdma_count = cnt;
2151 #if 0
2152 	/* print what the firmware will see */
2153 	for (i = 0; i < cnt; i++) {
2154 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2155 		    "cso:%d, flags:0x%x, rdma:%d\n",
2156 		    i, (int)ntohl(tx->req_list[i].addr_high),
2157 		    (int)ntohl(tx->req_list[i].addr_low),
2158 		    (int)ntohs(tx->req_list[i].length),
2159 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2160 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2161 		    tx->req_list[i].rdma_count);
2162 	}
2163 	printf("--------------\n");
2164 #endif
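	/* flag the last descriptor so mxge_tx_done() counts the
	   packet as complete */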
2165 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2166 	mxge_submit_req(tx, tx->req_list, cnt);
2167 #ifdef IFNET_BUF_RING
2168 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2169 		/* tell the NIC to start polling this slice */
2170 		*tx->send_go = 1;
2171 		tx->queue_active = 1;
2172 		tx->activate++;
2173 		wmb();
2174 	}
2175 #endif
2176 	return;
2177 
2178 drop:
2179 	m_freem(m);
2180 	ss->oerrors++;
2181 	return;
2182 }
2183 
2184 #ifdef IFNET_BUF_RING
2185 static void
2186 mxge_qflush(struct ifnet *ifp)
2187 {
2188 	mxge_softc_t *sc = ifp->if_softc;
2189 	mxge_tx_ring_t *tx;
2190 	struct mbuf *m;
2191 	int slice;
2192 
2193 	for (slice = 0; slice < sc->num_slices; slice++) {
2194 		tx = &sc->ss[slice].tx;
2195 		mtx_lock(&tx->mtx);
2196 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2197 			m_freem(m);
2198 		mtx_unlock(&tx->mtx);
2199 	}
2200 	if_qflush(ifp);
2201 }
2202 
2203 static inline void
2204 mxge_start_locked(struct mxge_slice_state *ss)
2205 {
2206 	mxge_softc_t *sc;
2207 	struct mbuf *m;
2208 	struct ifnet *ifp;
2209 	mxge_tx_ring_t *tx;
2210 
2211 	sc = ss->sc;
2212 	ifp = sc->ifp;
2213 	tx = &ss->tx;
2214 
2215 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2216 		m = drbr_dequeue(ifp, tx->br);
2217 		if (m == NULL) {
2218 			return;
2219 		}
2220 		/* let BPF see it */
2221 		BPF_MTAP(ifp, m);
2222 
2223 		/* give it to the nic */
2224 		mxge_encap(ss, m);
2225 	}
2226 	/* ran out of transmit slots */
2227 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2228 	    && (!drbr_empty(ifp, tx->br))) {
2229 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2230 		tx->stall++;
2231 	}
2232 }
2233 
2234 static int
2235 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2236 {
2237 	mxge_softc_t *sc;
2238 	struct ifnet *ifp;
2239 	mxge_tx_ring_t *tx;
2240 	int err;
2241 
2242 	sc = ss->sc;
2243 	ifp = sc->ifp;
2244 	tx = &ss->tx;
2245 
2246 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2247 	    IFF_DRV_RUNNING) {
2248 		err = drbr_enqueue(ifp, tx->br, m);
2249 		return (err);
2250 	}
2251 
2252 	if (drbr_empty(ifp, tx->br) &&
2253 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2254 		/* let BPF see it */
2255 		BPF_MTAP(ifp, m);
2256 		/* give it to the nic */
2257 		mxge_encap(ss, m);
2258 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2259 		return (err);
2260 	}
2261 	if (!drbr_empty(ifp, tx->br))
2262 		mxge_start_locked(ss);
2263 	return (0);
2264 }
2265 
2266 static int
2267 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2268 {
2269 	mxge_softc_t *sc = ifp->if_softc;
2270 	struct mxge_slice_state *ss;
2271 	mxge_tx_ring_t *tx;
2272 	int err = 0;
2273 	int slice;
2274 
2275 	slice = m->m_pkthdr.flowid;
2276 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2277 
2278 	ss = &sc->ss[slice];
2279 	tx = &ss->tx;
2280 
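	/* if another cpu holds the tx lock, just queue the mbuf;
	   the lock holder should drain the buf_ring before leaving */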
2281 	if (mtx_trylock(&tx->mtx)) {
2282 		err = mxge_transmit_locked(ss, m);
2283 		mtx_unlock(&tx->mtx);
2284 	} else {
2285 		err = drbr_enqueue(ifp, tx->br, m);
2286 	}
2287 
2288 	return (err);
2289 }
2290 
2291 #else
2292 
2293 static inline void
2294 mxge_start_locked(struct mxge_slice_state *ss)
2295 {
2296 	mxge_softc_t *sc;
2297 	struct mbuf *m;
2298 	struct ifnet *ifp;
2299 	mxge_tx_ring_t *tx;
2300 
2301 	sc = ss->sc;
2302 	ifp = sc->ifp;
2303 	tx = &ss->tx;
2304 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2305 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2306 		if (m == NULL) {
2307 			return;
2308 		}
2309 		/* let BPF see it */
2310 		BPF_MTAP(ifp, m);
2311 
2312 		/* give it to the nic */
2313 		mxge_encap(ss, m);
2314 	}
2315 	/* ran out of transmit slots */
2316 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2317 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2318 		tx->stall++;
2319 	}
2320 }
2321 #endif
2322 static void
2323 mxge_start(struct ifnet *ifp)
2324 {
2325 	mxge_softc_t *sc = ifp->if_softc;
2326 	struct mxge_slice_state *ss;
2327 
2328 	/* only use the first slice for now */
2329 	ss = &sc->ss[0];
2330 	mtx_lock(&ss->tx.mtx);
2331 	mxge_start_locked(ss);
2332 	mtx_unlock(&ss->tx.mtx);
2333 }
2334 
2335 /*
2336  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2337  * at most 32 bytes at a time, so as to avoid involving the software
2338  * pio handler in the nic.   We re-write the first segment's low
2339  * DMA address to mark it valid only after we write the entire chunk
2340  * in a burst
2341  */
2342 static inline void
2343 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2344 		mcp_kreq_ether_recv_t *src)
2345 {
2346 	uint32_t low;
2347 
2348 	low = src->addr_low;
2349 	src->addr_low = 0xffffffff;
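	/* 0xffffffff is never a valid low address, so the NIC will
	   skip this block until the real address is restored below */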
2350 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2351 	wmb();
2352 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2353 	wmb();
2354 	src->addr_low = low;
2355 	dst->addr_low = low;
2356 	wmb();
2357 }
2358 
2359 static int
2360 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2361 {
2362 	bus_dma_segment_t seg;
2363 	struct mbuf *m;
2364 	mxge_rx_ring_t *rx = &ss->rx_small;
2365 	int cnt, err;
2366 
2367 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2368 	if (m == NULL) {
2369 		rx->alloc_fail++;
2370 		err = ENOBUFS;
2371 		goto done;
2372 	}
2373 	m->m_len = MHLEN;
2374 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2375 				      &seg, &cnt, BUS_DMA_NOWAIT);
2376 	if (err != 0) {
2377 		m_free(m);
2378 		goto done;
2379 	}
2380 	rx->info[idx].m = m;
2381 	rx->shadow[idx].addr_low =
2382 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2383 	rx->shadow[idx].addr_high =
2384 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2385 
2386 done:
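	/* receive buffers are posted to the NIC in bursts of 8 */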
2387 	if ((idx & 7) == 7)
2388 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2389 	return err;
2390 }
2391 
2392 static int
2393 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2394 {
2395 	bus_dma_segment_t seg[3];
2396 	struct mbuf *m;
2397 	mxge_rx_ring_t *rx = &ss->rx_big;
2398 	int cnt, err, i;
2399 
2400 	if (rx->cl_size == MCLBYTES)
2401 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2402 	else
2403 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2404 	if (m == NULL) {
2405 		rx->alloc_fail++;
2406 		err = ENOBUFS;
2407 		goto done;
2408 	}
2409 	m->m_len = rx->mlen;
2410 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2411 				      seg, &cnt, BUS_DMA_NOWAIT);
2412 	if (err != 0) {
2413 		m_free(m);
2414 		goto done;
2415 	}
2416 	rx->info[idx].m = m;
2417 	rx->shadow[idx].addr_low =
2418 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2419 	rx->shadow[idx].addr_high =
2420 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2421 
2422 #if MXGE_VIRT_JUMBOS
2423 	for (i = 1; i < cnt; i++) {
2424 		rx->shadow[idx + i].addr_low =
2425 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2426 		rx->shadow[idx + i].addr_high =
2427 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2428 	}
2429 #endif
2430 
2431 done:
2432 	for (i = 0; i < rx->nbufs; i++) {
2433 		if ((idx & 7) == 7) {
2434 			mxge_submit_8rx(&rx->lanai[idx - 7],
2435 					&rx->shadow[idx - 7]);
2436 		}
2437 		idx++;
2438 	}
2439 	return err;
2440 }
2441 
2442 /*
2443  *  Myri10GE hardware checksums are not valid if the sender
2444  *  padded the frame with non-zero padding.  This is because
2445  *  the firmware just does a simple 16-bit 1s complement
2446  *  checksum across the entire frame, excluding the first 14
2447  *  bytes.  It is best to simply check the checksum and
2448  *  tell the stack about it only if the checksum is good
2449  */
2450 
2451 static inline uint16_t
2452 mxge_rx_csum(struct mbuf *m, int csum)
2453 {
2454 	struct ether_header *eh;
2455 	struct ip *ip;
2456 	uint16_t c;
2457 
2458 	eh = mtod(m, struct ether_header *);
2459 
2460 	/* only deal with IPv4 TCP & UDP for now */
2461 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2462 		return 1;
2463 	ip = (struct ip *)(eh + 1);
2464 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2465 			    ip->ip_p != IPPROTO_UDP))
2466 		return 1;
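	/* the firmware's raw sum plus the pseudo-header sum is 0xffff
	   for a valid packet; the xor below maps that to 0 */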
2467 #ifdef INET
2468 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2469 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2470 			    (ip->ip_hl << 2) + ip->ip_p));
2471 #else
2472 	c = 1;
2473 #endif
2474 	c ^= 0xffff;
2475 	return (c);
2476 }
2477 
2478 static void
2479 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2480 {
2481 	struct ether_vlan_header *evl;
2482 	struct ether_header *eh;
2483 	uint32_t partial;
2484 
2485 	evl = mtod(m, struct ether_vlan_header *);
2486 	eh = mtod(m, struct ether_header *);
2487 
2488 	/*
2489 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2490 	 * after what the firmware thought was the end of the ethernet
2491 	 * header.
2492 	 */
2493 
2494 	/* put checksum into host byte order */
2495 	*csum = ntohs(*csum);
2496 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2497 	(*csum) += ~partial;
2498 	(*csum) +=  ((*csum) < ~partial);
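	/* fold the 32-bit sum back into 16 bits, twice so that any
	   carry from the first fold is absorbed */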
2499 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2500 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2501 
2502 	/* restore checksum to network byte order;
2503 	   later consumers expect this */
2504 	*csum = htons(*csum);
2505 
2506 	/* save the tag */
2507 #ifdef MXGE_NEW_VLAN_API
2508 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2509 #else
2510 	{
2511 		struct m_tag *mtag;
2512 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2513 				   M_NOWAIT);
2514 		if (mtag == NULL)
2515 			return;
2516 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2517 		m_tag_prepend(m, mtag);
2518 	}
2519 
2520 #endif
2521 	m->m_flags |= M_VLANTAG;
2522 
2523 	/*
2524 	 * Remove the 802.1q header by copying the Ethernet
2525 	 * addresses over it and adjusting the beginning of
2526 	 * the data in the mbuf.  The encapsulated Ethernet
2527 	 * type field is already in place.
2528 	 */
2529 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2530 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2531 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2532 }
2533 
2534 
2535 static inline void
2536 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2537 {
2538 	mxge_softc_t *sc;
2539 	struct ifnet *ifp;
2540 	struct mbuf *m;
2541 	struct ether_header *eh;
2542 	mxge_rx_ring_t *rx;
2543 	bus_dmamap_t old_map;
2544 	int idx;
2545 	uint16_t tcpudp_csum;
2546 
2547 	sc = ss->sc;
2548 	ifp = sc->ifp;
2549 	rx = &ss->rx_big;
2550 	idx = rx->cnt & rx->mask;
2551 	rx->cnt += rx->nbufs;
2552 	/* save a pointer to the received mbuf */
2553 	m = rx->info[idx].m;
2554 	/* try to replace the received mbuf */
2555 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2556 		/* drop the frame -- the old mbuf is re-cycled */
2557 		ifp->if_ierrors++;
2558 		return;
2559 	}
2560 
2561 	/* unmap the received buffer */
2562 	old_map = rx->info[idx].map;
2563 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2564 	bus_dmamap_unload(rx->dmat, old_map);
2565 
2566 	/* swap the bus_dmamap_t's */
2567 	rx->info[idx].map = rx->extra_map;
2568 	rx->extra_map = old_map;
2569 
2570 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2571 	 * aligned */
2572 	m->m_data += MXGEFW_PAD;
2573 
2574 	m->m_pkthdr.rcvif = ifp;
2575 	m->m_len = m->m_pkthdr.len = len;
2576 	ss->ipackets++;
2577 	eh = mtod(m, struct ether_header *);
2578 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2579 		mxge_vlan_tag_remove(m, &csum);
2580 	}
2581 	/* if the checksum is valid, mark it in the mbuf header */
2582 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2583 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2584 			return;
2585 		/* otherwise, it was a UDP frame, or a TCP frame which
2586 		   we could not do LRO on.  Tell the stack that the
2587 		   checksum is good */
2588 		m->m_pkthdr.csum_data = 0xffff;
2589 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2590 	}
2591 	/* flowid only valid if RSS hashing is enabled */
2592 	if (sc->num_slices > 1) {
2593 		m->m_pkthdr.flowid = (ss - sc->ss);
2594 		m->m_flags |= M_FLOWID;
2595 	}
2596 	/* pass the frame up the stack */
2597 	(*ifp->if_input)(ifp, m);
2598 }
2599 
2600 static inline void
2601 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2602 {
2603 	mxge_softc_t *sc;
2604 	struct ifnet *ifp;
2605 	struct ether_header *eh;
2606 	struct mbuf *m;
2607 	mxge_rx_ring_t *rx;
2608 	bus_dmamap_t old_map;
2609 	int idx;
2610 	uint16_t tcpudp_csum;
2611 
2612 	sc = ss->sc;
2613 	ifp = sc->ifp;
2614 	rx = &ss->rx_small;
2615 	idx = rx->cnt & rx->mask;
2616 	rx->cnt++;
2617 	/* save a pointer to the received mbuf */
2618 	m = rx->info[idx].m;
2619 	/* try to replace the received mbuf */
2620 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621 		/* drop the frame -- the old mbuf is re-cycled */
2622 		ifp->if_ierrors++;
2623 		return;
2624 	}
2625 
2626 	/* unmap the received buffer */
2627 	old_map = rx->info[idx].map;
2628 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629 	bus_dmamap_unload(rx->dmat, old_map);
2630 
2631 	/* swap the bus_dmamap_t's */
2632 	rx->info[idx].map = rx->extra_map;
2633 	rx->extra_map = old_map;
2634 
2635 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2636 	 * aligned */
2637 	m->m_data += MXGEFW_PAD;
2638 
2639 	m->m_pkthdr.rcvif = ifp;
2640 	m->m_len = m->m_pkthdr.len = len;
2641 	ss->ipackets++;
2642 	eh = mtod(m, struct ether_header *);
2643 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644 		mxge_vlan_tag_remove(m, &csum);
2645 	}
2646 	/* if the checksum is valid, mark it in the mbuf header */
2647 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2649 			return;
2650 		/* otherwise, it was a UDP frame, or a TCP frame which
2651 		   we could not do LRO on.  Tell the stack that the
2652 		   checksum is good */
2653 		m->m_pkthdr.csum_data = 0xffff;
2654 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2655 	}
2656 	/* flowid only valid if RSS hashing is enabled */
2657 	if (sc->num_slices > 1) {
2658 		m->m_pkthdr.flowid = (ss - sc->ss);
2659 		m->m_flags |= M_FLOWID;
2660 	}
2661 	/* pass the frame up the stack */
2662 	(*ifp->if_input)(ifp, m);
2663 }
2664 
2665 static inline void
2666 mxge_clean_rx_done(struct mxge_slice_state *ss)
2667 {
2668 	mxge_rx_done_t *rx_done = &ss->rx_done;
2669 	int limit = 0;
2670 	uint16_t length;
2671 	uint16_t checksum;
2672 
2673 
2674 	while (rx_done->entry[rx_done->idx].length != 0) {
2675 		length = ntohs(rx_done->entry[rx_done->idx].length);
2676 		rx_done->entry[rx_done->idx].length = 0;
2677 		checksum = rx_done->entry[rx_done->idx].checksum;
2678 		if (length <= (MHLEN - MXGEFW_PAD))
2679 			mxge_rx_done_small(ss, length, checksum);
2680 		else
2681 			mxge_rx_done_big(ss, length, checksum);
2682 		rx_done->cnt++;
2683 		rx_done->idx = rx_done->cnt & rx_done->mask;
2684 
2685 		/* limit potential for livelock */
2686 		if (__predict_false(++limit > rx_done->mask / 2))
2687 			break;
2688 	}
2689 #ifdef INET
2690 	while (!SLIST_EMPTY(&ss->lro_active)) {
2691 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2692 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2693 		mxge_lro_flush(ss, lro);
2694 	}
2695 #endif
2696 }
2697 
2698 
2699 static inline void
2700 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2701 {
2702 	struct ifnet *ifp;
2703 	mxge_tx_ring_t *tx;
2704 	struct mbuf *m;
2705 	bus_dmamap_t map;
2706 	int idx;
2707 	int *flags;
2708 
2709 	tx = &ss->tx;
2710 	ifp = ss->sc->ifp;
2711 	while (tx->pkt_done != mcp_idx) {
2712 		idx = tx->done & tx->mask;
2713 		tx->done++;
2714 		m = tx->info[idx].m;
2715 		/* mbuf and DMA map only attached to the first
2716 		   segment per-mbuf */
2717 		if (m != NULL) {
2718 			ss->obytes += m->m_pkthdr.len;
2719 			if (m->m_flags & M_MCAST)
2720 				ss->omcasts++;
2721 			ss->opackets++;
2722 			tx->info[idx].m = NULL;
2723 			map = tx->info[idx].map;
2724 			bus_dmamap_unload(tx->dmat, map);
2725 			m_freem(m);
2726 		}
2727 		if (tx->info[idx].flag) {
2728 			tx->info[idx].flag = 0;
2729 			tx->pkt_done++;
2730 		}
2731 	}
2732 
2733 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2734 	   it's OK to send packets */
2735 #ifdef IFNET_BUF_RING
2736 	flags = &ss->if_drv_flags;
2737 #else
2738 	flags = &ifp->if_drv_flags;
2739 #endif
2740 	mtx_lock(&ss->tx.mtx);
2741 	if ((*flags) & IFF_DRV_OACTIVE &&
2742 	    tx->req - tx->done < (tx->mask + 1)/4) {
2743 		*(flags) &= ~IFF_DRV_OACTIVE;
2744 		ss->tx.wake++;
2745 		mxge_start_locked(ss);
2746 	}
2747 #ifdef IFNET_BUF_RING
2748 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2749 		/* let the NIC stop polling this queue, since there
2750 		 * are no more transmits pending */
2752 		*tx->send_stop = 1;
2753 		tx->queue_active = 0;
2754 		tx->deactivate++;
2755 		wmb();
2757 	}
2758 #endif
2759 	mtx_unlock(&ss->tx.mtx);
2760 
2761 }
2762 
2763 static struct mxge_media_type mxge_xfp_media_types[] =
2764 {
2765 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2766 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2767 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2768 	{0,		(1 << 5),	"10GBASE-ER"},
2769 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2770 	{0,		(1 << 3),	"10GBASE-SW"},
2771 	{0,		(1 << 2),	"10GBASE-LW"},
2772 	{0,		(1 << 1),	"10GBASE-EW"},
2773 	{0,		(1 << 0),	"Reserved"}
2774 };
2775 static struct mxge_media_type mxge_sfp_media_types[] =
2776 {
2777 	{0,		(1 << 7),	"Reserved"},
2778 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2779 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2780 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2781 };
2782 
2783 static void
2784 mxge_set_media(mxge_softc_t *sc, int type)
2785 {
2786 	sc->media_flags |= type;
2787 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2788 	ifmedia_set(&sc->media, sc->media_flags);
2789 }
2790 
2791 
2792 /*
2793  * Determine the media type for a NIC.  Some XFPs will identify
2794  * themselves only when their link is up, so this is initiated via a
2795  * link up interrupt.  However, this can potentially take up to
2796  * several milliseconds, so it is run via the watchdog routine, rather
2797  * than in the interrupt handler itself.   This need only be done
2798  * once, not each time the link is up.
2799  */
2800 static void
2801 mxge_media_probe(mxge_softc_t *sc)
2802 {
2803 	mxge_cmd_t cmd;
2804 	char *cage_type;
2805 	char *ptr;
2806 	struct mxge_media_type *mxge_media_types = NULL;
2807 	int i, err, ms, mxge_media_type_entries;
2808 	uint32_t byte;
2809 
2810 	sc->need_media_probe = 0;
2811 
2812 	/* if we've already set a media type, we're done */
2813 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2814 		return;
2815 
2816 	/*
2817 	 * parse the product code to determine the interface type
2818 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2819 	 * after the 3rd dash in the driver's cached copy of the
2820 	 * EEPROM's product code string.
2821 	 */
2822 	ptr = sc->product_code_string;
2823 	if (ptr == NULL) {
2824 		device_printf(sc->dev, "Missing product code\n");
		return;
2825 	}
2826 
2827 	for (i = 0; i < 3; i++, ptr++) {
2828 		ptr = index(ptr, '-');
2829 		if (ptr == NULL) {
2830 			device_printf(sc->dev,
2831 				      "only %d dashes in PC?!?\n", i);
2832 			return;
2833 		}
2834 	}
2835 	if (*ptr == 'C') {
2836 		/* -C is CX4 */
2837 		mxge_set_media(sc, IFM_10G_CX4);
2838 		return;
2839 	}
2840 	else if (*ptr == 'Q') {
2841 		/* -Q is Quad Ribbon Fiber */
2842 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2843 		/* FreeBSD has no media type for Quad ribbon fiber */
2844 		return;
2845 	}
2846 
2847 	if (*ptr == 'R') {
2848 		/* -R is XFP */
2849 		mxge_media_types = mxge_xfp_media_types;
2850 		mxge_media_type_entries =
2851 			sizeof (mxge_xfp_media_types) /
2852 			sizeof (mxge_xfp_media_types[0]);
2853 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2854 		cage_type = "XFP";
2855 	}
2856 
2857 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2858 		/* -S or -2S is SFP+ */
2859 		mxge_media_types = mxge_sfp_media_types;
2860 		mxge_media_type_entries =
2861 			sizeof (mxge_sfp_media_types) /
2862 			sizeof (mxge_sfp_media_types[0]);
2863 		cage_type = "SFP+";
2864 		byte = 3;
2865 	}
2866 
2867 	if (mxge_media_types == NULL) {
2868 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2869 		return;
2870 	}
2871 
2872 	/*
2873 	 * At this point we know the NIC has a module cage (XFP or
2874 	 * SFP+), so now we try to determine what is in the cage by
2875 	 * using the firmware's I2C commands to read the module's
2876 	 * 10GbE compliance register.  We read just one byte, which
2877 	 * may take over a millisecond
2878 	 */
2879 
2880 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2881 	cmd.data1 = byte;
2882 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2883 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2884 		device_printf(sc->dev, "failed to read XFP\n");
2885 	}
2886 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2887 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2888 	}
2889 	if (err != MXGEFW_CMD_OK) {
2890 		return;
2891 	}
2892 
2893 	/* now we wait for the data to be cached */
2894 	cmd.data0 = byte;
2895 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2896 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2897 		DELAY(1000);
2898 		cmd.data0 = byte;
2899 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2900 	}
2901 	if (err != MXGEFW_CMD_OK) {
2902 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2903 			      cage_type, err, ms);
2904 		return;
2905 	}
2906 
2907 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2908 		if (mxge_verbose)
2909 			device_printf(sc->dev, "%s:%s\n", cage_type,
2910 				      mxge_media_types[0].name);
2911 		mxge_set_media(sc, IFM_10G_CX4);
2912 		return;
2913 	}
2914 	for (i = 1; i < mxge_media_type_entries; i++) {
2915 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2916 			if (mxge_verbose)
2917 				device_printf(sc->dev, "%s:%s\n",
2918 					      cage_type,
2919 					      mxge_media_types[i].name);
2920 
2921 			mxge_set_media(sc, mxge_media_types[i].flag);
2922 			return;
2923 		}
2924 	}
2925 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2926 		      cmd.data0);
2927 
2928 	return;
2929 }
2930 
2931 static void
2932 mxge_intr(void *arg)
2933 {
2934 	struct mxge_slice_state *ss = arg;
2935 	mxge_softc_t *sc = ss->sc;
2936 	mcp_irq_data_t *stats = ss->fw_stats;
2937 	mxge_tx_ring_t *tx = &ss->tx;
2938 	mxge_rx_done_t *rx_done = &ss->rx_done;
2939 	uint32_t send_done_count;
2940 	uint8_t valid;
2941 
2942 
2943 #ifndef IFNET_BUF_RING
2944 	/* an interrupt on a non-zero slice is implicitly valid
2945 	   since MSI-X irqs are not shared */
2946 	if (ss != sc->ss) {
2947 		mxge_clean_rx_done(ss);
2948 		*ss->irq_claim = be32toh(3);
2949 		return;
2950 	}
2951 #endif
2952 
2953 	/* make sure the DMA has finished */
2954 	if (!stats->valid) {
2955 		return;
2956 	}
2957 	valid = stats->valid;
2958 
2959 	if (sc->legacy_irq) {
2960 		/* lower legacy IRQ  */
2961 		*sc->irq_deassert = 0;
2962 		if (!mxge_deassert_wait)
2963 			/* don't wait for conf. that irq is low */
2964 			stats->valid = 0;
2965 	} else {
2966 		stats->valid = 0;
2967 	}
2968 
2969 	/* loop while waiting for legacy irq deassertion */
2970 	do {
2971 		/* check for transmit completes and receives */
2972 		send_done_count = be32toh(stats->send_done_count);
2973 		while ((send_done_count != tx->pkt_done) ||
2974 		       (rx_done->entry[rx_done->idx].length != 0)) {
2975 			if (send_done_count != tx->pkt_done)
2976 				mxge_tx_done(ss, (int)send_done_count);
2977 			mxge_clean_rx_done(ss);
2978 			send_done_count = be32toh(stats->send_done_count);
2979 		}
2980 		if (sc->legacy_irq && mxge_deassert_wait)
2981 			wmb();
2982 	} while (*((volatile uint8_t *) &stats->valid));
2983 
2984 	/* fw link & error stats meaningful only on the first slice */
2985 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2986 		if (sc->link_state != stats->link_up) {
2987 			sc->link_state = stats->link_up;
2988 			if (sc->link_state) {
2989 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2990 				if (mxge_verbose)
2991 					device_printf(sc->dev, "link up\n");
2992 			} else {
2993 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2994 				if (mxge_verbose)
2995 					device_printf(sc->dev, "link down\n");
2996 			}
2997 			sc->need_media_probe = 1;
2998 		}
2999 		if (sc->rdma_tags_available !=
3000 		    be32toh(stats->rdma_tags_available)) {
3001 			sc->rdma_tags_available =
3002 				be32toh(stats->rdma_tags_available);
3003 			device_printf(sc->dev, "RDMA timed out! %d tags "
3004 				      "left\n", sc->rdma_tags_available);
3005 		}
3006 
3007 		if (stats->link_down) {
3008 			sc->down_cnt += stats->link_down;
3009 			sc->link_state = 0;
3010 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3011 		}
3012 	}
3013 
3014 	/* check to see if we have rx token to pass back */
3015 	if (valid & 0x1)
3016 	    *ss->irq_claim = be32toh(3);
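	/* the write to the second claim word releases the irq */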
3017 	*(ss->irq_claim + 1) = be32toh(3);
3018 }
3019 
3020 static void
3021 mxge_init(void *arg)
3022 {
3023 }
3024 
3025 
3026 
3027 static void
3028 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3029 {
3030 	struct lro_entry *lro_entry;
3031 	int i;
3032 
3033 	while (!SLIST_EMPTY(&ss->lro_free)) {
3034 		lro_entry = SLIST_FIRST(&ss->lro_free);
3035 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3036 		free(lro_entry, M_DEVBUF);
3037 	}
3038 
3039 	for (i = 0; i <= ss->rx_big.mask; i++) {
3040 		if (ss->rx_big.info[i].m == NULL)
3041 			continue;
3042 		bus_dmamap_unload(ss->rx_big.dmat,
3043 				  ss->rx_big.info[i].map);
3044 		m_freem(ss->rx_big.info[i].m);
3045 		ss->rx_big.info[i].m = NULL;
3046 	}
3047 
3048 	for (i = 0; i <= ss->rx_small.mask; i++) {
3049 		if (ss->rx_small.info[i].m == NULL)
3050 			continue;
3051 		bus_dmamap_unload(ss->rx_small.dmat,
3052 				  ss->rx_small.info[i].map);
3053 		m_freem(ss->rx_small.info[i].m);
3054 		ss->rx_small.info[i].m = NULL;
3055 	}
3056 
3057 	/* transmit ring used only on the first slice */
3058 	if (ss->tx.info == NULL)
3059 		return;
3060 
3061 	for (i = 0; i <= ss->tx.mask; i++) {
3062 		ss->tx.info[i].flag = 0;
3063 		if (ss->tx.info[i].m == NULL)
3064 			continue;
3065 		bus_dmamap_unload(ss->tx.dmat,
3066 				  ss->tx.info[i].map);
3067 		m_freem(ss->tx.info[i].m);
3068 		ss->tx.info[i].m = NULL;
3069 	}
3070 }
3071 
3072 static void
3073 mxge_free_mbufs(mxge_softc_t *sc)
3074 {
3075 	int slice;
3076 
3077 	for (slice = 0; slice < sc->num_slices; slice++)
3078 		mxge_free_slice_mbufs(&sc->ss[slice]);
3079 }
3080 
3081 static void
3082 mxge_free_slice_rings(struct mxge_slice_state *ss)
3083 {
3084 	int i;
3085 
3086 
3087 	if (ss->rx_done.entry != NULL)
3088 		mxge_dma_free(&ss->rx_done.dma);
3089 	ss->rx_done.entry = NULL;
3090 
3091 	if (ss->tx.req_bytes != NULL)
3092 		free(ss->tx.req_bytes, M_DEVBUF);
3093 	ss->tx.req_bytes = NULL;
3094 
3095 	if (ss->tx.seg_list != NULL)
3096 		free(ss->tx.seg_list, M_DEVBUF);
3097 	ss->tx.seg_list = NULL;
3098 
3099 	if (ss->rx_small.shadow != NULL)
3100 		free(ss->rx_small.shadow, M_DEVBUF);
3101 	ss->rx_small.shadow = NULL;
3102 
3103 	if (ss->rx_big.shadow != NULL)
3104 		free(ss->rx_big.shadow, M_DEVBUF);
3105 	ss->rx_big.shadow = NULL;
3106 
3107 	if (ss->tx.info != NULL) {
3108 		if (ss->tx.dmat != NULL) {
3109 			for (i = 0; i <= ss->tx.mask; i++) {
3110 				bus_dmamap_destroy(ss->tx.dmat,
3111 						   ss->tx.info[i].map);
3112 			}
3113 			bus_dma_tag_destroy(ss->tx.dmat);
3114 		}
3115 		free(ss->tx.info, M_DEVBUF);
3116 	}
3117 	ss->tx.info = NULL;
3118 
3119 	if (ss->rx_small.info != NULL) {
3120 		if (ss->rx_small.dmat != NULL) {
3121 			for (i = 0; i <= ss->rx_small.mask; i++) {
3122 				bus_dmamap_destroy(ss->rx_small.dmat,
3123 						   ss->rx_small.info[i].map);
3124 			}
3125 			bus_dmamap_destroy(ss->rx_small.dmat,
3126 					   ss->rx_small.extra_map);
3127 			bus_dma_tag_destroy(ss->rx_small.dmat);
3128 		}
3129 		free(ss->rx_small.info, M_DEVBUF);
3130 	}
3131 	ss->rx_small.info = NULL;
3132 
3133 	if (ss->rx_big.info != NULL) {
3134 		if (ss->rx_big.dmat != NULL) {
3135 			for (i = 0; i <= ss->rx_big.mask; i++) {
3136 				bus_dmamap_destroy(ss->rx_big.dmat,
3137 						   ss->rx_big.info[i].map);
3138 			}
3139 			bus_dmamap_destroy(ss->rx_big.dmat,
3140 					   ss->rx_big.extra_map);
3141 			bus_dma_tag_destroy(ss->rx_big.dmat);
3142 		}
3143 		free(ss->rx_big.info, M_DEVBUF);
3144 	}
3145 	ss->rx_big.info = NULL;
3146 }
3147 
3148 static void
3149 mxge_free_rings(mxge_softc_t *sc)
3150 {
3151 	int slice;
3152 
3153 	for (slice = 0; slice < sc->num_slices; slice++)
3154 		mxge_free_slice_rings(&sc->ss[slice]);
3155 }
3156 
3157 static int
3158 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3159 		       int tx_ring_entries)
3160 {
3161 	mxge_softc_t *sc = ss->sc;
3162 	size_t bytes;
3163 	int err, i;
3164 
3165 	err = ENOMEM;
3166 
3167 	/* allocate per-slice receive resources */
3168 
3169 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3170 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
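	/* the rx completion ring covers both the small and big rings,
	   hence twice the entries */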
3171 
3172 	/* allocate the rx shadow rings */
3173 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3174 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3175 	if (ss->rx_small.shadow == NULL)
3176 		return err;
3177 
3178 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3179 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3180 	if (ss->rx_big.shadow == NULL)
3181 		return err;
3182 
3183 	/* allocate the rx host info rings */
3184 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3185 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3186 	if (ss->rx_small.info == NULL)
3187 		return err;
3188 
3189 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3190 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3191 	if (ss->rx_big.info == NULL)
3192 		return err;
3193 
3194 	/* allocate the rx busdma resources */
3195 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3196 				 1,			/* alignment */
3197 				 4096,			/* boundary */
3198 				 BUS_SPACE_MAXADDR,	/* low */
3199 				 BUS_SPACE_MAXADDR,	/* high */
3200 				 NULL, NULL,		/* filter */
3201 				 MHLEN,			/* maxsize */
3202 				 1,			/* num segs */
3203 				 MHLEN,			/* maxsegsize */
3204 				 BUS_DMA_ALLOCNOW,	/* flags */
3205 				 NULL, NULL,		/* lock */
3206 				 &ss->rx_small.dmat);	/* tag */
3207 	if (err != 0) {
3208 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3209 			      err);
3210 		return err;
3211 	}
3212 
3213 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3214 				 1,			/* alignment */
3215 #if MXGE_VIRT_JUMBOS
3216 				 4096,			/* boundary */
3217 #else
3218 				 0,			/* boundary */
3219 #endif
3220 				 BUS_SPACE_MAXADDR,	/* low */
3221 				 BUS_SPACE_MAXADDR,	/* high */
3222 				 NULL, NULL,		/* filter */
3223 				 3*4096,		/* maxsize */
3224 #if MXGE_VIRT_JUMBOS
3225 				 3,			/* num segs */
3226 				 4096,			/* maxsegsize*/
3227 #else
3228 				 1,			/* num segs */
3229 				 MJUM9BYTES,		/* maxsegsize*/
3230 #endif
3231 				 BUS_DMA_ALLOCNOW,	/* flags */
3232 				 NULL, NULL,		/* lock */
3233 				 &ss->rx_big.dmat);	/* tag */
3234 	if (err != 0) {
3235 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3236 			      err);
3237 		return err;
3238 	}
3239 	for (i = 0; i <= ss->rx_small.mask; i++) {
3240 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3241 					&ss->rx_small.info[i].map);
3242 		if (err != 0) {
3243 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3244 				      err);
3245 			return err;
3246 		}
3247 	}
3248 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3249 				&ss->rx_small.extra_map);
3250 	if (err != 0) {
3251 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3252 			      err);
3253 		return err;
3254 	}
3255 
3256 	for (i = 0; i <= ss->rx_big.mask; i++) {
3257 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3258 					&ss->rx_big.info[i].map);
3259 		if (err != 0) {
3260 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3261 				      err);
3262 			return err;
3263 		}
3264 	}
3265 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3266 				&ss->rx_big.extra_map);
3267 	if (err != 0) {
3268 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3269 			      err);
3270 		return err;
3271 	}
3272 
3273 	/* now allocate TX resources */
3274 
3275 #ifndef IFNET_BUF_RING
3276 	/* only use a single TX ring for now */
3277 	if (ss != ss->sc->ss)
3278 		return 0;
3279 #endif
3280 
3281 	ss->tx.mask = tx_ring_entries - 1;
3282 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3283 
3284 
3285 	/* allocate the tx request copy block */
3286 	bytes = 8 +
3287 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3288 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3289 	if (ss->tx.req_bytes == NULL)
3290 		return err;
3291 	/* ensure req_list entries are aligned to 8 bytes */
3292 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3293 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3294 
3295 	/* allocate the tx busdma segment list */
3296 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3297 	ss->tx.seg_list = (bus_dma_segment_t *)
3298 		malloc(bytes, M_DEVBUF, M_WAITOK);
3299 	if (ss->tx.seg_list == NULL)
3300 		return err;
3301 
3302 	/* allocate the tx host info ring */
3303 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3304 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3305 	if (ss->tx.info == NULL)
3306 		return err;
3307 
3308 	/* allocate the tx busdma resources */
3309 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3310 				 1,			/* alignment */
3311 				 sc->tx_boundary,	/* boundary */
3312 				 BUS_SPACE_MAXADDR,	/* low */
3313 				 BUS_SPACE_MAXADDR,	/* high */
3314 				 NULL, NULL,		/* filter */
3315 				 65536 + 256,		/* maxsize */
3316 				 ss->tx.max_desc - 2,	/* num segs */
3317 				 sc->tx_boundary,	/* maxsegsz */
3318 				 BUS_DMA_ALLOCNOW,	/* flags */
3319 				 NULL, NULL,		/* lock */
3320 				 &ss->tx.dmat);		/* tag */
3321 
3322 	if (err != 0) {
3323 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3324 			      err);
3325 		return err;
3326 	}
3327 
3328 	/* now use these tags to setup dmamaps for each slot
3329 	   in the ring */
3330 	for (i = 0; i <= ss->tx.mask; i++) {
3331 		err = bus_dmamap_create(ss->tx.dmat, 0,
3332 					&ss->tx.info[i].map);
3333 		if (err != 0) {
3334 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3335 				      err);
3336 			return err;
3337 		}
3338 	}
3339 	return 0;
3340 
3341 }
3342 
3343 static int
3344 mxge_alloc_rings(mxge_softc_t *sc)
3345 {
3346 	mxge_cmd_t cmd;
3347 	int tx_ring_size;
3348 	int tx_ring_entries, rx_ring_entries;
3349 	int err, slice;
3350 
3351 	/* get ring sizes */
3352 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3353 	tx_ring_size = cmd.data0;
3354 	if (err != 0) {
3355 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3356 		goto abort;
3357 	}
3358 
3359 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3360 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3361 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3362 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3363 	IFQ_SET_READY(&sc->ifp->if_snd);
3364 
3365 	for (slice = 0; slice < sc->num_slices; slice++) {
3366 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3367 					     rx_ring_entries,
3368 					     tx_ring_entries);
3369 		if (err != 0)
3370 			goto abort;
3371 	}
3372 	return 0;
3373 
3374 abort:
3375 	mxge_free_rings(sc);
3376 	return err;
3377 
3378 }
3379 
3380 
3381 static void
3382 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3383 {
3384 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
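	/* e.g. a 9000-byte MTU yields bufsize 9020 (14-byte ethernet
	 * header + 4-byte vlan + 2-byte pad), which overflows both
	 * MCLBYTES and MJUMPAGESIZE and lands on 9KB clusters below */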
3385 
3386 	if (bufsize < MCLBYTES) {
3387 		/* easy, everything fits in a single buffer */
3388 		*big_buf_size = MCLBYTES;
3389 		*cl_size = MCLBYTES;
3390 		*nbufs = 1;
3391 		return;
3392 	}
3393 
3394 	if (bufsize < MJUMPAGESIZE) {
3395 		/* still easy, everything still fits in a single buffer */
3396 		*big_buf_size = MJUMPAGESIZE;
3397 		*cl_size = MJUMPAGESIZE;
3398 		*nbufs = 1;
3399 		return;
3400 	}
3401 #if MXGE_VIRT_JUMBOS
3402 	/* now we need to use virtually contiguous buffers */
3403 	*cl_size = MJUM9BYTES;
3404 	*big_buf_size = 4096;
3405 	*nbufs = mtu / 4096 + 1;
3406 	/* needs to be a power of two, so round up */
3407 	if (*nbufs == 3)
3408 		*nbufs = 4;
3409 #else
3410 	*cl_size = MJUM9BYTES;
3411 	*big_buf_size = MJUM9BYTES;
3412 	*nbufs = 1;
3413 #endif
3414 }
3415 
3416 static int
3417 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3418 {
3419 	mxge_softc_t *sc;
3420 	mxge_cmd_t cmd;
3421 	bus_dmamap_t map;
3422 	struct lro_entry *lro_entry;
3423 	int err, i, slice;
3424 
3425 
3426 	sc = ss->sc;
3427 	slice = ss - sc->ss;
3428 
3429 	SLIST_INIT(&ss->lro_free);
3430 	SLIST_INIT(&ss->lro_active);
3431 
3432 	for (i = 0; i < sc->lro_cnt; i++) {
3433 		lro_entry = (struct lro_entry *)
3434 			malloc(sizeof (*lro_entry), M_DEVBUF,
3435 			       M_NOWAIT | M_ZERO);
3436 		if (lro_entry == NULL) {
3437 			sc->lro_cnt = i;
3438 			break;
3439 		}
3440 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3441 	}
3442 	/* get the lanai pointers to the send and receive rings */
3443 
3444 	err = 0;
3445 #ifndef IFNET_BUF_RING
3446 	/* We currently only send from the first slice */
3447 	if (slice == 0) {
3448 #endif
3449 		cmd.data0 = slice;
3450 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3451 		ss->tx.lanai =
3452 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3453 		ss->tx.send_go = (volatile uint32_t *)
3454 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3455 		ss->tx.send_stop = (volatile uint32_t *)
3456 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3457 #ifndef IFNET_BUF_RING
3458 	}
3459 #endif
3460 	cmd.data0 = slice;
3461 	err |= mxge_send_cmd(sc,
3462 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3463 	ss->rx_small.lanai =
3464 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3465 	cmd.data0 = slice;
3466 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3467 	ss->rx_big.lanai =
3468 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3469 
3470 	if (err != 0) {
3471 		device_printf(sc->dev,
3472 			      "failed to get ring sizes or locations\n");
3473 		return EIO;
3474 	}
3475 
3476 	/* stock receive rings */
3477 	for (i = 0; i <= ss->rx_small.mask; i++) {
3478 		map = ss->rx_small.info[i].map;
3479 		err = mxge_get_buf_small(ss, map, i);
3480 		if (err) {
3481 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3482 				      i, ss->rx_small.mask + 1);
3483 			return ENOMEM;
3484 		}
3485 	}
3486 	for (i = 0; i <= ss->rx_big.mask; i++) {
3487 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3488 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3489 	}
3490 	ss->rx_big.nbufs = nbufs;
3491 	ss->rx_big.cl_size = cl_size;
3492 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3493 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3494 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3495 		map = ss->rx_big.info[i].map;
3496 		err = mxge_get_buf_big(ss, map, i);
3497 		if (err) {
3498 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3499 				      i, ss->rx_big.mask + 1);
3500 			return ENOMEM;
3501 		}
3502 	}
3503 	return 0;
3504 }
3505 
3506 static int
3507 mxge_open(mxge_softc_t *sc)
3508 {
3509 	mxge_cmd_t cmd;
3510 	int err, big_bytes, nbufs, slice, cl_size, i;
3511 	bus_addr_t bus;
3512 	volatile uint8_t *itable;
3513 	struct mxge_slice_state *ss;
3514 
3515 	/* Copy the MAC address in case it was overridden */
3516 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3517 
3518 	err = mxge_reset(sc, 1);
3519 	if (err != 0) {
3520 		device_printf(sc->dev, "failed to reset\n");
3521 		return EIO;
3522 	}
3523 
3524 	if (sc->num_slices > 1) {
3525 		/* setup the indirection table */
3526 		cmd.data0 = sc->num_slices;
3527 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3528 				    &cmd);
3529 
3530 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3531 				     &cmd);
3532 		if (err != 0) {
3533 			device_printf(sc->dev,
3534 				      "failed to setup rss tables\n");
3535 			return err;
3536 		}
3537 
3538 		/* just enable an identity mapping */
3539 		itable = sc->sram + cmd.data0;
3540 		for (i = 0; i < sc->num_slices; i++)
3541 			itable[i] = (uint8_t)i;
3542 
3543 		cmd.data0 = 1;
3544 		cmd.data1 = mxge_rss_hash_type;
3545 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3546 		if (err != 0) {
3547 			device_printf(sc->dev, "failed to enable slices\n");
3548 			return err;
3549 		}
3550 	}
3551 
3552 
3553 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3554 
3555 	cmd.data0 = nbufs;
3556 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3557 			    &cmd);
3558 	/* error is only meaningful if we're trying to set
3559 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3560 	if (err && nbufs > 1) {
3561 		device_printf(sc->dev,
3562 			      "Failed to set always-use-n to %d\n",
3563 			      nbufs);
3564 		return EIO;
3565 	}
3566 	/* Give the firmware the mtu and the big and small buffer
3567 	   sizes.  The firmware wants the big buf size to be a power
3568 	   of two. Luckily, FreeBSD's clusters are powers of two */
3569 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3570 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3571 	cmd.data0 = MHLEN - MXGEFW_PAD;
3572 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3573 			     &cmd);
3574 	cmd.data0 = big_bytes;
3575 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3576 
3577 	if (err != 0) {
3578 		device_printf(sc->dev, "failed to setup params\n");
3579 		goto abort;
3580 	}
3581 
3582 	/* Now give the firmware the pointer to the stats block */
3583 	for (slice = 0;
3584 #ifdef IFNET_BUF_RING
3585 	     slice < sc->num_slices;
3586 #else
3587 	     slice < 1;
3588 #endif
3589 	     slice++) {
3590 		ss = &sc->ss[slice];
3591 		cmd.data0 =
3592 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3593 		cmd.data1 =
3594 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3595 		cmd.data2 = sizeof(struct mcp_irq_data);
3596 		cmd.data2 |= (slice << 16);
3597 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3598 	}
3599 
3600 	if (err != 0) {
3601 		bus = sc->ss->fw_stats_dma.bus_addr;
3602 		bus += offsetof(struct mcp_irq_data, send_done_count);
3603 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3604 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3605 		err = mxge_send_cmd(sc,
3606 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3607 				    &cmd);
3608 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3609 		sc->fw_multicast_support = 0;
3610 	} else {
3611 		sc->fw_multicast_support = 1;
3612 	}
3613 
3614 	if (err != 0) {
3615 		device_printf(sc->dev, "failed to setup params\n");
3616 		goto abort;
3617 	}
3618 
3619 	for (slice = 0; slice < sc->num_slices; slice++) {
3620 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3621 		if (err != 0) {
3622 			device_printf(sc->dev, "couldn't open slice %d\n",
3623 				      slice);
3624 			goto abort;
3625 		}
3626 	}
3627 
3628 	/* Finally, start the firmware running */
3629 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3630 	if (err) {
3631 		device_printf(sc->dev, "Couldn't bring up link\n");
3632 		goto abort;
3633 	}
3634 #ifdef IFNET_BUF_RING
3635 	for (slice = 0; slice < sc->num_slices; slice++) {
3636 		ss = &sc->ss[slice];
3637 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3638 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3639 	}
3640 #endif
3641 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3642 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3643 
3644 	return 0;
3645 
3646 
3647 abort:
3648 	mxge_free_mbufs(sc);
3649 
3650 	return err;
3651 }
3652 
3653 static int
3654 mxge_close(mxge_softc_t *sc, int down)
3655 {
3656 	mxge_cmd_t cmd;
3657 	int err, old_down_cnt;
3658 #ifdef IFNET_BUF_RING
3659 	struct mxge_slice_state *ss;
3660 	int slice;
3661 #endif
3662 
3663 #ifdef IFNET_BUF_RING
3664 	for (slice = 0; slice < sc->num_slices; slice++) {
3665 		ss = &sc->ss[slice];
3666 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3667 	}
3668 #endif
3669 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3670 	if (!down) {
3671 		old_down_cnt = sc->down_cnt;
3672 		wmb();
3673 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3674 		if (err) {
3675 			device_printf(sc->dev,
3676 				      "Couldn't bring down link\n");
3677 		}
3678 		if (old_down_cnt == sc->down_cnt) {
3679 			/* wait for down irq */
3680 			DELAY(10 * sc->intr_coal_delay);
3681 		}
3682 		wmb();
3683 		if (old_down_cnt == sc->down_cnt) {
3684 			device_printf(sc->dev, "never got down irq\n");
3685 		}
3686 	}
3687 	mxge_free_mbufs(sc);
3688 
3689 	return 0;
3690 }
3691 
3692 static void
3693 mxge_setup_cfg_space(mxge_softc_t *sc)
3694 {
3695 	device_t dev = sc->dev;
3696 	int reg;
3697 	uint16_t cmd, lnk, pectl;
3698 
3699 	/* find the PCIe link width and set max read request to 4KB */
3700 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3701 		lnk = pci_read_config(dev, reg + 0x12, 2);
3702 		sc->link_width = (lnk >> 4) & 0x3f;
3703 
3704 		if (sc->pectl == 0) {
3705 			pectl = pci_read_config(dev, reg + 0x8, 2);
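			/* max read request size is bits 14:12 of the
			   PCIe device control register; 5 = 4096B */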
3706 			pectl = (pectl & ~0x7000) | (5 << 12);
3707 			pci_write_config(dev, reg + 0x8, pectl, 2);
3708 			sc->pectl = pectl;
3709 		} else {
3710 			/* restore saved pectl after watchdog reset */
3711 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3712 		}
3713 	}
3714 
3715 	/* Enable DMA and Memory space access */
3716 	pci_enable_busmaster(dev);
3717 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3718 	cmd |= PCIM_CMD_MEMEN;
3719 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3720 }
3721 
3722 static uint32_t
3723 mxge_read_reboot(mxge_softc_t *sc)
3724 {
3725 	device_t dev = sc->dev;
3726 	uint32_t vs;
3727 
3728 	/* find the vendor specific offset */
3729 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3730 		device_printf(sc->dev,
3731 			      "could not find vendor specific offset\n");
3732 		return (uint32_t)-1;
3733 	}
3734 	/* enable read32 mode */
3735 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3736 	/* tell NIC which register to read */
3737 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3738 	return (pci_read_config(dev, vs + 0x14, 4));
3739 }
3740 
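/*
 * Recover from a NIC hardware fault.  If config space reads as all
 * ones the NIC may still be mid-reboot, so allow it up to 100ms to
 * come back.  Once a reboot is confirmed (bus mastering cleared),
 * quiesce TX, restore PCI config space, reload the firmware and
 * re-open the interface.
 */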
3741 static void
3742 mxge_watchdog_reset(mxge_softc_t *sc)
3743 {
3744 	struct pci_devinfo *dinfo;
3745 	struct mxge_slice_state *ss;
3746 	int err, running, s, num_tx_slices = 1;
3747 	uint32_t reboot;
3748 	uint16_t cmd;
3749 
3750 	err = ENXIO;
3751 
3752 	device_printf(sc->dev, "Watchdog reset!\n");
3753 
3754 	/*
3755 	 * check to see if the NIC rebooted.  If it did, then all of
3756 	 * PCI config space has been reset, and things like the
3757 	 * busmaster bit will be zero.  If this is the case, then we
3758 	 * must restore PCI config space before the NIC can be used
3759 	 * again
3760 	 */
3761 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3762 	if (cmd == 0xffff) {
3763 		/*
3764 		 * maybe the watchdog caught the NIC rebooting; wait
3765 		 * up to 100ms for it to finish.  If it does not come
3766 		 * back, then give up
3767 		 */
3768 		DELAY(1000*100);
3769 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3770 		if (cmd == 0xffff) {
3771 			device_printf(sc->dev, "NIC disappeared!\n");
3772 		}
3773 	}
3774 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3775 		/* print the reboot status */
3776 		reboot = mxge_read_reboot(sc);
3777 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3778 			      reboot);
3779 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3780 		if (running) {
3781 
3782 			/*
3783 			 * quiesce NIC so that TX routines will not try to
3784 			 * xmit after restoration of BAR
3785 			 */
3786 
3787 			/* Mark the link as down */
3788 			if (sc->link_state) {
3789 				sc->link_state = 0;
3790 				if_link_state_change(sc->ifp,
3791 						     LINK_STATE_DOWN);
3792 			}
3793 #ifdef IFNET_BUF_RING
3794 			num_tx_slices = sc->num_slices;
3795 #endif
3796 			/* grab all TX locks to ensure no transmits occur */
3797 			for (s = 0; s < num_tx_slices; s++) {
3798 				ss = &sc->ss[s];
3799 				mtx_lock(&ss->tx.mtx);
3800 			}
3801 			mxge_close(sc, 1);
3802 		}
3803 		/* restore PCI configuration space */
3804 		dinfo = device_get_ivars(sc->dev);
3805 		pci_cfg_restore(sc->dev, dinfo);
3806 
3807 		/* and redo any changes we made to our config space */
3808 		mxge_setup_cfg_space(sc);
3809 
3810 		/* reload f/w */
3811 		err = mxge_load_firmware(sc, 0);
3812 		if (err) {
3813 			device_printf(sc->dev,
3814 				      "Unable to re-load f/w\n");
3815 		}
3816 		if (running) {
3817 			if (!err)
3818 				err = mxge_open(sc);
3819 			/* release all TX locks */
3820 			for (s = 0; s < num_tx_slices; s++) {
3821 				ss = &sc->ss[s];
3822 #ifdef IFNET_BUF_RING
3823 				mxge_start_locked(ss);
3824 #endif
3825 				mtx_unlock(&ss->tx.mtx);
3826 			}
3827 		}
3828 		sc->watchdog_resets++;
3829 	} else {
3830 		device_printf(sc->dev,
3831 			      "NIC did not reboot, not resetting\n");
3832 		err = 0;
3833 	}
3834 	if (err) {
3835 		device_printf(sc->dev, "watchdog reset failed\n");
3836 	} else {
3837 		if (sc->dying == 2)
3838 			sc->dying = 0;
3839 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3840 	}
3841 }
3842 
3843 static void
3844 mxge_watchdog_task(void *arg, int pending)
3845 {
3846 	mxge_softc_t *sc = arg;
3847 
3848 
3849 	mtx_lock(&sc->driver_mtx);
3850 	mxge_watchdog_reset(sc);
3851 	mtx_unlock(&sc->driver_mtx);
3852 }
3853 
3854 static void
3855 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3856 {
3857 	tx = &sc->ss[slice].tx;
3858 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3859 	device_printf(sc->dev,
3860 		      "tx.req=%d tx.done=%d tx.queue_active=%d\n",
3861 		      tx->req, tx->done, tx->queue_active);
3862 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3863 		      tx->activate, tx->deactivate);
3864 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3865 		      tx->pkt_done,
3866 		      be32toh(sc->ss->fw_stats->send_done_count));
3867 }
3868 
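/*
 * Per-tick TX health check.  A slice looks stuck when it has pending
 * transmits but its done counter has not moved since the previous
 * tick.  The firmware's dropped_pause counter distinguishes a link
 * blocked by flow control from a genuine hang; only the latter
 * schedules the watchdog reset task.
 */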
3869 static int
3870 mxge_watchdog(mxge_softc_t *sc)
3871 {
3872 	mxge_tx_ring_t *tx;
3873 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3874 	int i, err = 0;
3875 
3876 	/* see if we have outstanding transmits, which
3877 	   have been pending for more than mxge_ticks */
3878 	for (i = 0;
3879 #ifdef IFNET_BUF_RING
3880 	     (i < sc->num_slices) && (err == 0);
3881 #else
3882 	     (i < 1) && (err == 0);
3883 #endif
3884 	     i++) {
3885 		tx = &sc->ss[i].tx;
3886 		if (tx->req != tx->done &&
3887 		    tx->watchdog_req != tx->watchdog_done &&
3888 		    tx->done == tx->watchdog_done) {
3889 			/* check for pause blocking before resetting */
3890 			if (tx->watchdog_rx_pause == rx_pause) {
3891 				mxge_warn_stuck(sc, tx, i);
3892 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3893 				return (ENXIO);
3894 			}
3895 			else
3896 				device_printf(sc->dev, "Flow control blocking "
3897 					      "xmits, check link partner\n");
3898 		}
3899 
3900 		tx->watchdog_req = tx->req;
3901 		tx->watchdog_done = tx->done;
3902 		tx->watchdog_rx_pause = rx_pause;
3903 	}
3904 
3905 	if (sc->need_media_probe)
3906 		mxge_media_probe(sc);
3907 	return (err);
3908 }
3909 
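/*
 * Fold the per-slice counters into the ifnet statistics and return
 * how many packets moved since the previous call; mxge_tick() uses
 * the return value to detect an idle NIC.
 */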
3910 static u_long
3911 mxge_update_stats(mxge_softc_t *sc)
3912 {
3913 	struct mxge_slice_state *ss;
3914 	u_long pkts = 0;
3915 	u_long ipackets = 0;
3916 	u_long opackets = 0;
3917 #ifdef IFNET_BUF_RING
3918 	u_long obytes = 0;
3919 	u_long omcasts = 0;
3920 	u_long odrops = 0;
3921 #endif
3922 	u_long oerrors = 0;
3923 	int slice;
3924 
3925 	for (slice = 0; slice < sc->num_slices; slice++) {
3926 		ss = &sc->ss[slice];
3927 		ipackets += ss->ipackets;
3928 		opackets += ss->opackets;
3929 #ifdef IFNET_BUF_RING
3930 		obytes += ss->obytes;
3931 		omcasts += ss->omcasts;
3932 		odrops += ss->tx.br->br_drops;
3933 #endif
3934 		oerrors += ss->oerrors;
3935 	}
3936 	pkts = (ipackets - sc->ifp->if_ipackets);
3937 	pkts += (opackets - sc->ifp->if_opackets);
3938 	sc->ifp->if_ipackets = ipackets;
3939 	sc->ifp->if_opackets = opackets;
3940 #ifdef IFNET_BUF_RING
3941 	sc->ifp->if_obytes = obytes;
3942 	sc->ifp->if_omcasts = omcasts;
3943 	sc->ifp->if_snd.ifq_drops = odrops;
3944 #endif
3945 	sc->ifp->if_oerrors = oerrors;
3946 	return pkts;
3947 }
3948 
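/*
 * Periodic housekeeping: refresh interface statistics, run the TX
 * watchdog every fourth tick, and, when the NIC is idle, poll config
 * space so a hardware fault is noticed even without traffic.  The
 * callout interval is stretched 4x while idle.
 */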
3949 static void
3950 mxge_tick(void *arg)
3951 {
3952 	mxge_softc_t *sc = arg;
3953 	u_long pkts = 0;
3954 	int err = 0;
3955 	int running, ticks;
3956 	uint16_t cmd;
3957 
3958 	ticks = mxge_ticks;
3959 	mtx_lock(&sc->driver_mtx);
3960 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3961 	mtx_unlock(&sc->driver_mtx);
3962 	if (running) {
3963 		/* aggregate stats from different slices */
3964 		pkts = mxge_update_stats(sc);
3965 		if (!sc->watchdog_countdown) {
3966 			err = mxge_watchdog(sc);
3967 			sc->watchdog_countdown = 4;
3968 		}
3969 		sc->watchdog_countdown--;
3970 	}
3971 	if (pkts == 0) {
3972 		/* ensure NIC did not suffer h/w fault while idle */
3973 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3974 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3975 			sc->dying = 2;
3976 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3977 			err = ENXIO;
3978 		}
3979 		/* look less often if NIC is idle */
3980 		ticks *= 4;
3981 	}
3982 
3983 	if (err == 0)
3984 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3985 
3986 }
3987 
3988 static int
3989 mxge_media_change(struct ifnet *ifp)
3990 {
3991 	return EINVAL;
3992 }
3993 
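/*
 * Validate and apply a new MTU.  The on-wire size includes the
 * Ethernet and VLAN headers; if re-opening with the new MTU fails,
 * fall back to the old one.
 */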
3994 static int
3995 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3996 {
3997 	struct ifnet *ifp = sc->ifp;
3998 	int real_mtu, old_mtu;
3999 	int err = 0;
4000 
4001 
4002 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4003 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4004 		return EINVAL;
4005 	mtx_lock(&sc->driver_mtx);
4006 	old_mtu = ifp->if_mtu;
4007 	ifp->if_mtu = mtu;
4008 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4009 		mxge_close(sc, 0);
4010 		err = mxge_open(sc);
4011 		if (err != 0) {
4012 			ifp->if_mtu = old_mtu;
4013 			mxge_close(sc, 0);
4014 			(void) mxge_open(sc);
4015 		}
4016 	}
4017 	mtx_unlock(&sc->driver_mtx);
4018 	return err;
4019 }
4020 
4021 static void
4022 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4023 {
4024 	mxge_softc_t *sc = ifp->if_softc;
4025 
4026 
4027 	if (sc == NULL)
4028 		return;
4029 	ifmr->ifm_status = IFM_AVALID;
4030 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4031 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
4032 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
4033 }
4034 
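/*
 * ifnet ioctl handler: MTU and interface flag changes, multicast
 * list updates, capability toggles (TX/RX checksum, TSO, LRO, VLAN
 * tagging) and media queries, mostly serialized under driver_mtx.
 */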
4035 static int
4036 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4037 {
4038 	mxge_softc_t *sc = ifp->if_softc;
4039 	struct ifreq *ifr = (struct ifreq *)data;
4040 	int err, mask;
4041 
4042 	err = 0;
4043 	switch (command) {
4044 	case SIOCSIFADDR:
4045 	case SIOCGIFADDR:
4046 		err = ether_ioctl(ifp, command, data);
4047 		break;
4048 
4049 	case SIOCSIFMTU:
4050 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4051 		break;
4052 
4053 	case SIOCSIFFLAGS:
4054 		mtx_lock(&sc->driver_mtx);
4055 		if (sc->dying) {
4056 			mtx_unlock(&sc->driver_mtx);
4057 			return EINVAL;
4058 		}
4059 		if (ifp->if_flags & IFF_UP) {
4060 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4061 				err = mxge_open(sc);
4062 			} else {
4063 				/* take care of promisc and allmulti
4064 				   flag changes */
4065 				mxge_change_promisc(sc,
4066 						    ifp->if_flags & IFF_PROMISC);
4067 				mxge_set_multicast_list(sc);
4068 			}
4069 		} else {
4070 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4071 				mxge_close(sc, 0);
4072 			}
4073 		}
4074 		mtx_unlock(&sc->driver_mtx);
4075 		break;
4076 
4077 	case SIOCADDMULTI:
4078 	case SIOCDELMULTI:
4079 		mtx_lock(&sc->driver_mtx);
4080 		mxge_set_multicast_list(sc);
4081 		mtx_unlock(&sc->driver_mtx);
4082 		break;
4083 
4084 	case SIOCSIFCAP:
4085 		mtx_lock(&sc->driver_mtx);
4086 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4087 		if (mask & IFCAP_TXCSUM) {
4088 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4089 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4090 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4091 						      | CSUM_TSO);
4092 			} else {
4093 				ifp->if_capenable |= IFCAP_TXCSUM;
4094 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4095 			}
4096 		} else if (mask & IFCAP_RXCSUM) {
4097 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4098 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4099 				sc->csum_flag = 0;
4100 			} else {
4101 				ifp->if_capenable |= IFCAP_RXCSUM;
4102 				sc->csum_flag = 1;
4103 			}
4104 		}
4105 		if (mask & IFCAP_TSO4) {
4106 			if (IFCAP_TSO4 & ifp->if_capenable) {
4107 				ifp->if_capenable &= ~IFCAP_TSO4;
4108 				ifp->if_hwassist &= ~CSUM_TSO;
4109 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4110 				ifp->if_capenable |= IFCAP_TSO4;
4111 				ifp->if_hwassist |= CSUM_TSO;
4112 			} else {
4113 				printf("mxge requires tx checksum offload"
4114 				       " be enabled to use TSO\n");
4115 				err = EINVAL;
4116 			}
4117 		}
4118 		if (mask & IFCAP_LRO) {
4119 			if (IFCAP_LRO & ifp->if_capenable)
4120 				err = mxge_change_lro_locked(sc, 0);
4121 			else
4122 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4123 		}
4124 		if (mask & IFCAP_VLAN_HWTAGGING)
4125 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4126 		mtx_unlock(&sc->driver_mtx);
4127 		VLAN_CAPABILITIES(ifp);
4128 
4129 		break;
4130 
4131 	case SIOCGIFMEDIA:
4132 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4133 				    &sc->media, command);
4134 		break;
4135 
4136 	default:
4137 		err = ENOTTY;
4138 	}
4139 	return err;
4140 }
4141 
4142 static void
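/*
 * Fetch the hw.mxge.* loader tunables and clamp them to sane ranges
 * (interrupt coalescing delay, tick interval, RSS hash type, initial
 * MTU, throttle).
 */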
4143 mxge_fetch_tunables(mxge_softc_t *sc)
4144 {
4145 
4146 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4147 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4148 			  &mxge_flow_control);
4149 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4150 			  &mxge_intr_coal_delay);
4151 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4152 			  &mxge_nvidia_ecrc_enable);
4153 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4154 			  &mxge_force_firmware);
4155 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4156 			  &mxge_deassert_wait);
4157 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4158 			  &mxge_verbose);
4159 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4160 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4161 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4162 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4164 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4165 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4166 	if (sc->lro_cnt != 0)
4167 		mxge_lro_cnt = sc->lro_cnt;
4168 
4169 	if (bootverbose)
4170 		mxge_verbose = 1;
4171 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4172 		mxge_intr_coal_delay = 30;
4173 	if (mxge_ticks == 0)
4174 		mxge_ticks = hz / 2;
4175 	sc->pause = mxge_flow_control;
4176 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4177 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4178 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4179 	}
4180 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4181 	    mxge_initial_mtu < ETHER_MIN_LEN)
4182 		mxge_initial_mtu = ETHERMTU_JUMBO;
4183 
4184 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4185 		mxge_throttle = MXGE_MAX_THROTTLE;
4186 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4187 		mxge_throttle = MXGE_MIN_THROTTLE;
4188 	sc->throttle = mxge_throttle;
4189 }
4190 
4191 
4192 static void
4193 mxge_free_slices(mxge_softc_t *sc)
4194 {
4195 	struct mxge_slice_state *ss;
4196 	int i;
4197 
4198 
4199 	if (sc->ss == NULL)
4200 		return;
4201 
4202 	for (i = 0; i < sc->num_slices; i++) {
4203 		ss = &sc->ss[i];
4204 		if (ss->fw_stats != NULL) {
4205 			mxge_dma_free(&ss->fw_stats_dma);
4206 			ss->fw_stats = NULL;
4207 #ifdef IFNET_BUF_RING
4208 			if (ss->tx.br != NULL) {
4209 				drbr_free(ss->tx.br, M_DEVBUF);
4210 				ss->tx.br = NULL;
4211 			}
4212 #endif
4213 			mtx_destroy(&ss->tx.mtx);
4214 		}
4215 		if (ss->rx_done.entry != NULL) {
4216 			mxge_dma_free(&ss->rx_done.dma);
4217 			ss->rx_done.entry = NULL;
4218 		}
4219 	}
4220 	free(sc->ss, M_DEVBUF);
4221 	sc->ss = NULL;
4222 }
4223 
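/*
 * Allocate per-slice state: an rx completion queue sized to twice
 * the firmware receive ring and, for slices that carry TX (all of
 * them with IFNET_BUF_RING, otherwise just slice 0), the DMA-able
 * firmware stats block, the TX mutex and a 2048-entry buf_ring.
 */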
4224 static int
4225 mxge_alloc_slices(mxge_softc_t *sc)
4226 {
4227 	mxge_cmd_t cmd;
4228 	struct mxge_slice_state *ss;
4229 	size_t bytes;
4230 	int err, i, max_intr_slots;
4231 
4232 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4233 	if (err != 0) {
4234 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4235 		return err;
4236 	}
4237 	sc->rx_ring_size = cmd.data0;
4238 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4239 
4240 	bytes = sizeof (*sc->ss) * sc->num_slices;
4241 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4242 	if (sc->ss == NULL)
4243 		return (ENOMEM);
4244 	for (i = 0; i < sc->num_slices; i++) {
4245 		ss = &sc->ss[i];
4246 
4247 		ss->sc = sc;
4248 
4249 		/* allocate per-slice rx interrupt queues */
4250 
4251 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4252 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4253 		if (err != 0)
4254 			goto abort;
4255 		ss->rx_done.entry = ss->rx_done.dma.addr;
4256 		bzero(ss->rx_done.entry, bytes);
4257 
4258 		/*
4259 		 * allocate the per-slice firmware stats; stats
4260 		 * (including tx) are used only on the first
4261 		 * slice for now
4262 		 */
4263 #ifndef IFNET_BUF_RING
4264 		if (i > 0)
4265 			continue;
4266 #endif
4267 
4268 		bytes = sizeof (*ss->fw_stats);
4269 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4270 				     sizeof (*ss->fw_stats), 64);
4271 		if (err != 0)
4272 			goto abort;
4273 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4274 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4275 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4276 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4277 #ifdef IFNET_BUF_RING
4278 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4279 					   &ss->tx.mtx);
4280 #endif
4281 	}
4282 
4283 	return (0);
4284 
4285 abort:
4286 	mxge_free_slices(sc);
4287 	return (ENOMEM);
4288 }
4289 
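/*
 * Choose the number of slices: load the RSS firmware, ask it for the
 * maximum RSS queue count, then clamp to the available MSI-X vectors
 * and to hw.mxge.max_slices (or mp_ncpus when the tunable is -1),
 * rounding down to a power of two.  On any failure fall back to the
 * original firmware and a single slice.
 */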
4290 static void
4291 mxge_slice_probe(mxge_softc_t *sc)
4292 {
4293 	mxge_cmd_t cmd;
4294 	char *old_fw;
4295 	int msix_cnt, status, max_intr_slots;
4296 
4297 	sc->num_slices = 1;
4298 	/*
4299 	 *  don't enable multiple slices if they are disabled by the
4300 	 *  hw.mxge.max_slices tunable, or if this is not an SMP system
4301 	 */
4302 
4303 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4304 		return;
4305 
4306 	/* see how many MSI-X interrupts are available */
4307 	msix_cnt = pci_msix_count(sc->dev);
4308 	if (msix_cnt < 2)
4309 		return;
4310 
4311 	/* now load the slice-aware firmware to see what it supports */
4312 	old_fw = sc->fw_name;
4313 	if (old_fw == mxge_fw_aligned)
4314 		sc->fw_name = mxge_fw_rss_aligned;
4315 	else
4316 		sc->fw_name = mxge_fw_rss_unaligned;
4317 	status = mxge_load_firmware(sc, 0);
4318 	if (status != 0) {
4319 		device_printf(sc->dev, "Falling back to a single slice\n");
4320 		return;
4321 	}
4322 
4323 	/* try to send a reset command to the card to see if it
4324 	   is alive */
4325 	memset(&cmd, 0, sizeof (cmd));
4326 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4327 	if (status != 0) {
4328 		device_printf(sc->dev, "failed reset\n");
4329 		goto abort_with_fw;
4330 	}
4331 
4332 	/* get rx ring size */
4333 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4334 	if (status != 0) {
4335 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4336 		goto abort_with_fw;
4337 	}
4338 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4339 
4340 	/* tell it the size of the interrupt queues */
4341 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4342 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4343 	if (status != 0) {
4344 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4345 		goto abort_with_fw;
4346 	}
4347 
4348 	/* ask for the maximum number of slices the firmware supports */
4349 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4350 	if (status != 0) {
4351 		device_printf(sc->dev,
4352 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4353 		goto abort_with_fw;
4354 	}
4355 	sc->num_slices = cmd.data0;
4356 	if (sc->num_slices > msix_cnt)
4357 		sc->num_slices = msix_cnt;
4358 
4359 	if (mxge_max_slices == -1) {
4360 		/* cap to number of CPUs in system */
4361 		if (sc->num_slices > mp_ncpus)
4362 			sc->num_slices = mp_ncpus;
4363 	} else {
4364 		if (sc->num_slices > mxge_max_slices)
4365 			sc->num_slices = mxge_max_slices;
4366 	}
4367 	/* make sure it is a power of two */
4368 	while (sc->num_slices & (sc->num_slices - 1))
4369 		sc->num_slices--;
4370 
4371 	if (mxge_verbose)
4372 		device_printf(sc->dev, "using %d slices\n",
4373 			      sc->num_slices);
4374 
4375 	return;
4376 
4377 abort_with_fw:
4378 	sc->fw_name = old_fw;
4379 	(void) mxge_load_firmware(sc, 0);
4380 }
4381 
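/*
 * Allocate one MSI-X vector per slice: map the MSI-X table BAR,
 * allocate the vectors, then attach one interrupt handler per slice.
 * Every failure path unwinds whatever was set up before it.
 */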
4382 static int
4383 mxge_add_msix_irqs(mxge_softc_t *sc)
4384 {
4385 	size_t bytes;
4386 	int count, err, i, rid;
4387 
4388 	rid = PCIR_BAR(2);
4389 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4390 						    &rid, RF_ACTIVE);
4391 
4392 	if (sc->msix_table_res == NULL) {
4393 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4394 		return ENXIO;
4395 	}
4396 
4397 	count = sc->num_slices;
4398 	err = pci_alloc_msix(sc->dev, &count);
4399 	if (err != 0) {
4400 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4401 			      "err = %d\n", sc->num_slices, err);
4402 		goto abort_with_msix_table;
4403 	}
4404 	if (count < sc->num_slices) {
4405 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4406 			      sc->num_slices, count);
4407 		device_printf(sc->dev,
4408 			      "Try setting hw.mxge.max_slices to %d\n",
4409 			      count);
4410 		err = ENOSPC;
4411 		goto abort_with_msix;
4412 	}
4413 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4414 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4415 	if (sc->msix_irq_res == NULL) {
4416 		err = ENOMEM;
4417 		goto abort_with_msix;
4418 	}
4419 
4420 	for (i = 0; i < sc->num_slices; i++) {
4421 		rid = i + 1;
4422 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4423 							  SYS_RES_IRQ,
4424 							  &rid, RF_ACTIVE);
4425 		if (sc->msix_irq_res[i] == NULL) {
4426 			device_printf(sc->dev, "couldn't allocate IRQ res"
4427 				      " for message %d\n", i);
4428 			err = ENXIO;
4429 			goto abort_with_res;
4430 		}
4431 	}
4432 
4433 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4434 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	/* M_NOWAIT allocation can fail; bail out before the setup loop */
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4435 
4436 	for (i = 0; i < sc->num_slices; i++) {
4437 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4438 				     INTR_TYPE_NET | INTR_MPSAFE,
4439 #if __FreeBSD_version > 700030
4440 				     NULL,
4441 #endif
4442 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4443 		if (err != 0) {
4444 			device_printf(sc->dev, "couldn't setup intr for "
4445 				      "message %d\n", i);
4446 			goto abort_with_intr;
4447 		}
4448 	}
4449 
4450 	if (mxge_verbose) {
4451 		device_printf(sc->dev, "using %d msix IRQs:",
4452 			      sc->num_slices);
4453 		for (i = 0; i < sc->num_slices; i++)
4454 			printf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4455 		printf("\n");
4456 	}
4457 	return (0);
4458 
4459 abort_with_intr:
4460 	for (i = 0; i < sc->num_slices; i++) {
4461 		if (sc->msix_ih[i] != NULL) {
4462 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4463 					  sc->msix_ih[i]);
4464 			sc->msix_ih[i] = NULL;
4465 		}
4466 	}
4467 	free(sc->msix_ih, M_DEVBUF);
4468 
4469 
4470 abort_with_res:
4471 	for (i = 0; i < sc->num_slices; i++) {
4472 		rid = i + 1;
4473 		if (sc->msix_irq_res[i] != NULL)
4474 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4475 					     sc->msix_irq_res[i]);
4476 		sc->msix_irq_res[i] = NULL;
4477 	}
4478 	free(sc->msix_irq_res, M_DEVBUF);
4479 
4480 
4481 abort_with_msix:
4482 	pci_release_msi(sc->dev);
4483 
4484 abort_with_msix_table:
4485 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4486 			     sc->msix_table_res);
4487 
4488 	return err;
4489 }
4490 
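/*
 * Single-interrupt fallback: prefer one MSI message, reverting to a
 * shared INTx line when MSI is unavailable.
 */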
4491 static int
4492 mxge_add_single_irq(mxge_softc_t *sc)
4493 {
4494 	int count, err, rid;
4495 
4496 	count = pci_msi_count(sc->dev);
4497 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4498 		rid = 1;
4499 	} else {
4500 		rid = 0;
4501 		sc->legacy_irq = 1;
4502 	}
4503 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4504 					 1, RF_SHAREABLE | RF_ACTIVE);
4505 	if (sc->irq_res == NULL) {
4506 		device_printf(sc->dev, "could not alloc interrupt\n");
4507 		return ENXIO;
4508 	}
4509 	if (mxge_verbose)
4510 		device_printf(sc->dev, "using %s irq %ld\n",
4511 			      sc->legacy_irq ? "INTx" : "MSI",
4512 			      rman_get_start(sc->irq_res));
4513 	err = bus_setup_intr(sc->dev, sc->irq_res,
4514 			     INTR_TYPE_NET | INTR_MPSAFE,
4515 #if __FreeBSD_version > 700030
4516 			     NULL,
4517 #endif
4518 			     mxge_intr, &sc->ss[0], &sc->ih);
4519 	if (err != 0) {
4520 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4521 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4522 		if (!sc->legacy_irq)
4523 			pci_release_msi(sc->dev);
4524 	}
4525 	return err;
4526 }
4527 
4528 static void
4529 mxge_rem_msix_irqs(mxge_softc_t *sc)
4530 {
4531 	int i, rid;
4532 
4533 	for (i = 0; i < sc->num_slices; i++) {
4534 		if (sc->msix_ih[i] != NULL) {
4535 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4536 					  sc->msix_ih[i]);
4537 			sc->msix_ih[i] = NULL;
4538 		}
4539 	}
4540 	free(sc->msix_ih, M_DEVBUF);
4541 
4542 	for (i = 0; i < sc->num_slices; i++) {
4543 		rid = i + 1;
4544 		if (sc->msix_irq_res[i] != NULL)
4545 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4546 					     sc->msix_irq_res[i]);
4547 		sc->msix_irq_res[i] = NULL;
4548 	}
4549 	free(sc->msix_irq_res, M_DEVBUF);
4550 
4551 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4552 			     sc->msix_table_res);
4553 
4554 	pci_release_msi(sc->dev);
4555 	return;
4556 }
4557 
4558 static void
4559 mxge_rem_single_irq(mxge_softc_t *sc)
4560 {
4561 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4562 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4563 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4564 	if (!sc->legacy_irq)
4565 		pci_release_msi(sc->dev);
4566 }
4567 
4568 static void
4569 mxge_rem_irq(mxge_softc_t *sc)
4570 {
4571 	if (sc->num_slices > 1)
4572 		mxge_rem_msix_irqs(sc);
4573 	else
4574 		mxge_rem_single_irq(sc);
4575 }
4576 
4577 static int
4578 mxge_add_irq(mxge_softc_t *sc)
4579 {
4580 	int err;
4581 
4582 	if (sc->num_slices > 1)
4583 		err = mxge_add_msix_irqs(sc);
4584 	else
4585 		err = mxge_add_single_irq(sc);
4586 
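	/*
	 * The block below is intentionally disabled ("0 &&"),
	 * apparently kept as a debugging aid for tearing down and
	 * re-adding the MSI-X vectors.
	 */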
4587 	if (0 && err == 0 && sc->num_slices > 1) {
4588 		mxge_rem_msix_irqs(sc);
4589 		err = mxge_add_msix_irqs(sc);
4590 	}
4591 	return err;
4592 }
4593 
4594 
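/*
 * Device attach: create the watchdog taskqueue and parent DMA tag,
 * map the board's SRAM, copy out the EEPROM strings (MAC address
 * etc.), load and reset the firmware, size and allocate the slices,
 * rings and interrupts, then attach the ifnet and start the tick
 * callout.
 */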
4595 static int
4596 mxge_attach(device_t dev)
4597 {
4598 	mxge_softc_t *sc = device_get_softc(dev);
4599 	struct ifnet *ifp;
4600 	int err, rid;
4601 
4602 	sc->dev = dev;
4603 	mxge_fetch_tunables(sc);
4604 
4605 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4606 	sc->tq = taskqueue_create_fast("mxge_taskq", M_WAITOK,
4607 				       taskqueue_thread_enqueue,
4608 				       &sc->tq);
4609 	if (sc->tq == NULL) {
4610 		err = ENOMEM;
4611 		goto abort_with_nothing;
4612 	}
4613 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4614 				device_get_nameunit(sc->dev));
4615 
4616 	err = bus_dma_tag_create(NULL,			/* parent */
4617 				 1,			/* alignment */
4618 				 0,			/* boundary */
4619 				 BUS_SPACE_MAXADDR,	/* low */
4620 				 BUS_SPACE_MAXADDR,	/* high */
4621 				 NULL, NULL,		/* filter */
4622 				 65536 + 256,		/* maxsize */
4623 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4624 				 65536,			/* maxsegsize */
4625 				 0,			/* flags */
4626 				 NULL, NULL,		/* lock */
4627 				 &sc->parent_dmat);	/* tag */
4628 
4629 	if (err != 0) {
4630 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4631 			      err);
4632 		goto abort_with_tq;
4633 	}
4634 
4635 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4636 	if (ifp == NULL) {
4637 		device_printf(dev, "can not if_alloc()\n");
4638 		err = ENOSPC;
4639 		goto abort_with_parent_dmat;
4640 	}
4641 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4642 
4643 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4644 		 device_get_nameunit(dev));
4645 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4646 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4647 		 "%s:drv", device_get_nameunit(dev));
4648 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4649 		 MTX_NETWORK_LOCK, MTX_DEF);
4650 
4651 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4652 
4653 	mxge_setup_cfg_space(sc);
4654 
4655 	/* Map the board into the kernel */
4656 	rid = PCIR_BARS;
4657 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4658 					 ~0, 1, RF_ACTIVE);
4659 	if (sc->mem_res == NULL) {
4660 		device_printf(dev, "could not map memory\n");
4661 		err = ENXIO;
4662 		goto abort_with_lock;
4663 	}
4664 	sc->sram = rman_get_virtual(sc->mem_res);
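	/*
	 * Usable SRAM: 2MB minus two 48KB regions and one 32KB region
	 * (apparently reserved for firmware use) minus a 256-byte tail.
	 */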
4665 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4666 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4667 		device_printf(dev, "impossible memory region size %ld\n",
4668 			      rman_get_size(sc->mem_res));
4669 		err = ENXIO;
4670 		goto abort_with_mem_res;
4671 	}
4672 
4673 	/* make a NULL-terminated copy of the EEPROM strings section
4674 	   of LANai SRAM */
4675 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4676 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4677 				rman_get_bushandle(sc->mem_res),
4678 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4679 				sc->eeprom_strings,
4680 				MXGE_EEPROM_STRINGS_SIZE - 2);
4681 	err = mxge_parse_strings(sc);
4682 	if (err != 0)
4683 		goto abort_with_mem_res;
4684 
4685 	/* Enable write combining for efficient use of PCIe bus */
4686 	mxge_enable_wc(sc);
4687 
4688 	/* Allocate the out of band dma memory */
4689 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4690 			     sizeof (mxge_cmd_t), 64);
4691 	if (err != 0)
4692 		goto abort_with_mem_res;
4693 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4694 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4695 	if (err != 0)
4696 		goto abort_with_cmd_dma;
4697 
4698 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4699 	if (err != 0)
4700 		goto abort_with_zeropad_dma;
4701 
4702 	/* select & load the firmware */
4703 	err = mxge_select_firmware(sc);
4704 	if (err != 0)
4705 		goto abort_with_dmabench;
4706 	sc->intr_coal_delay = mxge_intr_coal_delay;
4707 
4708 	mxge_slice_probe(sc);
4709 	err = mxge_alloc_slices(sc);
4710 	if (err != 0)
4711 		goto abort_with_dmabench;
4712 
4713 	err = mxge_reset(sc, 0);
4714 	if (err != 0)
4715 		goto abort_with_slices;
4716 
4717 	err = mxge_alloc_rings(sc);
4718 	if (err != 0) {
4719 		device_printf(sc->dev, "failed to allocate rings\n");
4720 		goto abort_with_dmabench;
4721 	}
4722 
4723 	err = mxge_add_irq(sc);
4724 	if (err != 0) {
4725 		device_printf(sc->dev, "failed to add irq\n");
4726 		goto abort_with_rings;
4727 	}
4728 
4729 	ifp->if_baudrate = IF_Gbps(10UL);
4730 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4731 		IFCAP_VLAN_MTU;
4732 #ifdef INET
4733 	ifp->if_capabilities |= IFCAP_LRO;
4734 #endif
4735 
4736 #ifdef MXGE_NEW_VLAN_API
4737 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4738 #endif
4739 
4740 	sc->max_mtu = mxge_max_mtu(sc);
4741 	if (sc->max_mtu >= 9000)
4742 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4743 	else
4744 		device_printf(dev, "MTU limited to %d.  Install "
4745 			      "latest firmware for 9000 byte jumbo support\n",
4746 			      sc->max_mtu - ETHER_HDR_LEN);
4747 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4748 	ifp->if_capenable = ifp->if_capabilities;
4749 	if (sc->lro_cnt == 0)
4750 		ifp->if_capenable &= ~IFCAP_LRO;
4751 	sc->csum_flag = 1;
4752 	ifp->if_init = mxge_init;
4753 	ifp->if_softc = sc;
4754 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4755 	ifp->if_ioctl = mxge_ioctl;
4756 	ifp->if_start = mxge_start;
4757 	/* Initialise the ifmedia structure */
4758 	ifmedia_init(&sc->media, 0, mxge_media_change,
4759 		     mxge_media_status);
4760 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4761 	mxge_media_probe(sc);
4762 	sc->dying = 0;
4763 	ether_ifattach(ifp, sc->mac_addr);
4764 	/* ether_ifattach sets mtu to ETHERMTU */
4765 	if (mxge_initial_mtu != ETHERMTU)
4766 		mxge_change_mtu(sc, mxge_initial_mtu);
4767 
4768 	mxge_add_sysctls(sc);
4769 #ifdef IFNET_BUF_RING
4770 	ifp->if_transmit = mxge_transmit;
4771 	ifp->if_qflush = mxge_qflush;
4772 #endif
4773 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4774 	return 0;
4775 
4776 abort_with_rings:
4777 	mxge_free_rings(sc);
4778 abort_with_slices:
4779 	mxge_free_slices(sc);
4780 abort_with_dmabench:
4781 	mxge_dma_free(&sc->dmabench_dma);
4782 abort_with_zeropad_dma:
4783 	mxge_dma_free(&sc->zeropad_dma);
4784 abort_with_cmd_dma:
4785 	mxge_dma_free(&sc->cmd_dma);
4786 abort_with_mem_res:
4787 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4788 abort_with_lock:
4789 	pci_disable_busmaster(dev);
4790 	mtx_destroy(&sc->cmd_mtx);
4791 	mtx_destroy(&sc->driver_mtx);
4792 	if_free(ifp);
4793 abort_with_parent_dmat:
4794 	bus_dma_tag_destroy(sc->parent_dmat);
4795 abort_with_tq:
4796 	if (sc->tq != NULL) {
4797 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4798 		taskqueue_free(sc->tq);
4799 		sc->tq = NULL;
4800 	}
4801 abort_with_nothing:
4802 	return err;
4803 }
4804 
4805 static int
4806 mxge_detach(device_t dev)
4807 {
4808 	mxge_softc_t *sc = device_get_softc(dev);
4809 
4810 	if (mxge_vlans_active(sc)) {
4811 		device_printf(sc->dev,
4812 			      "Detach vlans before removing module\n");
4813 		return EBUSY;
4814 	}
4815 	mtx_lock(&sc->driver_mtx);
4816 	sc->dying = 1;
4817 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4818 		mxge_close(sc, 0);
4819 	mtx_unlock(&sc->driver_mtx);
4820 	ether_ifdetach(sc->ifp);
4821 	if (sc->tq != NULL) {
4822 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4823 		taskqueue_free(sc->tq);
4824 		sc->tq = NULL;
4825 	}
4826 	callout_drain(&sc->co_hdl);
4827 	ifmedia_removeall(&sc->media);
4828 	mxge_dummy_rdma(sc, 0);
4829 	mxge_rem_sysctls(sc);
4830 	mxge_rem_irq(sc);
4831 	mxge_free_rings(sc);
4832 	mxge_free_slices(sc);
4833 	mxge_dma_free(&sc->dmabench_dma);
4834 	mxge_dma_free(&sc->zeropad_dma);
4835 	mxge_dma_free(&sc->cmd_dma);
4836 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4837 	pci_disable_busmaster(dev);
4838 	mtx_destroy(&sc->cmd_mtx);
4839 	mtx_destroy(&sc->driver_mtx);
4840 	if_free(sc->ifp);
4841 	bus_dma_tag_destroy(sc->parent_dmat);
4842 	return 0;
4843 }
4844 
4845 static int
4846 mxge_shutdown(device_t dev)
4847 {
4848 	return 0;
4849 }
4850 
4851 /*
4852   This file uses Myri10GE driver indentation.
4853 
4854   Local Variables:
4855   c-file-style:"linux"
4856   tab-width:8
4857   End:
4858 */
4859