/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

/* count xmits ourselves, rather than via drbr */
#define NO_SLOW_STATS
#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
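
/*
 * Firmware image naming (see the completion-alignment discussion and
 * mxge_select_firmware() below): the "eth_z8e" images assume aligned
 * PCIe completions, the "ethp_z8e" images work around unaligned
 * completions, and the "rss_" variants are used when more than one
 * slice is enabled.
 */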

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}
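
/*
 * Since the tags created by mxge_dma_alloc() below allow only a
 * single segment (nsegments == 1), this callback sees at most one
 * segment and simply records its bus address.
 */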

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO), &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
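
/*
 * Typical usage (an illustrative sketch, not a full error path):
 * allocate a coherent buffer, hand its bus address to the NIC, and
 * tear it down with mxge_dma_free() when done:
 *
 *	if (mxge_dma_alloc(sc, &dma, 4096, 4096) == 0) {
 *		... pass MXGE_LOWPART_TO_U32(dma.bus_addr) and
 *		    MXGE_HIGHPART_TO_U32(dma.bus_addr) to the firmware ...
 *		mxge_dma_free(&dma);
 *	}
 */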

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
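
/*
 * For example (illustrative values only), the string block might
 * contain:
 *
 *	"SN=123456\0MAC=00:60:dd:43:21:10\0PC=10G-PCIE-8A-C\0\0"
 */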

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
328 
329 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
330 static void
331 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
332 {
333 	uint32_t val;
334 	unsigned long base, off;
335 	char *va, *cfgptr;
336 	device_t pdev, mcp55;
337 	uint16_t vendor_id, device_id, word;
338 	uintptr_t bus, slot, func, ivend, idev;
339 	uint32_t *ptr32;
340 
341 
342 	if (!mxge_nvidia_ecrc_enable)
343 		return;
344 
345 	pdev = device_get_parent(device_get_parent(sc->dev));
346 	if (pdev == NULL) {
347 		device_printf(sc->dev, "could not find parent?\n");
348 		return;
349 	}
350 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
351 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
352 
353 	if (vendor_id != 0x10de)
354 		return;
355 
356 	base = 0;
357 
358 	if (device_id == 0x005d) {
359 		/* ck804, base address is magic */
360 		base = 0xe0000000UL;
361 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
362 		/* mcp55, base address stored in chipset */
363 		mcp55 = pci_find_bsf(0, 0, 0);
364 		if (mcp55 &&
365 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
366 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
367 			word = pci_read_config(mcp55, 0x90, 2);
368 			base = ((unsigned long)word & 0x7ffeU) << 25;
369 		}
370 	}
371 	if (!base)
372 		return;
373 
374 	/* XXXX
375 	   Test below is commented because it is believed that doing
376 	   config read/write beyond 0xff will access the config space
377 	   for the next larger function.  Uncomment this and remove
378 	   the hacky pmap_mapdev() way of accessing config space when
379 	   FreeBSD grows support for extended pcie config space access
380 	*/
381 #if 0
382 	/* See if we can, by some miracle, access the extended
383 	   config space */
384 	val = pci_read_config(pdev, 0x178, 4);
385 	if (val != 0xffffffff) {
386 		val |= 0x40;
387 		pci_write_config(pdev, 0x178, val, 4);
388 		return;
389 	}
390 #endif
391 	/* Rather than using normal pci config space writes, we must
392 	 * map the Nvidia config space ourselves.  This is because on
393 	 * opteron/nvidia class machine the 0xe000000 mapping is
394 	 * handled by the nvidia chipset, that means the internal PCI
395 	 * device (the on-chip northbridge), or the amd-8131 bridge
396 	 * and things behind them are not visible by this method.
397 	 */
398 
399 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 		      PCI_IVAR_BUS, &bus);
401 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 		      PCI_IVAR_SLOT, &slot);
403 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 		      PCI_IVAR_FUNCTION, &func);
405 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
406 		      PCI_IVAR_VENDOR, &ivend);
407 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
408 		      PCI_IVAR_DEVICE, &idev);
409 
410 	off =  base
411 		+ 0x00100000UL * (unsigned long)bus
412 		+ 0x00001000UL * (unsigned long)(func
413 						 + 8 * slot);
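
	/*
	 * The computation above follows the standard PCIe "ECAM"
	 * layout: each bus gets a 1MB (0x00100000) region of the
	 * window, and each (slot, function) pair gets a 4KB
	 * (0x00001000) slice of its bus's region.
	 */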

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
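
	/*
	 * Worked example (illustrative numbers): with len = 4096 and
	 * a result of cmd.data0 = 0x012c0bb8, the firmware completed
	 * 0x012c = 300 transfers in 0xbb8 = 3000 ticks (1.5ms), so
	 * read_dma = (300 * 4096 * 2) / 3000 = 819 MB/s; the "* 2"
	 * converts 0.5us ticks to bytes per us (i.e. MB/s).
	 */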

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
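
/*
 * In short:
 *
 *	completions known aligned   -> eth_z8e,  tx_boundary = 4096
 *	completions maybe unaligned -> ethp_z8e, tx_boundary = 2048
 */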

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

union qualhack
{
	const char *ro_char;
	char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
	void *ptr;

	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
	return ptr;
}

static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}
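
/*
 * z_alloc()/z_free() match zlib's alloc_func/free_func signatures;
 * the opaque first argument is unused here.  They let the in-kernel
 * zlib allocate from M_TEMP while inflating the firmware image.
 */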

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}
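
/*
 * Typical usage (sketch): load inputs into cmd.data0..data2, issue
 * the command, and read any output back from cmd.data0, e.g.:
 *
 *	cmd.data0 = 0;
 *	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
 *	if (err == 0)
 *		rx_ring_size = cmd.data0;	(size in bytes)
 */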

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);
		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}
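
/*
 * For example (illustrative address), 00:60:dd:43:21:10 is packed as
 * cmd.data0 = 0x0060dd43 and cmd.data1 = 0x00002110.
 */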

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       " %d\n", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc, 0);
		err = mxge_open(sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}
1810 
1811 #if IFCAP_TSO4
1812 
1813 static void
1814 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1815 	       int busdma_seg_cnt, int ip_off)
1816 {
1817 	mxge_tx_ring_t *tx;
1818 	mcp_kreq_ether_send_t *req;
1819 	bus_dma_segment_t *seg;
1820 	struct ip *ip;
1821 	struct tcphdr *tcp;
1822 	uint32_t low, high_swapped;
1823 	int len, seglen, cum_len, cum_len_next;
1824 	int next_is_first, chop, cnt, rdma_count, small;
1825 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1826 	uint8_t flags, flags_next;
1827 	static int once;
1828 
1829 	mss = m->m_pkthdr.tso_segsz;
1830 
1831 	/* negative cum_len signifies to the
1832 	 * send loop that we are still in the
1833 	 * header portion of the TSO packet.
1834 	 */
1835 
1836 	/* ensure we have the ethernet, IP and TCP
1837 	   header together in the first mbuf, copy
1838 	   it to a scratch buffer if not */
1839 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1840 		m_copydata(m, 0, ip_off + sizeof (*ip),
1841 			   ss->scratch);
1842 		ip = (struct ip *)(ss->scratch + ip_off);
1843 	} else {
1844 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1845 	}
1846 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1847 			    + sizeof (*tcp))) {
1848 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1849 			   + sizeof (*tcp),  ss->scratch);
1850 		ip = (struct ip *)(ss->scratch + ip_off);
1851 	}
1852 
1853 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1854 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1855 
1856 	/* TSO implies checksum offload on this hardware */
1857 	cksum_offset = ip_off + (ip->ip_hl << 2);
1858 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1859 
1860 
1861 	/* for TSO, pseudo_hdr_offset holds mss.
1862 	 * The firmware figures out where to put
1863 	 * the checksum by parsing the header. */
1864 	pseudo_hdr_offset = htobe16(mss);
1865 
1866 	tx = &ss->tx;
1867 	req = tx->req_list;
1868 	seg = tx->seg_list;
1869 	cnt = 0;
1870 	rdma_count = 0;
1871 	/* "rdma_count" is the number of RDMAs belonging to the
1872 	 * current packet BEFORE the current send request. For
1873 	 * non-TSO packets, this is equal to "count".
1874 	 * For TSO packets, rdma_count needs to be reset
1875 	 * to 0 after a segment cut.
1876 	 *
1877 	 * The rdma_count field of the send request is
1878 	 * the number of RDMAs of the packet starting at
1879 	 * that request. For TSO send requests with one or more cuts
1880 	 * in the middle, this is the number of RDMAs starting
1881 	 * after the last cut in the request. All previous
1882 	 * segments before the last cut implicitly have 1 RDMA.
1883 	 *
1884 	 * Since the number of RDMAs is not known beforehand,
1885 	 * it must be filled-in retroactively - after each
1886 	 * segmentation cut or at the end of the entire packet.
1887 	 */
1888 
1889 	while (busdma_seg_cnt) {
1890 		/* Break the busdma segment up into pieces */
1891 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1892 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1893 		len = seg->ds_len;
1894 
1895 		while (len) {
1896 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1897 			seglen = len;
1898 			cum_len_next = cum_len + seglen;
1899 			(req-rdma_count)->rdma_count = rdma_count + 1;
1900 			if (__predict_true(cum_len >= 0)) {
1901 				/* payload */
1902 				chop = (cum_len_next > mss);
1903 				cum_len_next = cum_len_next % mss;
1904 				next_is_first = (cum_len_next == 0);
1905 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1906 				flags_next |= next_is_first *
1907 					MXGEFW_FLAGS_FIRST;
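				/*
				 * chop and next_is_first are 0 or 1, so
				 * -(chop | next_is_first) is 0 or -1:
				 * a cut here forces rdma_count to -1,
				 * restarting the RDMA count for the
				 * next send request; a mid-request chop
				 * then counts one extra RDMA.
				 */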
1908 				rdma_count |= -(chop | next_is_first);
1909 				rdma_count += chop & !next_is_first;
1910 			} else if (cum_len_next >= 0) {
1911 				/* header ends */
1912 				rdma_count = -1;
1913 				cum_len_next = 0;
1914 				seglen = -cum_len;
1915 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1916 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1917 					MXGEFW_FLAGS_FIRST |
1918 					(small * MXGEFW_FLAGS_SMALL);
1919 			}
1920 
1921 			req->addr_high = high_swapped;
1922 			req->addr_low = htobe32(low);
1923 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1924 			req->pad = 0;
1925 			req->rdma_count = 1;
1926 			req->length = htobe16(seglen);
1927 			req->cksum_offset = cksum_offset;
1928 			req->flags = flags | ((cum_len & 1) *
1929 					      MXGEFW_FLAGS_ALIGN_ODD);
1930 			low += seglen;
1931 			len -= seglen;
1932 			cum_len = cum_len_next;
1933 			flags = flags_next;
1934 			req++;
1935 			cnt++;
1936 			rdma_count++;
1937 			if (__predict_false(cksum_offset > seglen))
1938 				cksum_offset -= seglen;
1939 			else
1940 				cksum_offset = 0;
1941 			if (__predict_false(cnt > tx->max_desc))
1942 				goto drop;
1943 		}
1944 		busdma_seg_cnt--;
1945 		seg++;
1946 	}
1947 	(req-rdma_count)->rdma_count = rdma_count;
1948 
1949 	do {
1950 		req--;
1951 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1952 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1953 
1954 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1955 	mxge_submit_req(tx, tx->req_list, cnt);
1956 #ifdef IFNET_BUF_RING
1957 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1958 		/* tell the NIC to start polling this slice */
1959 		*tx->send_go = 1;
1960 		tx->queue_active = 1;
1961 		tx->activate++;
1962 		wmb();
1963 	}
1964 #endif
1965 	return;
1966 
1967 drop:
1968 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1969 	m_freem(m);
1970 	ss->oerrors++;
1971 	if (!once) {
1972 		printf("tx->max_desc exceeded via TSO!\n");
1973 		printf("mss = %d, seg_list offset = %ld, max_desc = %d!\n",
1974 		       mss, (long)seg - (long)tx->seg_list, tx->max_desc);
1975 		once = 1;
1976 	}
1977 	return;
1978 
1979 }
1980 
1981 #endif /* IFCAP_TSO4 */
1982 
1983 #ifdef MXGE_NEW_VLAN_API
1984 /*
1985  * We reproduce the software vlan tag insertion from
1986  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1987  * vlan tag insertion. We need to advertise this in order to have the
1988  * vlan interface respect our csum offload flags.
1989  */
1990 static struct mbuf *
1991 mxge_vlan_tag_insert(struct mbuf *m)
1992 {
1993 	struct ether_vlan_header *evl;
1994 
1995 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1996 	if (__predict_false(m == NULL))
1997 		return NULL;
1998 	if (m->m_len < sizeof(*evl)) {
1999 		m = m_pullup(m, sizeof(*evl));
2000 		if (__predict_false(m == NULL))
2001 			return NULL;
2002 	}
2003 	/*
2004 	 * Transform the Ethernet header into an Ethernet header
2005 	 * with 802.1Q encapsulation.
2006 	 */
2007 	evl = mtod(m, struct ether_vlan_header *);
2008 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2009 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2010 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2011 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2012 	m->m_flags &= ~M_VLANTAG;
2013 	return m;
2014 }
2015 #endif /* MXGE_NEW_VLAN_API */
2016 
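/*
 * Translate one outbound mbuf chain into NIC send descriptors:
 * insert a VLAN header if needed, DMA-map the chain (defragmenting
 * once on EFBIG), hand TSO frames to mxge_encap_tso(), fill in the
 * checksum-offload fields, and pad runt frames to the minimum size.
 */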
2017 static void
2018 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2019 {
2020 	mxge_softc_t *sc;
2021 	mcp_kreq_ether_send_t *req;
2022 	bus_dma_segment_t *seg;
2023 	struct mbuf *m_tmp;
2024 	struct ifnet *ifp;
2025 	mxge_tx_ring_t *tx;
2026 	struct ip *ip;
2027 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2028 	uint16_t pseudo_hdr_offset;
2029         uint8_t flags, cksum_offset;
2030 
2031 
2032 	sc = ss->sc;
2033 	ifp = sc->ifp;
2034 	tx = &ss->tx;
2035 
2036 	ip_off = sizeof (struct ether_header);
2037 #ifdef MXGE_NEW_VLAN_API
2038 	if (m->m_flags & M_VLANTAG) {
2039 		m = mxge_vlan_tag_insert(m);
2040 		if (__predict_false(m == NULL))
2041 			goto drop;
2042 		ip_off += ETHER_VLAN_ENCAP_LEN;
2043 	}
2044 #endif
2045 	/* (try to) map the frame for DMA */
2046 	idx = tx->req & tx->mask;
2047 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2048 				      m, tx->seg_list, &cnt,
2049 				      BUS_DMA_NOWAIT);
2050 	if (__predict_false(err == EFBIG)) {
2051 		/* Too many segments in the chain.  Try
2052 		   to defrag */
2053 		m_tmp = m_defrag(m, M_NOWAIT);
2054 		if (m_tmp == NULL) {
2055 			goto drop;
2056 		}
2057 		ss->tx.defrag++;
2058 		m = m_tmp;
2059 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2060 					      tx->info[idx].map,
2061 					      m, tx->seg_list, &cnt,
2062 					      BUS_DMA_NOWAIT);
2063 	}
2064 	if (__predict_false(err != 0)) {
2065 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2066 			      " packet len = %d\n", err, m->m_pkthdr.len);
2067 		goto drop;
2068 	}
2069 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2070 			BUS_DMASYNC_PREWRITE);
2071 	tx->info[idx].m = m;
2072 
2073 #if IFCAP_TSO4
2074 	/* TSO is different enough, we handle it in another routine */
2075 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2076 		mxge_encap_tso(ss, m, cnt, ip_off);
2077 		return;
2078 	}
2079 #endif
2080 
2081 	req = tx->req_list;
2082 	cksum_offset = 0;
2083 	pseudo_hdr_offset = 0;
2084 	flags = MXGEFW_FLAGS_NO_TSO;
2085 
2086 	/* checksum offloading? */
2087 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2088 		/* ensure ip header is in first mbuf, copy
2089 		   it to a scratch buffer if not */
2090 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2091 			m_copydata(m, 0, ip_off + sizeof (*ip),
2092 				   ss->scratch);
2093 			ip = (struct ip *)(ss->scratch + ip_off);
2094 		} else {
2095 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2096 		}
2097 		cksum_offset = ip_off + (ip->ip_hl << 2);
2098 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2099 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2100 		req->cksum_offset = cksum_offset;
2101 		flags |= MXGEFW_FLAGS_CKSUM;
2102 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2103 	} else {
2104 		odd_flag = 0;
2105 	}
2106 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2107 		flags |= MXGEFW_FLAGS_SMALL;
2108 
2109 	/* convert segments into a request list */
2110 	cum_len = 0;
2111 	seg = tx->seg_list;
2112 	req->flags = MXGEFW_FLAGS_FIRST;
2113 	for (i = 0; i < cnt; i++) {
2114 		req->addr_low =
2115 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2116 		req->addr_high =
2117 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2118 		req->length = htobe16(seg->ds_len);
2119 		req->cksum_offset = cksum_offset;
2120 		if (cksum_offset > seg->ds_len)
2121 			cksum_offset -= seg->ds_len;
2122 		else
2123 			cksum_offset = 0;
2124 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2125 		req->pad = 0; /* complete solid 16-byte block */
2126 		req->rdma_count = 1;
2127 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2128 		cum_len += seg->ds_len;
2129 		seg++;
2130 		req++;
2131 		req->flags = 0;
2132 	}
2133 	req--;
2134 	/* pad runts out to 60 bytes (ETHER_MIN_LEN less the 4-byte FCS)
	   with an extra descriptor pointing at the zeroed pad buffer */
2135 	if (cum_len < 60) {
2136 		req++;
2137 		req->addr_low =
2138 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2139 		req->addr_high =
2140 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2141 		req->length = htobe16(60 - cum_len);
2142 		req->cksum_offset = 0;
2143 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2144 		req->pad = 0; /* complete solid 16-byte block */
2145 		req->rdma_count = 1;
2146 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2147 		cnt++;
2148 	}
2149 
2150 	tx->req_list[0].rdma_count = cnt;
2151 #if 0
2152 	/* print what the firmware will see */
2153 	for (i = 0; i < cnt; i++) {
2154 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2155 		    "cso:%d, flags:0x%x, rdma:%d\n",
2156 		    i, (int)ntohl(tx->req_list[i].addr_high),
2157 		    (int)ntohl(tx->req_list[i].addr_low),
2158 		    (int)ntohs(tx->req_list[i].length),
2159 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2160 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2161 		    tx->req_list[i].rdma_count);
2162 	}
2163 	printf("--------------\n");
2164 #endif
2165 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2166 	mxge_submit_req(tx, tx->req_list, cnt);
2167 #ifdef IFNET_BUF_RING
2168 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2169 		/* tell the NIC to start polling this slice */
2170 		*tx->send_go = 1;
2171 		tx->queue_active = 1;
2172 		tx->activate++;
2173 		wmb();
2174 	}
2175 #endif
2176 	return;
2177 
2178 drop:
2179 	m_freem(m);
2180 	ss->oerrors++;
2181 	return;
2182 }
2183 
2184 #ifdef IFNET_BUF_RING
2185 static void
2186 mxge_qflush(struct ifnet *ifp)
2187 {
2188 	mxge_softc_t *sc = ifp->if_softc;
2189 	mxge_tx_ring_t *tx;
2190 	struct mbuf *m;
2191 	int slice;
2192 
2193 	for (slice = 0; slice < sc->num_slices; slice++) {
2194 		tx = &sc->ss[slice].tx;
2195 		mtx_lock(&tx->mtx);
2196 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2197 			m_freem(m);
2198 		mtx_unlock(&tx->mtx);
2199 	}
2200 	if_qflush(ifp);
2201 }
2202 
2203 static inline void
2204 mxge_start_locked(struct mxge_slice_state *ss)
2205 {
2206 	mxge_softc_t *sc;
2207 	struct mbuf *m;
2208 	struct ifnet *ifp;
2209 	mxge_tx_ring_t *tx;
2210 
2211 	sc = ss->sc;
2212 	ifp = sc->ifp;
2213 	tx = &ss->tx;
2214 
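	/* transmit while enough descriptors remain for a worst-case
	   (tx->max_desc entry) frame */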
2215 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2216 		m = drbr_dequeue(ifp, tx->br);
2217 		if (m == NULL) {
2218 			return;
2219 		}
2220 		/* let BPF see it */
2221 		BPF_MTAP(ifp, m);
2222 
2223 		/* give it to the nic */
2224 		mxge_encap(ss, m);
2225 	}
2226 	/* ran out of transmit slots */
2227 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2228 	    && (!drbr_empty(ifp, tx->br))) {
2229 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2230 		tx->stall++;
2231 	}
2232 }
2233 
2234 static int
2235 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2236 {
2237 	mxge_softc_t *sc;
2238 	struct ifnet *ifp;
2239 	mxge_tx_ring_t *tx;
2240 	int err;
2241 
2242 	sc = ss->sc;
2243 	ifp = sc->ifp;
2244 	tx = &ss->tx;
2245 
2246 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2247 	    IFF_DRV_RUNNING) {
2248 		err = drbr_enqueue(ifp, tx->br, m);
2249 		return (err);
2250 	}
2251 
2252 	if (drbr_empty(ifp, tx->br) &&
2253 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2254 		/* let BPF see it */
2255 		BPF_MTAP(ifp, m);
2256 		/* give it to the nic */
2257 		mxge_encap(ss, m);
2258 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2259 		return (err);
2260 	}
2261 	if (!drbr_empty(ifp, tx->br))
2262 		mxge_start_locked(ss);
2263 	return (0);
2264 }
2265 
2266 static int
2267 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2268 {
2269 	mxge_softc_t *sc = ifp->if_softc;
2270 	struct mxge_slice_state *ss;
2271 	mxge_tx_ring_t *tx;
2272 	int err = 0;
2273 	int slice;
2274 
2275 	slice = m->m_pkthdr.flowid;
2276 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2277 
2278 	ss = &sc->ss[slice];
2279 	tx = &ss->tx;
2280 
2281 	if (mtx_trylock(&tx->mtx)) {
2282 		err = mxge_transmit_locked(ss, m);
2283 		mtx_unlock(&tx->mtx);
2284 	} else {
2285 		err = drbr_enqueue(ifp, tx->br, m);
2286 	}
2287 
2288 	return (err);
2289 }
2290 
2291 #else
2292 
2293 static inline void
2294 mxge_start_locked(struct mxge_slice_state *ss)
2295 {
2296 	mxge_softc_t *sc;
2297 	struct mbuf *m;
2298 	struct ifnet *ifp;
2299 	mxge_tx_ring_t *tx;
2300 
2301 	sc = ss->sc;
2302 	ifp = sc->ifp;
2303 	tx = &ss->tx;
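	/* transmit while enough descriptors remain for a worst-case
	   (tx->max_desc entry) frame */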
2304 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2305 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2306 		if (m == NULL) {
2307 			return;
2308 		}
2309 		/* let BPF see it */
2310 		BPF_MTAP(ifp, m);
2311 
2312 		/* give it to the nic */
2313 		mxge_encap(ss, m);
2314 	}
2315 	/* ran out of transmit slots */
2316 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2317 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2318 		tx->stall++;
2319 	}
2320 }
2321 #endif
2322 static void
2323 mxge_start(struct ifnet *ifp)
2324 {
2325 	mxge_softc_t *sc = ifp->if_softc;
2326 	struct mxge_slice_state *ss;
2327 
2328 	/* only use the first slice for now */
2329 	ss = &sc->ss[0];
2330 	mtx_lock(&ss->tx.mtx);
2331 	mxge_start_locked(ss);
2332 	mtx_unlock(&ss->tx.mtx);
2333 }
2334 
2335 /*
2336  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2337  * at most 32 bytes at a time, so as to avoid involving the software
2338  * pio handler in the nic.   We re-write the first segment's low
2339  * DMA address to mark it valid only after we write the entire chunk
2340  * in a burst
2341  */
2342 static inline void
2343 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2344 		mcp_kreq_ether_recv_t *src)
2345 {
2346 	uint32_t low;
2347 
2348 	low = src->addr_low;
2349 	src->addr_low = 0xffffffff;
2350 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2351 	wmb();
2352 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2353 	wmb();
2354 	src->addr_low = low;
2355 	dst->addr_low = low;
2356 	wmb();
2357 }
2358 
2359 static int
2360 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2361 {
2362 	bus_dma_segment_t seg;
2363 	struct mbuf *m;
2364 	mxge_rx_ring_t *rx = &ss->rx_small;
2365 	int cnt, err;
2366 
2367 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2368 	if (m == NULL) {
2369 		rx->alloc_fail++;
2370 		err = ENOBUFS;
2371 		goto done;
2372 	}
2373 	m->m_len = MHLEN;
2374 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2375 				      &seg, &cnt, BUS_DMA_NOWAIT);
2376 	if (err != 0) {
2377 		m_free(m);
2378 		goto done;
2379 	}
2380 	rx->info[idx].m = m;
2381 	rx->shadow[idx].addr_low =
2382 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2383 	rx->shadow[idx].addr_high =
2384 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2385 
2386 done:
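	/* rx buffers are handed to the NIC eight at a time; on
	   allocation failure the shadow entry is left unchanged, so
	   the old buffer is simply re-posted */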
2387 	if ((idx & 7) == 7)
2388 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2389 	return err;
2390 }
2391 
2392 static int
2393 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2394 {
2395 	bus_dma_segment_t seg[3];
2396 	struct mbuf *m;
2397 	mxge_rx_ring_t *rx = &ss->rx_big;
2398 	int cnt, err, i;
2399 
2400 	if (rx->cl_size == MCLBYTES)
2401 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2402 	else
2403 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2404 	if (m == NULL) {
2405 		rx->alloc_fail++;
2406 		err = ENOBUFS;
2407 		goto done;
2408 	}
2409 	m->m_len = rx->mlen;
2410 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2411 				      seg, &cnt, BUS_DMA_NOWAIT);
2412 	if (err != 0) {
2413 		m_free(m);
2414 		goto done;
2415 	}
2416 	rx->info[idx].m = m;
2417 	rx->shadow[idx].addr_low =
2418 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2419 	rx->shadow[idx].addr_high =
2420 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2421 
2422 #if MXGE_VIRT_JUMBOS
2423 	for (i = 1; i < cnt; i++) {
2424 		rx->shadow[idx + i].addr_low =
2425 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2426 		rx->shadow[idx + i].addr_high =
2427 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2428 	}
2429 #endif
2430 
2431 done:
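	/* as in mxge_get_buf_small(), buffers are posted to the NIC in
	   groups of eight; advance idx once per buffer in this frame */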
2432 	for (i = 0; i < rx->nbufs; i++) {
2433 		if ((idx & 7) == 7) {
2434 			mxge_submit_8rx(&rx->lanai[idx - 7],
2435 					&rx->shadow[idx - 7]);
2436 		}
2437 		idx++;
2438 	}
2439 	return err;
2440 }
2441 
2442 /*
2443  *  Myri10GE hardware checksums are not valid if the sender
2444  *  padded the frame with non-zero padding.  This is because
2445  *  the firmware just does a simple 16-bit 1s complement
2446  *  checksum across the entire frame, excluding the first 14
2447 	 *  bytes.  It is best to simply check the checksum and
2448  *  tell the stack about it only if the checksum is good
2449  */
2450 
2451 static inline uint16_t
2452 mxge_rx_csum(struct mbuf *m, int csum)
2453 {
2454 	struct ether_header *eh;
2455 	struct ip *ip;
2456 	uint16_t c;
2457 
2458 	eh = mtod(m, struct ether_header *);
2459 
2460 	/* only deal with IPv4 TCP & UDP for now */
2461 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2462 		return 1;
2463 	ip = (struct ip *)(eh + 1);
2464 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2465 			    ip->ip_p != IPPROTO_UDP))
2466 		return 1;
2467 #ifdef INET
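	/*
	 * Fold the pseudo-header (addresses, protocol, TCP/UDP length)
	 * into the firmware's raw 1s complement sum; a frame with a
	 * correct checksum folds to 0xffff, so the inversion below
	 * returns 0 for a valid checksum.
	 */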
2468 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2469 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2470 			    (ip->ip_hl << 2) + ip->ip_p));
2471 #else
2472 	c = 1;
2473 #endif
2474 	c ^= 0xffff;
2475 	return (c);
2476 }
2477 
2478 static void
2479 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2480 {
2481 	struct ether_vlan_header *evl;
2482 	struct ether_header *eh;
2483 	uint32_t partial;
2484 
2485 	evl = mtod(m, struct ether_vlan_header *);
2486 	eh = mtod(m, struct ether_header *);
2487 
2488 	/*
2489 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2490 	 * after what the firmware thought was the end of the ethernet
2491 	 * header.
2492 	 */
2493 
2494 	/* put checksum into host byte order */
2495 	*csum = ntohs(*csum);
2496 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
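	/* subtract the 4 VLAN encapsulation bytes in 1s complement
	   arithmetic: add the complement, then fold the carries back
	   into the low 16 bits */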
2497 	(*csum) += ~partial;
2498 	(*csum) +=  ((*csum) < ~partial);
2499 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2500 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2501 
2502 	/* restore checksum to network byte order;
2503 	   later consumers expect this */
2504 	*csum = htons(*csum);
2505 
2506 	/* save the tag */
2507 #ifdef MXGE_NEW_VLAN_API
2508 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2509 #else
2510 	{
2511 		struct m_tag *mtag;
2512 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2513 				   M_NOWAIT);
2514 		if (mtag == NULL)
2515 			return;
2516 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2517 		m_tag_prepend(m, mtag);
2518 	}
2519 
2520 #endif
2521 	m->m_flags |= M_VLANTAG;
2522 
2523 	/*
2524 	 * Remove the 802.1q header by copying the Ethernet
2525 	 * addresses over it and adjusting the beginning of
2526 	 * the data in the mbuf.  The encapsulated Ethernet
2527 	 * type field is already in place.
2528 	 */
2529 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2530 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2531 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2532 }
2533 
2534 
2535 static inline void
2536 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2537 {
2538 	mxge_softc_t *sc;
2539 	struct ifnet *ifp;
2540 	struct mbuf *m;
2541 	struct ether_header *eh;
2542 	mxge_rx_ring_t *rx;
2543 	bus_dmamap_t old_map;
2544 	int idx;
2545 	uint16_t tcpudp_csum;
2546 
2547 	sc = ss->sc;
2548 	ifp = sc->ifp;
2549 	rx = &ss->rx_big;
2550 	idx = rx->cnt & rx->mask;
2551 	rx->cnt += rx->nbufs;
2552 	/* save a pointer to the received mbuf */
2553 	m = rx->info[idx].m;
2554 	/* try to replace the received mbuf */
2555 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2556 		/* drop the frame -- the old mbuf is re-cycled */
2557 		ifp->if_ierrors++;
2558 		return;
2559 	}
2560 
2561 	/* unmap the received buffer */
2562 	old_map = rx->info[idx].map;
2563 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2564 	bus_dmamap_unload(rx->dmat, old_map);
2565 
2566 	/* swap the bus_dmamap_t's */
2567 	rx->info[idx].map = rx->extra_map;
2568 	rx->extra_map = old_map;
2569 
2570 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2571 	 * aligned */
2572 	m->m_data += MXGEFW_PAD;
2573 
2574 	m->m_pkthdr.rcvif = ifp;
2575 	m->m_len = m->m_pkthdr.len = len;
2576 	ss->ipackets++;
2577 	eh = mtod(m, struct ether_header *);
2578 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2579 		mxge_vlan_tag_remove(m, &csum);
2580 	}
2581 	/* if the checksum is valid, mark it in the mbuf header */
2582 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2583 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2584 			return;
2585 		/* otherwise, it was a UDP frame, or a TCP frame which
2586 		   we could not do LRO on.  Tell the stack that the
2587 		   checksum is good */
2588 		m->m_pkthdr.csum_data = 0xffff;
2589 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2590 	}
2591 	/* flowid only valid if RSS hashing is enabled */
2592 	if (sc->num_slices > 1) {
2593 		m->m_pkthdr.flowid = (ss - sc->ss);
2594 		m->m_flags |= M_FLOWID;
2595 	}
2596 	/* pass the frame up the stack */
2597 	(*ifp->if_input)(ifp, m);
2598 }
2599 
2600 static inline void
2601 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2602 {
2603 	mxge_softc_t *sc;
2604 	struct ifnet *ifp;
2605 	struct ether_header *eh;
2606 	struct mbuf *m;
2607 	mxge_rx_ring_t *rx;
2608 	bus_dmamap_t old_map;
2609 	int idx;
2610 	uint16_t tcpudp_csum;
2611 
2612 	sc = ss->sc;
2613 	ifp = sc->ifp;
2614 	rx = &ss->rx_small;
2615 	idx = rx->cnt & rx->mask;
2616 	rx->cnt++;
2617 	/* save a pointer to the received mbuf */
2618 	m = rx->info[idx].m;
2619 	/* try to replace the received mbuf */
2620 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621 		/* drop the frame -- the old mbuf is re-cycled */
2622 		ifp->if_ierrors++;
2623 		return;
2624 	}
2625 
2626 	/* unmap the received buffer */
2627 	old_map = rx->info[idx].map;
2628 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629 	bus_dmamap_unload(rx->dmat, old_map);
2630 
2631 	/* swap the bus_dmamap_t's */
2632 	rx->info[idx].map = rx->extra_map;
2633 	rx->extra_map = old_map;
2634 
2635 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2636 	 * aligned */
2637 	m->m_data += MXGEFW_PAD;
2638 
2639 	m->m_pkthdr.rcvif = ifp;
2640 	m->m_len = m->m_pkthdr.len = len;
2641 	ss->ipackets++;
2642 	eh = mtod(m, struct ether_header *);
2643 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644 		mxge_vlan_tag_remove(m, &csum);
2645 	}
2646 	/* if the checksum is valid, mark it in the mbuf header */
2647 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2649 			return;
2650 		/* otherwise, it was a UDP frame, or a TCP frame which
2651 		   we could not do LRO on.  Tell the stack that the
2652 		   checksum is good */
2653 		m->m_pkthdr.csum_data = 0xffff;
2654 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2655 	}
2656 	/* flowid only valid if RSS hashing is enabled */
2657 	if (sc->num_slices > 1) {
2658 		m->m_pkthdr.flowid = (ss - sc->ss);
2659 		m->m_flags |= M_FLOWID;
2660 	}
2661 	/* pass the frame up the stack */
2662 	(*ifp->if_input)(ifp, m);
2663 }
2664 
2665 static inline void
2666 mxge_clean_rx_done(struct mxge_slice_state *ss)
2667 {
2668 	mxge_rx_done_t *rx_done = &ss->rx_done;
2669 	int limit = 0;
2670 	uint16_t length;
2671 	uint16_t checksum;
2672 
2673 
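	/* the firmware writes the length last, so a non-zero length
	   marks a completed receive; zero it to consume the slot */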
2674 	while (rx_done->entry[rx_done->idx].length != 0) {
2675 		length = ntohs(rx_done->entry[rx_done->idx].length);
2676 		rx_done->entry[rx_done->idx].length = 0;
2677 		checksum = rx_done->entry[rx_done->idx].checksum;
2678 		if (length <= (MHLEN - MXGEFW_PAD))
2679 			mxge_rx_done_small(ss, length, checksum);
2680 		else
2681 			mxge_rx_done_big(ss, length, checksum);
2682 		rx_done->cnt++;
2683 		rx_done->idx = rx_done->cnt & rx_done->mask;
2684 
2685 		/* limit potential for livelock */
2686 		if (__predict_false(++limit > rx_done->mask / 2))
2687 			break;
2688 	}
2689 #ifdef INET
2690 	while (!SLIST_EMPTY(&ss->lro_active)) {
2691 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2692 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2693 		mxge_lro_flush(ss, lro);
2694 	}
2695 #endif
2696 }
2697 
2698 
2699 static inline void
2700 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2701 {
2702 	struct ifnet *ifp;
2703 	mxge_tx_ring_t *tx;
2704 	struct mbuf *m;
2705 	bus_dmamap_t map;
2706 	int idx;
2707 	int *flags;
2708 
2709 	tx = &ss->tx;
2710 	ifp = ss->sc->ifp;
2711 	while (tx->pkt_done != mcp_idx) {
2712 		idx = tx->done & tx->mask;
2713 		tx->done++;
2714 		m = tx->info[idx].m;
2715 		/* mbuf and DMA map only attached to the first
2716 		   segment per-mbuf */
2717 		if (m != NULL) {
2718 			ss->obytes += m->m_pkthdr.len;
2719 			if (m->m_flags & M_MCAST)
2720 				ss->omcasts++;
2721 			ss->opackets++;
2722 			tx->info[idx].m = NULL;
2723 			map = tx->info[idx].map;
2724 			bus_dmamap_unload(tx->dmat, map);
2725 			m_freem(m);
2726 		}
2727 		if (tx->info[idx].flag) {
2728 			tx->info[idx].flag = 0;
2729 			tx->pkt_done++;
2730 		}
2731 	}
2732 
2733 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2734 	   it's OK to send packets */
2735 #ifdef IFNET_BUF_RING
2736 	flags = &ss->if_drv_flags;
2737 #else
2738 	flags = &ifp->if_drv_flags;
2739 #endif
2740 	mtx_lock(&ss->tx.mtx);
2741 	if ((*flags) & IFF_DRV_OACTIVE &&
2742 	    tx->req - tx->done < (tx->mask + 1)/4) {
2743 		*(flags) &= ~IFF_DRV_OACTIVE;
2744 		ss->tx.wake++;
2745 		mxge_start_locked(ss);
2746 	}
2747 #ifdef IFNET_BUF_RING
2748 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2749 		/* let the NIC stop polling this queue, since there
2750 		 * are no more transmits pending */
2752 		*tx->send_stop = 1;
2753 		tx->queue_active = 0;
2754 		tx->deactivate++;
2755 		wmb();
2757 	}
2758 #endif
2759 	mtx_unlock(&ss->tx.mtx);
2760 
2761 }
2762 
2763 static struct mxge_media_type mxge_xfp_media_types[] =
2764 {
2765 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2766 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2767 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2768 	{0,		(1 << 5),	"10GBASE-ER"},
2769 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2770 	{0,		(1 << 3),	"10GBASE-SW"},
2771 	{0,		(1 << 2),	"10GBASE-LW"},
2772 	{0,		(1 << 1),	"10GBASE-EW"},
2773 	{0,		(1 << 0),	"Reserved"}
2774 };
2775 static struct mxge_media_type mxge_sfp_media_types[] =
2776 {
2777 	{0,		(1 << 7),	"Reserved"},
2778 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2779 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2780 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2781 };
2782 
2783 static void
2784 mxge_set_media(mxge_softc_t *sc, int type)
2785 {
2786 	sc->media_flags |= type;
2787 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2788 	ifmedia_set(&sc->media, sc->media_flags);
2789 }
2790 
2791 
2792 /*
2793  * Determine the media type for a NIC.  Some XFPs will identify
2794  * themselves only when their link is up, so this is initiated via a
2795  * link up interrupt.  However, this can potentially take up to
2796  * several milliseconds, so it is run via the watchdog routine, rather
2797  * than in the interrupt handler itself.   This need only be done
2798  * once, not each time the link is up.
2799  */
2800 static void
2801 mxge_media_probe(mxge_softc_t *sc)
2802 {
2803 	mxge_cmd_t cmd;
2804 	char *cage_type;
2805 	char *ptr;
2806 	struct mxge_media_type *mxge_media_types = NULL;
2807 	int i, err, ms, mxge_media_type_entries;
2808 	uint32_t byte;
2809 
2810 	sc->need_media_probe = 0;
2811 
2812 	/* if we've already set a media type, we're done */
2813 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2814 		return;
2815 
2816 	/*
2817 	 * parse the product code to determine the interface type
2818 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2819 	 * after the 3rd dash in the driver's cached copy of the
2820 	 * EEPROM's product code string (e.g. a code such as
	 * "10G-PCIE-8A-R" would name an XFP-cage board).
2821 	 */
2822 	ptr = sc->product_code_string;
2823 	if (ptr == NULL) {
2824 	if (ptr == NULL) {
2825 		device_printf(sc->dev, "Missing product code\n");
		return;
2826 	}
2827 	for (i = 0; i < 3; i++, ptr++) {
2828 		ptr = index(ptr, '-');
2829 		if (ptr == NULL) {
2830 			device_printf(sc->dev,
2831 				      "only %d dashes in product code?!?\n", i);
2832 			return;
2833 		}
2834 	}
2835 	if (*ptr == 'C') {
2836 		/* -C is CX4 */
2837 		mxge_set_media(sc, IFM_10G_CX4);
2838 		return;
2839 	}
2840 	else if (*ptr == 'Q') {
2841 		/* -Q is Quad Ribbon Fiber */
2842 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2843 		/* FreeBSD has no media type for Quad ribbon fiber */
2844 		return;
2845 	}
2846 
2847 	if (*ptr == 'R') {
2848 		/* -R is XFP */
2849 		mxge_media_types = mxge_xfp_media_types;
2850 		mxge_media_type_entries =
2851 			sizeof (mxge_xfp_media_types) /
2852 			sizeof (mxge_xfp_media_types[0]);
2853 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2854 		cage_type = "XFP";
2855 	}
2856 
2857 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2858 		/* -S or -2S is SFP+ */
2859 		mxge_media_types = mxge_sfp_media_types;
2860 		mxge_media_type_entries =
2861 			sizeof (mxge_sfp_media_types) /
2862 			sizeof (mxge_sfp_media_types[0]);
2863 		cage_type = "SFP+";
2864 		byte = 3;
2865 	}
2866 
2867 	if (mxge_media_types == NULL) {
2868 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2869 		return;
2870 	}
2871 
2872 	/*
2873 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2874 	 * now we try to determine what is in the cage by using the
2875 	 * firmware's I2C commands to read the module's 10GbE compliance
2876 	 * register.  We read just one byte, which may take over
2877 	 * a millisecond.
2878 	 */
2879 
2880 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2881 	cmd.data1 = byte;
2882 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2883 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2884 		device_printf(sc->dev, "failed to read XFP\n");
2885 	}
2886 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2887 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2888 	}
2889 	if (err != MXGEFW_CMD_OK) {
2890 		return;
2891 	}
2892 
2893 	/* now we wait for the data to be cached */
2894 	cmd.data0 = byte;
2895 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2896 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2897 		DELAY(1000);
2898 		cmd.data0 = byte;
2899 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2900 	}
2901 	if (err != MXGEFW_CMD_OK) {
2902 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2903 			      cage_type, err, ms);
2904 		return;
2905 	}
2906 
2907 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2908 		if (mxge_verbose)
2909 			device_printf(sc->dev, "%s:%s\n", cage_type,
2910 				      mxge_media_types[0].name);
2911 		mxge_set_media(sc, IFM_10G_CX4);
2912 		return;
2913 	}
2914 	for (i = 1; i < mxge_media_type_entries; i++) {
2915 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2916 			if (mxge_verbose)
2917 				device_printf(sc->dev, "%s:%s\n",
2918 					      cage_type,
2919 					      mxge_media_types[i].name);
2920 
2921 			mxge_set_media(sc, mxge_media_types[i].flag);
2922 			return;
2923 		}
2924 	}
2925 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2926 		      cmd.data0);
2927 
2928 	return;
2929 }
2930 
2931 static void
2932 mxge_intr(void *arg)
2933 {
2934 	struct mxge_slice_state *ss = arg;
2935 	mxge_softc_t *sc = ss->sc;
2936 	mcp_irq_data_t *stats = ss->fw_stats;
2937 	mxge_tx_ring_t *tx = &ss->tx;
2938 	mxge_rx_done_t *rx_done = &ss->rx_done;
2939 	uint32_t send_done_count;
2940 	uint8_t valid;
2941 
2942 
2943 #ifndef IFNET_BUF_RING
2944 	/* an interrupt on a non-zero slice is implicitly valid
2945 	   since MSI-X irqs are not shared */
2946 	if (ss != sc->ss) {
2947 		mxge_clean_rx_done(ss);
2948 		*ss->irq_claim = be32toh(3);
2949 		return;
2950 	}
2951 #endif
2952 
2953 	/* make sure the DMA has finished */
2954 	if (!stats->valid) {
2955 		return;
2956 	}
2957 	valid = stats->valid;
2958 
2959 	if (sc->legacy_irq) {
2960 		/* lower legacy IRQ  */
2961 		*sc->irq_deassert = 0;
2962 		if (!mxge_deassert_wait)
2963 			/* don't wait for conf. that irq is low */
2964 			stats->valid = 0;
2965 	} else {
2966 		stats->valid = 0;
2967 	}
2968 
2969 	/* loop while waiting for legacy irq deassertion */
2970 	do {
2971 		/* check for transmit completes and receives */
2972 		send_done_count = be32toh(stats->send_done_count);
2973 		while ((send_done_count != tx->pkt_done) ||
2974 		       (rx_done->entry[rx_done->idx].length != 0)) {
2975 			if (send_done_count != tx->pkt_done)
2976 				mxge_tx_done(ss, (int)send_done_count);
2977 			mxge_clean_rx_done(ss);
2978 			send_done_count = be32toh(stats->send_done_count);
2979 		}
2980 		if (sc->legacy_irq && mxge_deassert_wait)
2981 			wmb();
2982 	} while (*((volatile uint8_t *) &stats->valid));
2983 
2984 	/* fw link & error stats meaningful only on the first slice */
2985 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2986 		if (sc->link_state != stats->link_up) {
2987 			sc->link_state = stats->link_up;
2988 			if (sc->link_state) {
2989 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2990 				if (mxge_verbose)
2991 					device_printf(sc->dev, "link up\n");
2992 			} else {
2993 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2994 				if (mxge_verbose)
2995 					device_printf(sc->dev, "link down\n");
2996 			}
2997 			sc->need_media_probe = 1;
2998 		}
2999 		if (sc->rdma_tags_available !=
3000 		    be32toh(stats->rdma_tags_available)) {
3001 			sc->rdma_tags_available =
3002 				be32toh(stats->rdma_tags_available);
3003 			device_printf(sc->dev, "RDMA timed out! %d tags "
3004 				      "left\n", sc->rdma_tags_available);
3005 		}
3006 
3007 		if (stats->link_down) {
3008 			sc->down_cnt += stats->link_down;
3009 			sc->link_state = 0;
3010 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3011 		}
3012 	}
3013 
3014 	/* check to see if we have rx token to pass back */
3015 	if (valid & 0x1)
3016 	    *ss->irq_claim = be32toh(3);
3017 	*(ss->irq_claim + 1) = be32toh(3);
3018 }
3019 
3020 static void
3021 mxge_init(void *arg)
3022 {
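	/* intentionally empty: mxge_open() does the real work when
	   the interface is brought up */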
3023 }
3024 
3025 
3026 
3027 static void
3028 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3029 {
3030 	struct lro_entry *lro_entry;
3031 	int i;
3032 
3033 	while (!SLIST_EMPTY(&ss->lro_free)) {
3034 		lro_entry = SLIST_FIRST(&ss->lro_free);
3035 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3036 		free(lro_entry, M_DEVBUF);
3037 	}
3038 
3039 	for (i = 0; i <= ss->rx_big.mask; i++) {
3040 		if (ss->rx_big.info[i].m == NULL)
3041 			continue;
3042 		bus_dmamap_unload(ss->rx_big.dmat,
3043 				  ss->rx_big.info[i].map);
3044 		m_freem(ss->rx_big.info[i].m);
3045 		ss->rx_big.info[i].m = NULL;
3046 	}
3047 
3048 	for (i = 0; i <= ss->rx_small.mask; i++) {
3049 		if (ss->rx_small.info[i].m == NULL)
3050 			continue;
3051 		bus_dmamap_unload(ss->rx_small.dmat,
3052 				  ss->rx_small.info[i].map);
3053 		m_freem(ss->rx_small.info[i].m);
3054 		ss->rx_small.info[i].m = NULL;
3055 	}
3056 
3057 	/* transmit ring used only on the first slice */
3058 	if (ss->tx.info == NULL)
3059 		return;
3060 
3061 	for (i = 0; i <= ss->tx.mask; i++) {
3062 		ss->tx.info[i].flag = 0;
3063 		if (ss->tx.info[i].m == NULL)
3064 			continue;
3065 		bus_dmamap_unload(ss->tx.dmat,
3066 				  ss->tx.info[i].map);
3067 		m_freem(ss->tx.info[i].m);
3068 		ss->tx.info[i].m = NULL;
3069 	}
3070 }
3071 
3072 static void
3073 mxge_free_mbufs(mxge_softc_t *sc)
3074 {
3075 	int slice;
3076 
3077 	for (slice = 0; slice < sc->num_slices; slice++)
3078 		mxge_free_slice_mbufs(&sc->ss[slice]);
3079 }
3080 
3081 static void
3082 mxge_free_slice_rings(struct mxge_slice_state *ss)
3083 {
3084 	int i;
3085 
3086 
3087 	if (ss->rx_done.entry != NULL)
3088 		mxge_dma_free(&ss->rx_done.dma);
3089 	ss->rx_done.entry = NULL;
3090 
3091 	if (ss->tx.req_bytes != NULL)
3092 		free(ss->tx.req_bytes, M_DEVBUF);
3093 	ss->tx.req_bytes = NULL;
3094 
3095 	if (ss->tx.seg_list != NULL)
3096 		free(ss->tx.seg_list, M_DEVBUF);
3097 	ss->tx.seg_list = NULL;
3098 
3099 	if (ss->rx_small.shadow != NULL)
3100 		free(ss->rx_small.shadow, M_DEVBUF);
3101 	ss->rx_small.shadow = NULL;
3102 
3103 	if (ss->rx_big.shadow != NULL)
3104 		free(ss->rx_big.shadow, M_DEVBUF);
3105 	ss->rx_big.shadow = NULL;
3106 
3107 	if (ss->tx.info != NULL) {
3108 		if (ss->tx.dmat != NULL) {
3109 			for (i = 0; i <= ss->tx.mask; i++) {
3110 				bus_dmamap_destroy(ss->tx.dmat,
3111 						   ss->tx.info[i].map);
3112 			}
3113 			bus_dma_tag_destroy(ss->tx.dmat);
3114 		}
3115 		free(ss->tx.info, M_DEVBUF);
3116 	}
3117 	ss->tx.info = NULL;
3118 
3119 	if (ss->rx_small.info != NULL) {
3120 		if (ss->rx_small.dmat != NULL) {
3121 			for (i = 0; i <= ss->rx_small.mask; i++) {
3122 				bus_dmamap_destroy(ss->rx_small.dmat,
3123 						   ss->rx_small.info[i].map);
3124 			}
3125 			bus_dmamap_destroy(ss->rx_small.dmat,
3126 					   ss->rx_small.extra_map);
3127 			bus_dma_tag_destroy(ss->rx_small.dmat);
3128 		}
3129 		free(ss->rx_small.info, M_DEVBUF);
3130 	}
3131 	ss->rx_small.info = NULL;
3132 
3133 	if (ss->rx_big.info != NULL) {
3134 		if (ss->rx_big.dmat != NULL) {
3135 			for (i = 0; i <= ss->rx_big.mask; i++) {
3136 				bus_dmamap_destroy(ss->rx_big.dmat,
3137 						   ss->rx_big.info[i].map);
3138 			}
3139 			bus_dmamap_destroy(ss->rx_big.dmat,
3140 					   ss->rx_big.extra_map);
3141 			bus_dma_tag_destroy(ss->rx_big.dmat);
3142 		}
3143 		free(ss->rx_big.info, M_DEVBUF);
3144 	}
3145 	ss->rx_big.info = NULL;
3146 }
3147 
3148 static void
3149 mxge_free_rings(mxge_softc_t *sc)
3150 {
3151 	int slice;
3152 
3153 	for (slice = 0; slice < sc->num_slices; slice++)
3154 		mxge_free_slice_rings(&sc->ss[slice]);
3155 }
3156 
3157 static int
3158 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3159 		       int tx_ring_entries)
3160 {
3161 	mxge_softc_t *sc = ss->sc;
3162 	size_t bytes;
3163 	int err, i;
3164 
3165 	err = ENOMEM;
3166 
3167 	/* allocate per-slice receive resources */
3168 
3169 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
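	/* the completion (rx_done) ring must hold events from both the
	   small and big receive rings, hence twice the entries */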
3170 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3171 
3172 	/* allocate the rx shadow rings */
3173 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3174 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3175 	if (ss->rx_small.shadow == NULL)
3176 		return err;
3177 
3178 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3179 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3180 	if (ss->rx_big.shadow == NULL)
3181 		return err;
3182 
3183 	/* allocate the rx host info rings */
3184 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3185 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3186 	if (ss->rx_small.info == NULL)
3187 		return err;
3188 
3189 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3190 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3191 	if (ss->rx_big.info == NULL)
3192 		return err;
3193 
3194 	/* allocate the rx busdma resources */
3195 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3196 				 1,			/* alignment */
3197 				 4096,			/* boundary */
3198 				 BUS_SPACE_MAXADDR,	/* low */
3199 				 BUS_SPACE_MAXADDR,	/* high */
3200 				 NULL, NULL,		/* filter */
3201 				 MHLEN,			/* maxsize */
3202 				 1,			/* num segs */
3203 				 MHLEN,			/* maxsegsize */
3204 				 BUS_DMA_ALLOCNOW,	/* flags */
3205 				 NULL, NULL,		/* lock */
3206 				 &ss->rx_small.dmat);	/* tag */
3207 	if (err != 0) {
3208 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3209 			      err);
3210 		return err;
3211 	}
3212 
3213 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3214 				 1,			/* alignment */
3215 #if MXGE_VIRT_JUMBOS
3216 				 4096,			/* boundary */
3217 #else
3218 				 0,			/* boundary */
3219 #endif
3220 				 BUS_SPACE_MAXADDR,	/* low */
3221 				 BUS_SPACE_MAXADDR,	/* high */
3222 				 NULL, NULL,		/* filter */
3223 				 3*4096,		/* maxsize */
3224 #if MXGE_VIRT_JUMBOS
3225 				 3,			/* num segs */
3226 				 4096,			/* maxsegsize*/
3227 #else
3228 				 1,			/* num segs */
3229 				 MJUM9BYTES,		/* maxsegsize*/
3230 #endif
3231 				 BUS_DMA_ALLOCNOW,	/* flags */
3232 				 NULL, NULL,		/* lock */
3233 				 &ss->rx_big.dmat);	/* tag */
3234 	if (err != 0) {
3235 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3236 			      err);
3237 		return err;
3238 	}
3239 	for (i = 0; i <= ss->rx_small.mask; i++) {
3240 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3241 					&ss->rx_small.info[i].map);
3242 		if (err != 0) {
3243 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3244 				      err);
3245 			return err;
3246 		}
3247 	}
3248 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3249 				&ss->rx_small.extra_map);
3250 	if (err != 0) {
3251 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3252 			      err);
3253 		return err;
3254 	}
3255 
3256 	for (i = 0; i <= ss->rx_big.mask; i++) {
3257 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3258 					&ss->rx_big.info[i].map);
3259 		if (err != 0) {
3260 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3261 				      err);
3262 			return err;
3263 		}
3264 	}
3265 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3266 				&ss->rx_big.extra_map);
3267 	if (err != 0) {
3268 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3269 			      err);
3270 		return err;
3271 	}
3272 
3273 	/* now allocate TX resources */
3274 
3275 #ifndef IFNET_BUF_RING
3276 	/* only use a single TX ring for now */
3277 	if (ss != ss->sc->ss)
3278 		return 0;
3279 #endif
3280 
3281 	ss->tx.mask = tx_ring_entries - 1;
3282 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3283 
3284 
3285 	/* allocate the tx request copy block */
3286 	bytes = 8 +
3287 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3288 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3289 	if (ss->tx.req_bytes == NULL)
3290 		return err;
3291 	/* ensure req_list entries are aligned to 8 bytes */
3292 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3293 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3294 
3295 	/* allocate the tx busdma segment list */
3296 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3297 	ss->tx.seg_list = (bus_dma_segment_t *)
3298 		malloc(bytes, M_DEVBUF, M_WAITOK);
3299 	if (ss->tx.seg_list == NULL)
3300 		return err;
3301 
3302 	/* allocate the tx host info ring */
3303 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3304 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3305 	if (ss->tx.info == NULL)
3306 		return err;
3307 
3308 	/* allocate the tx busdma resources */
3309 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3310 				 1,			/* alignment */
3311 				 sc->tx_boundary,	/* boundary */
3312 				 BUS_SPACE_MAXADDR,	/* low */
3313 				 BUS_SPACE_MAXADDR,	/* high */
3314 				 NULL, NULL,		/* filter */
3315 				 65536 + 256,		/* maxsize */
3316 				 ss->tx.max_desc - 2,	/* num segs */
3317 				 sc->tx_boundary,	/* maxsegsz */
3318 				 BUS_DMA_ALLOCNOW,	/* flags */
3319 				 NULL, NULL,		/* lock */
3320 				 &ss->tx.dmat);		/* tag */
3321 
3322 	if (err != 0) {
3323 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3324 			      err);
3325 		return err;
3326 	}
3327 
3328 	/* now use these tags to setup dmamaps for each slot
3329 	   in the ring */
3330 	for (i = 0; i <= ss->tx.mask; i++) {
3331 		err = bus_dmamap_create(ss->tx.dmat, 0,
3332 					&ss->tx.info[i].map);
3333 		if (err != 0) {
3334 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3335 				      err);
3336 			return err;
3337 		}
3338 	}
3339 	return 0;
3340 
3341 }
3342 
3343 static int
3344 mxge_alloc_rings(mxge_softc_t *sc)
3345 {
3346 	mxge_cmd_t cmd;
3347 	int tx_ring_size;
3348 	int tx_ring_entries, rx_ring_entries;
3349 	int err, slice;
3350 
3351 	/* get ring sizes */
3352 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3353 	tx_ring_size = cmd.data0;
3354 	if (err != 0) {
3355 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3356 		goto abort;
3357 	}
3358 
3359 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3360 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3361 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3362 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3363 	IFQ_SET_READY(&sc->ifp->if_snd);
3364 
3365 	for (slice = 0; slice < sc->num_slices; slice++) {
3366 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3367 					     rx_ring_entries,
3368 					     tx_ring_entries);
3369 		if (err != 0)
3370 			goto abort;
3371 	}
3372 	return 0;
3373 
3374 abort:
3375 	mxge_free_rings(sc);
3376 	return err;
3377 
3378 }
3379 
3380 
3381 static void
3382 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3383 {
3384 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
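	/* e.g. a 9000-byte MTU needs 9000 + 14 + 4 + 2 = 9020 bytes,
	   which overflows MJUMPAGESIZE and selects the 9KB-cluster
	   case below (assuming the usual 2-byte MXGEFW_PAD) */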
3385 
3386 	if (bufsize < MCLBYTES) {
3387 		/* easy, everything fits in a single buffer */
3388 		*big_buf_size = MCLBYTES;
3389 		*cl_size = MCLBYTES;
3390 		*nbufs = 1;
3391 		return;
3392 	}
3393 
3394 	if (bufsize < MJUMPAGESIZE) {
3395 		/* still easy, everything still fits in a single buffer */
3396 		*big_buf_size = MJUMPAGESIZE;
3397 		*cl_size = MJUMPAGESIZE;
3398 		*nbufs = 1;
3399 		return;
3400 	}
3401 #if MXGE_VIRT_JUMBOS
3402 	/* now we need to use virtually contiguous buffers */
3403 	*cl_size = MJUM9BYTES;
3404 	*big_buf_size = 4096;
3405 	*nbufs = mtu / 4096 + 1;
3406 	/* needs to be a power of two, so round up */
3407 	if (*nbufs == 3)
3408 		*nbufs = 4;
3409 #else
3410 	*cl_size = MJUM9BYTES;
3411 	*big_buf_size = MJUM9BYTES;
3412 	*nbufs = 1;
3413 #endif
3414 }
3415 
3416 static int
3417 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3418 {
3419 	mxge_softc_t *sc;
3420 	mxge_cmd_t cmd;
3421 	bus_dmamap_t map;
3422 	struct lro_entry *lro_entry;
3423 	int err, i, slice;
3424 
3425 
3426 	sc = ss->sc;
3427 	slice = ss - sc->ss;
3428 
3429 	SLIST_INIT(&ss->lro_free);
3430 	SLIST_INIT(&ss->lro_active);
3431 
3432 	for (i = 0; i < sc->lro_cnt; i++) {
3433 		lro_entry = (struct lro_entry *)
3434 			malloc(sizeof (*lro_entry), M_DEVBUF,
3435 			       M_NOWAIT | M_ZERO);
3436 		if (lro_entry == NULL) {
3437 			sc->lro_cnt = i;
3438 			break;
3439 		}
3440 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3441 	}
3442 	/* get the lanai pointers to the send and receive rings */
3443 
3444 	err = 0;
3445 #ifndef IFNET_BUF_RING
3446 	/* We currently only send from the first slice */
3447 	if (slice == 0) {
3448 #endif
3449 		cmd.data0 = slice;
3450 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3451 		ss->tx.lanai =
3452 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3453 		ss->tx.send_go = (volatile uint32_t *)
3454 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3455 		ss->tx.send_stop = (volatile uint32_t *)
3456 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3457 #ifndef IFNET_BUF_RING
3458 	}
3459 #endif
3460 	cmd.data0 = slice;
3461 	err |= mxge_send_cmd(sc,
3462 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3463 	ss->rx_small.lanai =
3464 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3465 	cmd.data0 = slice;
3466 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3467 	ss->rx_big.lanai =
3468 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3469 
3470 	if (err != 0) {
3471 		device_printf(sc->dev,
3472 			      "failed to get ring sizes or locations\n");
3473 		return EIO;
3474 	}
3475 
3476 	/* stock receive rings */
3477 	for (i = 0; i <= ss->rx_small.mask; i++) {
3478 		map = ss->rx_small.info[i].map;
3479 		err = mxge_get_buf_small(ss, map, i);
3480 		if (err) {
3481 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3482 				      i, ss->rx_small.mask + 1);
3483 			return ENOMEM;
3484 		}
3485 	}
3486 	for (i = 0; i <= ss->rx_big.mask; i++) {
3487 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3488 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3489 	}
3490 	ss->rx_big.nbufs = nbufs;
3491 	ss->rx_big.cl_size = cl_size;
3492 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3493 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3494 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3495 		map = ss->rx_big.info[i].map;
3496 		err = mxge_get_buf_big(ss, map, i);
3497 		if (err) {
3498 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3499 				      i, ss->rx_big.mask + 1);
3500 			return ENOMEM;
3501 		}
3502 	}
3503 	return 0;
3504 }
3505 
3506 static int
3507 mxge_open(mxge_softc_t *sc)
3508 {
3509 	mxge_cmd_t cmd;
3510 	int err, big_bytes, nbufs, slice, cl_size, i;
3511 	bus_addr_t bus;
3512 	volatile uint8_t *itable;
3513 	struct mxge_slice_state *ss;
3514 
3515 	/* Copy the MAC address in case it was overridden */
3516 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3517 
3518 	err = mxge_reset(sc, 1);
3519 	if (err != 0) {
3520 		device_printf(sc->dev, "failed to reset\n");
3521 		return EIO;
3522 	}
3523 
3524 	if (sc->num_slices > 1) {
3525 		/* setup the indirection table */
3526 		cmd.data0 = sc->num_slices;
3527 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3528 				    &cmd);
3529 
3530 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3531 				     &cmd);
3532 		if (err != 0) {
3533 			device_printf(sc->dev,
3534 				      "failed to setup rss tables\n");
3535 			return err;
3536 		}
3537 
3538 		/* just enable an identity mapping */
3539 		itable = sc->sram + cmd.data0;
3540 		for (i = 0; i < sc->num_slices; i++)
3541 			itable[i] = (uint8_t)i;
3542 
3543 		cmd.data0 = 1;
3544 		cmd.data1 = mxge_rss_hash_type;
3545 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3546 		if (err != 0) {
3547 			device_printf(sc->dev, "failed to enable slices\n");
3548 			return err;
3549 		}
3550 	}
3551 
3552 
3553 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3554 
3555 	cmd.data0 = nbufs;
3556 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3557 			    &cmd);
3558 	/* error is only meaningful if we're trying to set
3559 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3560 	if (err && nbufs > 1) {
3561 		device_printf(sc->dev,
3562 			      "Failed to set always-use-n to %d\n",
3563 			      nbufs);
3564 		return EIO;
3565 	}
3566 	/* Give the firmware the mtu and the big and small buffer
3567 	   sizes.  The firmware wants the big buf size to be a power
3568 	   of two. Luckily, FreeBSD's clusters are powers of two */
3569 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3570 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3571 	cmd.data0 = MHLEN - MXGEFW_PAD;
3572 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3573 			     &cmd);
3574 	cmd.data0 = big_bytes;
3575 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3576 
3577 	if (err != 0) {
3578 		device_printf(sc->dev, "failed to setup params\n");
3579 		goto abort;
3580 	}
3581 
3582 	/* Now hand the firmware the pointer to the stats block */
3583 	for (slice = 0;
3584 #ifdef IFNET_BUF_RING
3585 	     slice < sc->num_slices;
3586 #else
3587 	     slice < 1;
3588 #endif
3589 	     slice++) {
3590 		ss = &sc->ss[slice];
3591 		cmd.data0 =
3592 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3593 		cmd.data1 =
3594 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3595 		cmd.data2 = sizeof(struct mcp_irq_data);
3596 		cmd.data2 |= (slice << 16);
3597 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3598 	}
3599 
3600 	if (err != 0) {
3601 		bus = sc->ss->fw_stats_dma.bus_addr;
3602 		bus += offsetof(struct mcp_irq_data, send_done_count);
3603 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3604 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3605 		err = mxge_send_cmd(sc,
3606 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3607 				    &cmd);
3608 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3609 		sc->fw_multicast_support = 0;
3610 	} else {
3611 		sc->fw_multicast_support = 1;
3612 	}
3613 
3614 	if (err != 0) {
3615 		device_printf(sc->dev, "failed to setup params\n");
3616 		goto abort;
3617 	}
3618 
3619 	for (slice = 0; slice < sc->num_slices; slice++) {
3620 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3621 		if (err != 0) {
3622 			device_printf(sc->dev, "couldn't open slice %d\n",
3623 				      slice);
3624 			goto abort;
3625 		}
3626 	}
3627 
3628 	/* Finally, start the firmware running */
3629 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3630 	if (err) {
3631 		device_printf(sc->dev, "Couldn't bring up link\n");
3632 		goto abort;
3633 	}
3634 #ifdef IFNET_BUF_RING
3635 	for (slice = 0; slice < sc->num_slices; slice++) {
3636 		ss = &sc->ss[slice];
3637 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3638 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3639 	}
3640 #endif
3641 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3642 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3643 
3644 	return 0;
3645 
3646 
3647 abort:
3648 	mxge_free_mbufs(sc);
3649 
3650 	return err;
3651 }
3652 
3653 static int
3654 mxge_close(mxge_softc_t *sc, int down)
3655 {
3656 	mxge_cmd_t cmd;
3657 	int err, old_down_cnt;
3658 #ifdef IFNET_BUF_RING
3659 	struct mxge_slice_state *ss;
3660 	int slice;
3661 #endif
3662 
3663 #ifdef IFNET_BUF_RING
3664 	for (slice = 0; slice < sc->num_slices; slice++) {
3665 		ss = &sc->ss[slice];
3666 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3667 	}
3668 #endif
3669 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3670 	if (!down) {
3671 		old_down_cnt = sc->down_cnt;
3672 		wmb();
3673 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3674 		if (err) {
3675 			device_printf(sc->dev,
3676 				      "Couldn't bring down link\n");
3677 		}
3678 		if (old_down_cnt == sc->down_cnt) {
3679 			/* wait for down irq */
3680 			DELAY(10 * sc->intr_coal_delay);
3681 		}
3682 		wmb();
3683 		if (old_down_cnt == sc->down_cnt) {
3684 			device_printf(sc->dev, "never got down irq\n");
3685 		}
3686 	}
3687 	mxge_free_mbufs(sc);
3688 
3689 	return 0;
3690 }
3691 
3692 static void
3693 mxge_setup_cfg_space(mxge_softc_t *sc)
3694 {
3695 	device_t dev = sc->dev;
3696 	int reg;
3697 	uint16_t cmd, lnk, pectl;
3698 
3699 	/* find the PCIe link width and set max read request to 4KB */
3700 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3701 		lnk = pci_read_config(dev, reg + 0x12, 2);
3702 		sc->link_width = (lnk >> 4) & 0x3f;
3703 
3704 		if (sc->pectl == 0) {
3705 			pectl = pci_read_config(dev, reg + 0x8, 2);
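			/* max read request size lives in bits 14:12 of
			   the PCIe device control register; encoding 5
			   selects 4096 bytes */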
3706 			pectl = (pectl & ~0x7000) | (5 << 12);
3707 			pci_write_config(dev, reg + 0x8, pectl, 2);
3708 			sc->pectl = pectl;
3709 		} else {
3710 			/* restore saved pectl after watchdog reset */
3711 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3712 		}
3713 	}
3714 
3715 	/* Enable DMA and Memory space access */
3716 	pci_enable_busmaster(dev);
3717 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3718 	cmd |= PCIM_CMD_MEMEN;
3719 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3720 }
3721 
3722 static uint32_t
3723 mxge_read_reboot(mxge_softc_t *sc)
3724 {
3725 	device_t dev = sc->dev;
3726 	uint32_t vs;
3727 
3728 	/* find the vendor specific offset */
3729 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3730 		device_printf(sc->dev,
3731 			      "could not find vendor specific offset\n");
3732 		return (uint32_t)-1;
3733 	}
3734 	/* enable read32 mode */
3735 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3736 	/* tell NIC which register to read */
3737 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3738 	return (pci_read_config(dev, vs + 0x14, 4));
3739 }
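/*
 * The vendor specific capability is apparently a small window into
 * NIC address space: a mode byte at vs + 0x10 (0x3 selects 32-bit
 * reads), an address register at vs + 0x18 and a data port at
 * vs + 0x14.  0xfffffff0 is evidently where the firmware publishes
 * its reboot status word.
 */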
3740 
3741 static void
3742 mxge_watchdog_reset(mxge_softc_t *sc)
3743 {
3744 	struct pci_devinfo *dinfo;
3745 	struct mxge_slice_state *ss;
3746 	int err, running, s, num_tx_slices = 1;
3747 	uint32_t reboot;
3748 	uint16_t cmd;
3749 
3750 	err = ENXIO;
3751 
3752 	device_printf(sc->dev, "Watchdog reset!\n");
3753 
3754 	/*
3755 	 * check to see if the NIC rebooted.  If it did, then all of
3756 	 * PCI config space has been reset, and things like the
3757 	 * busmaster bit will be zero.  If this is the case, then we
3758 	 * must restore PCI config space before the NIC can be used
3759 	 * again
3760 	 */
3761 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3762 	if (cmd == 0xffff) {
3763 		/*
3764 		 * maybe the watchdog caught the NIC rebooting; wait
3765 		 * up to 100ms for it to finish.  If it does not come
3766 		 * back, then give up
3767 		 */
3768 		DELAY(1000*100);
3769 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3770 		if (cmd == 0xffff) {
3771 			device_printf(sc->dev, "NIC disappeared!\n");
3772 		}
3773 	}
3774 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3775 		/* print the reboot status */
3776 		reboot = mxge_read_reboot(sc);
3777 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3778 			      reboot);
3779 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3780 		if (running) {
3781 
3782 			/*
3783 			 * quiesce NIC so that TX routines will not try to
3784 			 * xmit after restoration of BAR
3785 			 */
3786 
3787 			/* Mark the link as down */
3788 			if (sc->link_state) {
3789 				sc->link_state = 0;
3790 				if_link_state_change(sc->ifp,
3791 						     LINK_STATE_DOWN);
3792 			}
3793 #ifdef IFNET_BUF_RING
3794 			num_tx_slices = sc->num_slices;
3795 #endif
3796 			/* grab all TX locks to ensure no tx  */
3797 			for (s = 0; s < num_tx_slices; s++) {
3798 				ss = &sc->ss[s];
3799 				mtx_lock(&ss->tx.mtx);
3800 			}
3801 			mxge_close(sc, 1);
3802 		}
3803 		/* restore PCI configuration space */
3804 		dinfo = device_get_ivars(sc->dev);
3805 		pci_cfg_restore(sc->dev, dinfo);
3806 
3807 		/* and redo any changes we made to our config space */
3808 		mxge_setup_cfg_space(sc);
3809 
3810 		/* reload f/w */
3811 		err = mxge_load_firmware(sc, 0);
3812 		if (err) {
3813 			device_printf(sc->dev,
3814 				      "Unable to re-load f/w\n");
3815 		}
3816 		if (running) {
3817 			if (!err)
3818 				err = mxge_open(sc);
3819 			/* release all TX locks */
3820 			for (s = 0; s < num_tx_slices; s++) {
3821 				ss = &sc->ss[s];
3822 #ifdef IFNET_BUF_RING
3823 				mxge_start_locked(ss);
3824 #endif
3825 				mtx_unlock(&ss->tx.mtx);
3826 			}
3827 		}
3828 		sc->watchdog_resets++;
3829 	} else {
3830 		device_printf(sc->dev,
3831 			      "NIC did not reboot, not resetting\n");
3832 		err = 0;
3833 	}
3834 	if (err) {
3835 		device_printf(sc->dev, "watchdog reset failed\n");
3836 	} else {
3837 		if (sc->dying == 2)
3838 			sc->dying = 0;
3839 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3840 	}
3841 }
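/*
 * The ordering above matters: transmit is quiesced with every
 * slice's tx lock held so nothing issues MMIO through the soon to be
 * restored BAR, then config space is restored, the firmware is
 * reloaded, and only after a successful mxge_open() are the tx locks
 * dropped and the rings kicked again.
 */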
3842 
3843 static void
3844 mxge_watchdog_task(void *arg, int pending)
3845 {
3846 	mxge_softc_t *sc = arg;
3847 
3848 
3849 	mtx_lock(&sc->driver_mtx);
3850 	mxge_watchdog_reset(sc);
3851 	mtx_unlock(&sc->driver_mtx);
3852 }
3853 
3854 static void
3855 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3856 {
3857 	tx = &sc->ss[slice].tx;
3858 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3859 	device_printf(sc->dev,
3860 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3861 		      tx->req, tx->done, tx->queue_active);
3862 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3863 		      tx->activate, tx->deactivate);
3864 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3865 		      tx->pkt_done,
3866 		      be32toh(sc->ss->fw_stats->send_done_count));
3867 }
3868 
3869 static int
3870 mxge_watchdog(mxge_softc_t *sc)
3871 {
3872 	mxge_tx_ring_t *tx;
3873 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3874 	int i, err = 0;
3875 
3876 	/* see if we have outstanding transmits, which
3877 	   have been pending for more than mxge_ticks */
3878 	for (i = 0;
3879 #ifdef IFNET_BUF_RING
3880 	     (i < sc->num_slices) && (err == 0);
3881 #else
3882 	     (i < 1) && (err == 0);
3883 #endif
3884 	     i++) {
3885 		tx = &sc->ss[i].tx;
3886 		if (tx->req != tx->done &&
3887 		    tx->watchdog_req != tx->watchdog_done &&
3888 		    tx->done == tx->watchdog_done) {
3889 			/* check for pause blocking before resetting */
3890 			if (tx->watchdog_rx_pause == rx_pause) {
3891 				mxge_warn_stuck(sc, tx, i);
3892 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3893 				return (ENXIO);
3894 			}
3895 			else
3896 				device_printf(sc->dev, "Flow control blocking "
3897 					      "xmits, check link partner\n");
3898 		}
3899 
3900 		tx->watchdog_req = tx->req;
3901 		tx->watchdog_done = tx->done;
3902 		tx->watchdog_rx_pause = rx_pause;
3903 	}
3904 
3905 	if (sc->need_media_probe)
3906 		mxge_media_probe(sc);
3907 	return (err);
3908 }
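/*
 * To restate the stuck test above: a ring is suspect when requests
 * were already outstanding at the previous tick (watchdog_req !=
 * watchdog_done) and no completions have arrived since (done ==
 * watchdog_done) even though work remains (req != done).  An
 * unchanged dropped_pause counter means pause frames cannot explain
 * the stall, so the reset task is queued; otherwise the link partner
 * is assumed to be flow-controlling us and only a warning is logged.
 */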
3909 
3910 static u_long
3911 mxge_update_stats(mxge_softc_t *sc)
3912 {
3913 	struct mxge_slice_state *ss;
3914 	u_long pkts = 0;
3915 	u_long ipackets = 0;
3916 	u_long opackets = 0;
3917 #ifdef IFNET_BUF_RING
3918 	u_long obytes = 0;
3919 	u_long omcasts = 0;
3920 	u_long odrops = 0;
3921 #endif
3922 	u_long oerrors = 0;
3923 	int slice;
3924 
3925 	for (slice = 0; slice < sc->num_slices; slice++) {
3926 		ss = &sc->ss[slice];
3927 		ipackets += ss->ipackets;
3928 		opackets += ss->opackets;
3929 #ifdef IFNET_BUF_RING
3930 		obytes += ss->obytes;
3931 		omcasts += ss->omcasts;
3932 		odrops += ss->tx.br->br_drops;
3933 #endif
3934 		oerrors += ss->oerrors;
3935 	}
3936 	pkts = (ipackets - sc->ifp->if_ipackets);
3937 	pkts += (opackets - sc->ifp->if_opackets);
3938 	sc->ifp->if_ipackets = ipackets;
3939 	sc->ifp->if_opackets = opackets;
3940 #ifdef IFNET_BUF_RING
3941 	sc->ifp->if_obytes = obytes;
3942 	sc->ifp->if_omcasts = omcasts;
3943 	sc->ifp->if_snd.ifq_drops = odrops;
3944 #endif
3945 	sc->ifp->if_oerrors = oerrors;
3946 	return pkts;
3947 }
3948 
3949 static void
3950 mxge_tick(void *arg)
3951 {
3952 	mxge_softc_t *sc = arg;
3953 	u_long pkts = 0;
3954 	int err = 0;
3955 	int running, ticks;
3956 	uint16_t cmd;
3957 
3958 	ticks = mxge_ticks;
3959 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3960 	if (running) {
3961 		/* aggregate stats from different slices */
3962 		pkts = mxge_update_stats(sc);
3963 		if (!sc->watchdog_countdown) {
3964 			err = mxge_watchdog(sc);
3965 			sc->watchdog_countdown = 4;
3966 		}
3967 		sc->watchdog_countdown--;
3968 	}
3969 	if (pkts == 0) {
3970 		/* ensure NIC did not suffer h/w fault while idle */
3971 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3972 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3973 			sc->dying = 2;
3974 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3975 			err = ENXIO;
3976 		}
3977 		/* look less often if NIC is idle */
3978 		ticks *= 4;
3979 	}
3980 
3981 	if (err == 0)
3982 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3983 
3984 }
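/*
 * mxge_update_stats() returns the packet delta since the previous
 * tick, so pkts == 0 means "no traffic in the last interval".  The
 * idle case is used both to stretch the timer to 4 * mxge_ticks and
 * to sanity check PCI config space: a cleared busmaster bit implies
 * the NIC rebooted or fell off the bus while nobody was watching.
 */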
3985 
3986 static int
3987 mxge_media_change(struct ifnet *ifp)
3988 {
3989 	return EINVAL;
3990 }
3991 
3992 static int
3993 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3994 {
3995 	struct ifnet *ifp = sc->ifp;
3996 	int real_mtu, old_mtu;
3997 	int err = 0;
3998 
3999 
4000 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4001 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4002 		return EINVAL;
4003 	mtx_lock(&sc->driver_mtx);
4004 	old_mtu = ifp->if_mtu;
4005 	ifp->if_mtu = mtu;
4006 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4007 		mxge_close(sc, 0);
4008 		err = mxge_open(sc);
4009 		if (err != 0) {
4010 			ifp->if_mtu = old_mtu;
4011 			mxge_close(sc, 0);
4012 			(void) mxge_open(sc);
4013 		}
4014 	}
4015 	mtx_unlock(&sc->driver_mtx);
4016 	return err;
4017 }
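/*
 * real_mtu above is the worst case wire overhead implied by the
 * requested MTU: payload plus the Ethernet header plus a possible
 * 802.1Q tag.  Changing the MTU of a running interface takes a full
 * close/open cycle because receive buffer sizes are derived from the
 * MTU; if the reopen fails, the old MTU is restored with a second
 * cycle as a best effort.
 */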
4018 
4019 static void
4020 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4021 {
4022 	mxge_softc_t *sc = ifp->if_softc;
4023 
4024 
4025 	if (sc == NULL)
4026 		return;
4027 	ifmr->ifm_status = IFM_AVALID;
4028 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4029 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
4030 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
4031 }
4032 
4033 static int
4034 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4035 {
4036 	mxge_softc_t *sc = ifp->if_softc;
4037 	struct ifreq *ifr = (struct ifreq *)data;
4038 	int err, mask;
4039 
4040 	err = 0;
4041 	switch (command) {
4042 	case SIOCSIFADDR:
4043 	case SIOCGIFADDR:
4044 		err = ether_ioctl(ifp, command, data);
4045 		break;
4046 
4047 	case SIOCSIFMTU:
4048 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4049 		break;
4050 
4051 	case SIOCSIFFLAGS:
4052 		mtx_lock(&sc->driver_mtx);
4053 		if (sc->dying) {
4054 			mtx_unlock(&sc->driver_mtx);
4055 			return EINVAL;
4056 		}
4057 		if (ifp->if_flags & IFF_UP) {
4058 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4059 				err = mxge_open(sc);
4060 			} else {
4061 				/* take care of promisc and allmulti
4062 				   flag changes */
4063 				mxge_change_promisc(sc,
4064 						    ifp->if_flags & IFF_PROMISC);
4065 				mxge_set_multicast_list(sc);
4066 			}
4067 		} else {
4068 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4069 				mxge_close(sc, 0);
4070 			}
4071 		}
4072 		mtx_unlock(&sc->driver_mtx);
4073 		break;
4074 
4075 	case SIOCADDMULTI:
4076 	case SIOCDELMULTI:
4077 		mtx_lock(&sc->driver_mtx);
4078 		mxge_set_multicast_list(sc);
4079 		mtx_unlock(&sc->driver_mtx);
4080 		break;
4081 
4082 	case SIOCSIFCAP:
4083 		mtx_lock(&sc->driver_mtx);
4084 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4085 		if (mask & IFCAP_TXCSUM) {
4086 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4087 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4088 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4089 						      | CSUM_TSO);
4090 			} else {
4091 				ifp->if_capenable |= IFCAP_TXCSUM;
4092 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4093 			}
4094 		} else if (mask & IFCAP_RXCSUM) {
4095 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4096 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4097 				sc->csum_flag = 0;
4098 			} else {
4099 				ifp->if_capenable |= IFCAP_RXCSUM;
4100 				sc->csum_flag = 1;
4101 			}
4102 		}
4103 		if (mask & IFCAP_TSO4) {
4104 			if (IFCAP_TSO4 & ifp->if_capenable) {
4105 				ifp->if_capenable &= ~IFCAP_TSO4;
4106 				ifp->if_hwassist &= ~CSUM_TSO;
4107 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4108 				ifp->if_capenable |= IFCAP_TSO4;
4109 				ifp->if_hwassist |= CSUM_TSO;
4110 			} else {
4111 				printf("mxge requires tx checksum offload"
4112 				       " be enabled to use TSO\n");
4113 				err = EINVAL;
4114 			}
4115 		}
4116 		if (mask & IFCAP_LRO) {
4117 			if (IFCAP_LRO & ifp->if_capenable)
4118 				err = mxge_change_lro_locked(sc, 0);
4119 			else
4120 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4121 		}
4122 		if (mask & IFCAP_VLAN_HWTAGGING)
4123 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4124 		mtx_unlock(&sc->driver_mtx);
4125 		VLAN_CAPABILITIES(ifp);
4126 
4127 		break;
4128 
4129 	case SIOCGIFMEDIA:
4130 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4131 				    &sc->media, command);
4132 		break;
4133 
4134 	default:
4135 		err = ENOTTY;
4136 	}
4137 	return err;
4138 }
4139 
4140 static void
4141 mxge_fetch_tunables(mxge_softc_t *sc)
4142 {
4143 
4144 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4145 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4146 			  &mxge_flow_control);
4147 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4148 			  &mxge_intr_coal_delay);
4149 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4150 			  &mxge_nvidia_ecrc_enable);
4151 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4152 			  &mxge_force_firmware);
4153 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4154 			  &mxge_deassert_wait);
4155 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4156 			  &mxge_verbose);
4157 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4158 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4159 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4160 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4161 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4162 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4163 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4164 	if (sc->lro_cnt != 0)
4165 		mxge_lro_cnt = sc->lro_cnt;
4166 
4167 	if (bootverbose)
4168 		mxge_verbose = 1;
4169 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4170 		mxge_intr_coal_delay = 30;
4171 	if (mxge_ticks == 0)
4172 		mxge_ticks = hz / 2;
4173 	sc->pause = mxge_flow_control;
4174 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4175 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4176 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4177 	}
4178 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4179 	    mxge_initial_mtu < ETHER_MIN_LEN)
4180 		mxge_initial_mtu = ETHERMTU_JUMBO;
4181 
4182 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4183 		mxge_throttle = MXGE_MAX_THROTTLE;
4184 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4185 		mxge_throttle = MXGE_MIN_THROTTLE;
4186 	sc->throttle = mxge_throttle;
4187 }
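/*
 * All of the hw.mxge.* knobs above are loader tunables.  A
 * /boot/loader.conf sketch (values purely illustrative, not
 * recommendations):
 *
 *	hw.mxge.max_slices="4"		# cap RSS slices; -1 = one per CPU
 *	hw.mxge.intr_coal_delay="30"	# usecs, clamped to [0, 10000]
 *	hw.mxge.initial_mtu="9000"	# clamped to the jumbo MTU range
 *	hw.mxge.lro_cnt="8"		# per-interface LRO entries
 */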
4188 
4189 
4190 static void
4191 mxge_free_slices(mxge_softc_t *sc)
4192 {
4193 	struct mxge_slice_state *ss;
4194 	int i;
4195 
4196 
4197 	if (sc->ss == NULL)
4198 		return;
4199 
4200 	for (i = 0; i < sc->num_slices; i++) {
4201 		ss = &sc->ss[i];
4202 		if (ss->fw_stats != NULL) {
4203 			mxge_dma_free(&ss->fw_stats_dma);
4204 			ss->fw_stats = NULL;
4205 #ifdef IFNET_BUF_RING
4206 			if (ss->tx.br != NULL) {
4207 				drbr_free(ss->tx.br, M_DEVBUF);
4208 				ss->tx.br = NULL;
4209 			}
4210 #endif
4211 			mtx_destroy(&ss->tx.mtx);
4212 		}
4213 		if (ss->rx_done.entry != NULL) {
4214 			mxge_dma_free(&ss->rx_done.dma);
4215 			ss->rx_done.entry = NULL;
4216 		}
4217 	}
4218 	free(sc->ss, M_DEVBUF);
4219 	sc->ss = NULL;
4220 }
4221 
4222 static int
4223 mxge_alloc_slices(mxge_softc_t *sc)
4224 {
4225 	mxge_cmd_t cmd;
4226 	struct mxge_slice_state *ss;
4227 	size_t bytes;
4228 	int err, i, max_intr_slots;
4229 
4230 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4231 	if (err != 0) {
4232 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4233 		return err;
4234 	}
4235 	sc->rx_ring_size = cmd.data0;
4236 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4237 
4238 	bytes = sizeof (*sc->ss) * sc->num_slices;
4239 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4240 	if (sc->ss == NULL)
4241 		return (ENOMEM);
4242 	for (i = 0; i < sc->num_slices; i++) {
4243 		ss = &sc->ss[i];
4244 
4245 		ss->sc = sc;
4246 
4247 		/* allocate per-slice rx interrupt queues */
4248 
4249 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4250 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4251 		if (err != 0)
4252 			goto abort;
4253 		ss->rx_done.entry = ss->rx_done.dma.addr;
4254 		bzero(ss->rx_done.entry, bytes);
4255 
4256 		/*
4257 		 * allocate the per-slice firmware stats; stats
4258 		 * (including tx) are used only on the first
4259 		 * slice for now
4260 		 */
4261 #ifndef IFNET_BUF_RING
4262 		if (i > 0)
4263 			continue;
4264 #endif
4265 
4266 		bytes = sizeof (*ss->fw_stats);
4267 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4268 				     sizeof (*ss->fw_stats), 64);
4269 		if (err != 0)
4270 			goto abort;
4271 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4272 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4273 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4274 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4275 #ifdef IFNET_BUF_RING
4276 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4277 					   &ss->tx.mtx);
4278 #endif
4279 	}
4280 
4281 	return (0);
4282 
4283 abort:
4284 	mxge_free_slices(sc);
4285 	return (ENOMEM);
4286 }
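/*
 * Sizing note: cmd.data0 from MXGEFW_CMD_GET_RX_RING_SIZE is in
 * bytes, so rx_ring_size / sizeof (mcp_dma_addr_t) is the number of
 * receive descriptors per ring, and the factor of two presumably
 * leaves room for a completion from both the small and big receive
 * rings in a single interrupt queue.
 */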
4287 
4288 static void
4289 mxge_slice_probe(mxge_softc_t *sc)
4290 {
4291 	mxge_cmd_t cmd;
4292 	char *old_fw;
4293 	int msix_cnt, status, max_intr_slots;
4294 
4295 	sc->num_slices = 1;
4296 	/*
4297 	 *  don't enable multiple slices unless they were requested
4298 	 *  via the tunable, or if this is not an SMP system
4299 	 */
4300 
4301 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4302 		return;
4303 
4304 	/* see how many MSI-X interrupts are available */
4305 	msix_cnt = pci_msix_count(sc->dev);
4306 	if (msix_cnt < 2)
4307 		return;
4308 
4309 	/* now load the slice aware firmware and see what it supports */
4310 	old_fw = sc->fw_name;
4311 	if (old_fw == mxge_fw_aligned)
4312 		sc->fw_name = mxge_fw_rss_aligned;
4313 	else
4314 		sc->fw_name = mxge_fw_rss_unaligned;
4315 	status = mxge_load_firmware(sc, 0);
4316 	if (status != 0) {
4317 		device_printf(sc->dev, "Falling back to a single slice\n");
4318 		return;
4319 	}
4320 
4321 	/* try to send a reset command to the card to see if it
4322 	   is alive */
4323 	memset(&cmd, 0, sizeof (cmd));
4324 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4325 	if (status != 0) {
4326 		device_printf(sc->dev, "failed reset\n");
4327 		goto abort_with_fw;
4328 	}
4329 
4330 	/* get rx ring size */
4331 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4332 	if (status != 0) {
4333 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4334 		goto abort_with_fw;
4335 	}
4336 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4337 
4338 	/* tell it the size of the interrupt queues */
4339 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4340 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4341 	if (status != 0) {
4342 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4343 		goto abort_with_fw;
4344 	}
4345 
4346 	/* ask for the maximum number of slices it supports */
4347 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4348 	if (status != 0) {
4349 		device_printf(sc->dev,
4350 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4351 		goto abort_with_fw;
4352 	}
4353 	sc->num_slices = cmd.data0;
4354 	if (sc->num_slices > msix_cnt)
4355 		sc->num_slices = msix_cnt;
4356 
4357 	if (mxge_max_slices == -1) {
4358 		/* cap to number of CPUs in system */
4359 		if (sc->num_slices > mp_ncpus)
4360 			sc->num_slices = mp_ncpus;
4361 	} else {
4362 		if (sc->num_slices > mxge_max_slices)
4363 			sc->num_slices = mxge_max_slices;
4364 	}
4365 	/* make sure it is a power of two */
4366 	while (sc->num_slices & (sc->num_slices - 1))
4367 		sc->num_slices--;
4368 
4369 	if (mxge_verbose)
4370 		device_printf(sc->dev, "using %d slices\n",
4371 			      sc->num_slices);
4372 
4373 	return;
4374 
4375 abort_with_fw:
4376 	sc->fw_name = old_fw;
4377 	(void) mxge_load_firmware(sc, 0);
4378 }
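/*
 * The round-down loop above uses the usual power-of-two test:
 * (n & (n - 1)) is zero exactly when a single bit is set.  Starting
 * from e.g. 6 slices: 6 & 5 = 4, 5 & 4 = 4, 4 & 3 = 0, so
 * num_slices settles on 4.  A power-of-two count is presumably what
 * the RSS firmware expects when it maps hash values to slices.
 */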
4379 
4380 static int
4381 mxge_add_msix_irqs(mxge_softc_t *sc)
4382 {
4383 	size_t bytes;
4384 	int count, err, i, rid;
4385 
4386 	rid = PCIR_BAR(2);
4387 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4388 						    &rid, RF_ACTIVE);
4389 
4390 	if (sc->msix_table_res == NULL) {
4391 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4392 		return ENXIO;
4393 	}
4394 
4395 	count = sc->num_slices;
4396 	err = pci_alloc_msix(sc->dev, &count);
4397 	if (err != 0) {
4398 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4399 			      "err = %d\n", sc->num_slices, err);
4400 		goto abort_with_msix_table;
4401 	}
4402 	if (count < sc->num_slices) {
4403 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4404 			      sc->num_slices, count);
4405 		device_printf(sc->dev,
4406 			      "Try setting hw.mxge.max_slices to %d\n",
4407 			      count);
4408 		err = ENOSPC;
4409 		goto abort_with_msix;
4410 	}
4411 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4412 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4413 	if (sc->msix_irq_res == NULL) {
4414 		err = ENOMEM;
4415 		goto abort_with_msix;
4416 	}
4417 
4418 	for (i = 0; i < sc->num_slices; i++) {
4419 		rid = i + 1;
4420 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4421 							  SYS_RES_IRQ,
4422 							  &rid, RF_ACTIVE);
4423 		if (sc->msix_irq_res[i] == NULL) {
4424 			device_printf(sc->dev, "couldn't allocate IRQ res"
4425 				      " for message %d\n", i);
4426 			err = ENXIO;
4427 			goto abort_with_res;
4428 		}
4429 	}
4430 
4431 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4432 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4433 
4434 	for (i = 0; i < sc->num_slices; i++) {
4435 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4436 				     INTR_TYPE_NET | INTR_MPSAFE,
4437 #if __FreeBSD_version > 700030
4438 				     NULL,
4439 #endif
4440 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4441 		if (err != 0) {
4442 			device_printf(sc->dev, "couldn't setup intr for "
4443 				      "message %d\n", i);
4444 			goto abort_with_intr;
4445 		}
4446 	}
4447 
4448 	if (mxge_verbose) {
4449 		device_printf(sc->dev, "using %d msix IRQs:",
4450 			      sc->num_slices);
4451 		for (i = 0; i < sc->num_slices; i++)
4452 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4453 		printf("\n");
4454 	}
4455 	return (0);
4456 
4457 abort_with_intr:
4458 	for (i = 0; i < sc->num_slices; i++) {
4459 		if (sc->msix_ih[i] != NULL) {
4460 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4461 					  sc->msix_ih[i]);
4462 			sc->msix_ih[i] = NULL;
4463 		}
4464 	}
4465 	free(sc->msix_ih, M_DEVBUF);
4466 
4467 
4468 abort_with_res:
4469 	for (i = 0; i < sc->num_slices; i++) {
4470 		rid = i + 1;
4471 		if (sc->msix_irq_res[i] != NULL)
4472 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4473 					     sc->msix_irq_res[i]);
4474 		sc->msix_irq_res[i] = NULL;
4475 	}
4476 	free(sc->msix_irq_res, M_DEVBUF);
4477 
4478 
4479 abort_with_msix:
4480 	pci_release_msi(sc->dev);
4481 
4482 abort_with_msix_table:
4483 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4484 			     sc->msix_table_res);
4485 
4486 	return err;
4487 }
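/*
 * For MSI-X the vector table lives behind PCIR_BAR(2), so that BAR
 * must stay allocated for as long as the vectors are in use.  The
 * per-slice IRQ resources use rids 1..num_slices, one message per
 * slice, and the abort_* labels unwind in reverse allocation order.
 */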
4488 
4489 static int
4490 mxge_add_single_irq(mxge_softc_t *sc)
4491 {
4492 	int count, err, rid;
4493 
4494 	count = pci_msi_count(sc->dev);
4495 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4496 		rid = 1;
4497 	} else {
4498 		rid = 0;
4499 		sc->legacy_irq = 1;
4500 	}
4501 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4502 					 1, RF_SHAREABLE | RF_ACTIVE);
4503 	if (sc->irq_res == NULL) {
4504 		device_printf(sc->dev, "could not alloc interrupt\n");
4505 		return ENXIO;
4506 	}
4507 	if (mxge_verbose)
4508 		device_printf(sc->dev, "using %s irq %ld\n",
4509 			      sc->legacy_irq ? "INTx" : "MSI",
4510 			      rman_get_start(sc->irq_res));
4511 	err = bus_setup_intr(sc->dev, sc->irq_res,
4512 			     INTR_TYPE_NET | INTR_MPSAFE,
4513 #if __FreeBSD_version > 700030
4514 			     NULL,
4515 #endif
4516 			     mxge_intr, &sc->ss[0], &sc->ih);
4517 	if (err != 0) {
4518 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4519 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4520 		if (!sc->legacy_irq)
4521 			pci_release_msi(sc->dev);
4522 	}
4523 	return err;
4524 }
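/*
 * The rid convention above is the standard FreeBSD one: rid 0 is the
 * legacy INTx line, while a successful pci_alloc_msi() moves the
 * interrupt to message rid 1.  RF_SHAREABLE really only matters in
 * the INTx case, since MSI vectors are never shared.
 */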
4525 
4526 static void
4527 mxge_rem_msix_irqs(mxge_softc_t *sc)
4528 {
4529 	int i, rid;
4530 
4531 	for (i = 0; i < sc->num_slices; i++) {
4532 		if (sc->msix_ih[i] != NULL) {
4533 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4534 					  sc->msix_ih[i]);
4535 			sc->msix_ih[i] = NULL;
4536 		}
4537 	}
4538 	free(sc->msix_ih, M_DEVBUF);
4539 
4540 	for (i = 0; i < sc->num_slices; i++) {
4541 		rid = i + 1;
4542 		if (sc->msix_irq_res[i] != NULL)
4543 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4544 					     sc->msix_irq_res[i]);
4545 		sc->msix_irq_res[i] = NULL;
4546 	}
4547 	free(sc->msix_irq_res, M_DEVBUF);
4548 
4549 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4550 			     sc->msix_table_res);
4551 
4552 	pci_release_msi(sc->dev);
4553 	return;
4554 }
4555 
4556 static void
4557 mxge_rem_single_irq(mxge_softc_t *sc)
4558 {
4559 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4560 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4561 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4562 	if (!sc->legacy_irq)
4563 		pci_release_msi(sc->dev);
4564 }
4565 
4566 static void
4567 mxge_rem_irq(mxge_softc_t *sc)
4568 {
4569 	if (sc->num_slices > 1)
4570 		mxge_rem_msix_irqs(sc);
4571 	else
4572 		mxge_rem_single_irq(sc);
4573 }
4574 
4575 static int
4576 mxge_add_irq(mxge_softc_t *sc)
4577 {
4578 	int err;
4579 
4580 	if (sc->num_slices > 1)
4581 		err = mxge_add_msix_irqs(sc);
4582 	else
4583 		err = mxge_add_single_irq(sc);
4584 
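	/*
	 * Apparently a disabled test path: flipping the 0 would
	 * exercise MSI-X teardown and re-setup after a successful
	 * attach-time allocation.
	 */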
4585 	if (0 && err == 0 && sc->num_slices > 1) {
4586 		mxge_rem_msix_irqs(sc);
4587 		err = mxge_add_msix_irqs(sc);
4588 	}
4589 	return err;
4590 }
4591 
4592 
4593 static int
4594 mxge_attach(device_t dev)
4595 {
4596 	mxge_softc_t *sc = device_get_softc(dev);
4597 	struct ifnet *ifp;
4598 	int err, rid;
4599 
4600 	sc->dev = dev;
4601 	mxge_fetch_tunables(sc);
4602 
4603 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4604 	sc->tq = taskqueue_create_fast("mxge_taskq", M_WAITOK,
4605 				       taskqueue_thread_enqueue,
4606 				       &sc->tq);
4607 	if (sc->tq == NULL) {
4608 		err = ENOMEM;
4609 		goto abort_with_nothing;
4610 	}
4611 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4612 				device_get_nameunit(sc->dev));
4613 
4614 	err = bus_dma_tag_create(NULL,			/* parent */
4615 				 1,			/* alignment */
4616 				 0,			/* boundary */
4617 				 BUS_SPACE_MAXADDR,	/* low */
4618 				 BUS_SPACE_MAXADDR,	/* high */
4619 				 NULL, NULL,		/* filter */
4620 				 65536 + 256,		/* maxsize */
4621 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4622 				 65536,			/* maxsegsize */
4623 				 0,			/* flags */
4624 				 NULL, NULL,		/* lock */
4625 				 &sc->parent_dmat);	/* tag */
4626 
4627 	if (err != 0) {
4628 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4629 			      err);
4630 		goto abort_with_tq;
4631 	}
4632 
4633 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4634 	if (ifp == NULL) {
4635 		device_printf(dev, "can not if_alloc()\n");
4636 		err = ENOSPC;
4637 		goto abort_with_parent_dmat;
4638 	}
4639 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4640 
4641 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4642 		 device_get_nameunit(dev));
4643 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4644 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4645 		 "%s:drv", device_get_nameunit(dev));
4646 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4647 		 MTX_NETWORK_LOCK, MTX_DEF);
4648 
4649 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4650 
4651 	mxge_setup_cfg_space(sc);
4652 
4653 	/* Map the board into the kernel */
4654 	rid = PCIR_BARS;
4655 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4656 					 ~0, 1, RF_ACTIVE);
4657 	if (sc->mem_res == NULL) {
4658 		device_printf(dev, "could not map memory\n");
4659 		err = ENXIO;
4660 		goto abort_with_lock;
4661 	}
4662 	sc->sram = rman_get_virtual(sc->mem_res);
4663 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4664 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4665 		device_printf(dev, "impossible memory region size %ld\n",
4666 			      rman_get_size(sc->mem_res));
4667 		err = ENXIO;
4668 		goto abort_with_mem_res;
4669 	}
4670 
4671 	/* make a NULL terminated copy of the EEPROM strings section
4672 	   of the lanai SRAM */
4673 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4674 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4675 				rman_get_bushandle(sc->mem_res),
4676 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4677 				sc->eeprom_strings,
4678 				MXGE_EEPROM_STRINGS_SIZE - 2);
4679 	err = mxge_parse_strings(sc);
4680 	if (err != 0)
4681 		goto abort_with_mem_res;
4682 
4683 	/* Enable write combining for efficient use of PCIe bus */
4684 	mxge_enable_wc(sc);
4685 
4686 	/* Allocate the out of band dma memory */
4687 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4688 			     sizeof (mxge_cmd_t), 64);
4689 	if (err != 0)
4690 		goto abort_with_mem_res;
4691 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4692 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4693 	if (err != 0)
4694 		goto abort_with_cmd_dma;
4695 
4696 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4697 	if (err != 0)
4698 		goto abort_with_zeropad_dma;
4699 
4700 	/* select & load the firmware */
4701 	err = mxge_select_firmware(sc);
4702 	if (err != 0)
4703 		goto abort_with_dmabench;
4704 	sc->intr_coal_delay = mxge_intr_coal_delay;
4705 
4706 	mxge_slice_probe(sc);
4707 	err = mxge_alloc_slices(sc);
4708 	if (err != 0)
4709 		goto abort_with_dmabench;
4710 
4711 	err = mxge_reset(sc, 0);
4712 	if (err != 0)
4713 		goto abort_with_slices;
4714 
4715 	err = mxge_alloc_rings(sc);
4716 	if (err != 0) {
4717 		device_printf(sc->dev, "failed to allocate rings\n");
4718 		goto abort_with_dmabench;
4719 	}
4720 
4721 	err = mxge_add_irq(sc);
4722 	if (err != 0) {
4723 		device_printf(sc->dev, "failed to add irq\n");
4724 		goto abort_with_rings;
4725 	}
4726 
4727 	ifp->if_baudrate = IF_Gbps(10UL);
4728 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4729 		IFCAP_VLAN_MTU;
4730 #ifdef INET
4731 	ifp->if_capabilities |= IFCAP_LRO;
4732 #endif
4733 
4734 #ifdef MXGE_NEW_VLAN_API
4735 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4736 #endif
4737 
4738 	sc->max_mtu = mxge_max_mtu(sc);
4739 	if (sc->max_mtu >= 9000)
4740 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4741 	else
4742 		device_printf(dev, "MTU limited to %d.  Install "
4743 			      "latest firmware for 9000 byte jumbo support\n",
4744 			      sc->max_mtu - ETHER_HDR_LEN);
4745 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4746 	ifp->if_capenable = ifp->if_capabilities;
4747 	if (sc->lro_cnt == 0)
4748 		ifp->if_capenable &= ~IFCAP_LRO;
4749 	sc->csum_flag = 1;
4750 	ifp->if_init = mxge_init;
4751 	ifp->if_softc = sc;
4752 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4753 	ifp->if_ioctl = mxge_ioctl;
4754 	ifp->if_start = mxge_start;
4755 	/* Initialise the ifmedia structure */
4756 	ifmedia_init(&sc->media, 0, mxge_media_change,
4757 		     mxge_media_status);
4758 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4759 	mxge_media_probe(sc);
4760 	sc->dying = 0;
4761 	ether_ifattach(ifp, sc->mac_addr);
4762 	/* ether_ifattach sets mtu to ETHERMTU */
4763 	if (mxge_initial_mtu != ETHERMTU)
4764 		mxge_change_mtu(sc, mxge_initial_mtu);
4765 
4766 	mxge_add_sysctls(sc);
4767 #ifdef IFNET_BUF_RING
4768 	ifp->if_transmit = mxge_transmit;
4769 	ifp->if_qflush = mxge_qflush;
4770 #endif
4771 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4772 	return 0;
4773 
4774 abort_with_rings:
4775 	mxge_free_rings(sc);
4776 abort_with_slices:
4777 	mxge_free_slices(sc);
4778 abort_with_dmabench:
4779 	mxge_dma_free(&sc->dmabench_dma);
4780 abort_with_zeropad_dma:
4781 	mxge_dma_free(&sc->zeropad_dma);
4782 abort_with_cmd_dma:
4783 	mxge_dma_free(&sc->cmd_dma);
4784 abort_with_mem_res:
4785 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4786 abort_with_lock:
4787 	pci_disable_busmaster(dev);
4788 	mtx_destroy(&sc->cmd_mtx);
4789 	mtx_destroy(&sc->driver_mtx);
4790 	if_free(ifp);
4791 abort_with_parent_dmat:
4792 	bus_dma_tag_destroy(sc->parent_dmat);
4793 abort_with_tq:
4794 	if (sc->tq != NULL) {
4795 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4796 		taskqueue_free(sc->tq);
4797 		sc->tq = NULL;
4798 	}
4799 abort_with_nothing:
4800 	return err;
4801 }
4802 
4803 static int
4804 mxge_detach(device_t dev)
4805 {
4806 	mxge_softc_t *sc = device_get_softc(dev);
4807 
4808 	if (mxge_vlans_active(sc)) {
4809 		device_printf(sc->dev,
4810 			      "Detach vlans before removing module\n");
4811 		return EBUSY;
4812 	}
4813 	mtx_lock(&sc->driver_mtx);
4814 	sc->dying = 1;
4815 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4816 		mxge_close(sc, 0);
4817 	mtx_unlock(&sc->driver_mtx);
4818 	ether_ifdetach(sc->ifp);
4819 	if (sc->tq != NULL) {
4820 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4821 		taskqueue_free(sc->tq);
4822 		sc->tq = NULL;
4823 	}
4824 	callout_drain(&sc->co_hdl);
4825 	ifmedia_removeall(&sc->media);
4826 	mxge_dummy_rdma(sc, 0);
4827 	mxge_rem_sysctls(sc);
4828 	mxge_rem_irq(sc);
4829 	mxge_free_rings(sc);
4830 	mxge_free_slices(sc);
4831 	mxge_dma_free(&sc->dmabench_dma);
4832 	mxge_dma_free(&sc->zeropad_dma);
4833 	mxge_dma_free(&sc->cmd_dma);
4834 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4835 	pci_disable_busmaster(dev);
4836 	mtx_destroy(&sc->cmd_mtx);
4837 	mtx_destroy(&sc->driver_mtx);
4838 	if_free(sc->ifp);
4839 	bus_dma_tag_destroy(sc->parent_dmat);
4840 	return 0;
4841 }
4842 
4843 static int
4844 mxge_shutdown(device_t dev)
4845 {
4846 	return 0;
4847 }
4848 
4849 /*
4850   This file uses Myri10GE driver indentation.
4851 
4852   Local Variables:
4853   c-file-style:"linux"
4854   tab-width:8
4855   End:
4856 */
4857