xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 7aa383846770374466b1dcb2cefd71bde9acf463)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 
50 /* count xmits ourselves, rather than via drbr */
51 #define NO_SLOW_STATS
52 #include <net/if.h>
53 #include <net/if_arp.h>
54 #include <net/ethernet.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57 
58 #include <net/bpf.h>
59 
60 #include <net/if_types.h>
61 #include <net/if_vlan_var.h>
62 #include <net/zlib.h>
63 
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67 #include <netinet/tcp.h>
68 
69 #include <machine/bus.h>
70 #include <machine/in_cksum.h>
71 #include <machine/resource.h>
72 #include <sys/bus.h>
73 #include <sys/rman.h>
74 #include <sys/smp.h>
75 
76 #include <dev/pci/pcireg.h>
77 #include <dev/pci/pcivar.h>
78 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
79 
80 #include <vm/vm.h>		/* for pmap_mapdev() */
81 #include <vm/pmap.h>
82 
83 #if defined(__i386) || defined(__amd64)
84 #include <machine/specialreg.h>
85 #endif
86 
87 #include <dev/mxge/mxge_mcp.h>
88 #include <dev/mxge/mcp_gen_header.h>
89 /*#define MXGE_FAKE_IFP*/
90 #include <dev/mxge/if_mxge_var.h>
91 #ifdef IFNET_BUF_RING
92 #include <sys/buf_ring.h>
93 #endif
94 
95 #include "opt_inet.h"
96 
97 /* tunable params */
98 static int mxge_nvidia_ecrc_enable = 1;
99 static int mxge_force_firmware = 0;
100 static int mxge_intr_coal_delay = 30;
101 static int mxge_deassert_wait = 1;
102 static int mxge_flow_control = 1;
103 static int mxge_verbose = 0;
104 static int mxge_lro_cnt = 8;
105 static int mxge_ticks;
106 static int mxge_max_slices = 1;
107 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
108 static int mxge_always_promisc = 0;
109 static int mxge_initial_mtu = ETHERMTU_JUMBO;
110 static int mxge_throttle = 0;
111 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
112 static char *mxge_fw_aligned = "mxge_eth_z8e";
113 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
114 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
115 
116 static int mxge_probe(device_t dev);
117 static int mxge_attach(device_t dev);
118 static int mxge_detach(device_t dev);
119 static int mxge_shutdown(device_t dev);
120 static void mxge_intr(void *arg);
121 
122 static device_method_t mxge_methods[] =
123 {
124   /* Device interface */
125   DEVMETHOD(device_probe, mxge_probe),
126   DEVMETHOD(device_attach, mxge_attach),
127   DEVMETHOD(device_detach, mxge_detach),
128   DEVMETHOD(device_shutdown, mxge_shutdown),
129   {0, 0}
130 };
131 
132 static driver_t mxge_driver =
133 {
134   "mxge",
135   mxge_methods,
136   sizeof(mxge_softc_t),
137 };
138 
139 static devclass_t mxge_devclass;
140 
141 /* Declare ourselves to be a child of the PCI bus.*/
142 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
143 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
144 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
145 
146 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
147 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
148 static int mxge_close(mxge_softc_t *sc, int down);
149 static int mxge_open(mxge_softc_t *sc);
150 static void mxge_tick(void *arg);
151 
152 static int
153 mxge_probe(device_t dev)
154 {
155 	int rev;
156 
157 
158 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
159 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
160 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
161 		rev = pci_get_revid(dev);
162 		switch (rev) {
163 		case MXGE_PCI_REV_Z8E:
164 			device_set_desc(dev, "Myri10G-PCIE-8A");
165 			break;
166 		case MXGE_PCI_REV_Z8ES:
167 			device_set_desc(dev, "Myri10G-PCIE-8B");
168 			break;
169 		default:
170 			device_set_desc(dev, "Myri10G-PCIE-8??");
171 			device_printf(dev, "Unrecognized rev %d NIC\n",
172 				      rev);
173 			break;
174 		}
175 		return 0;
176 	}
177 	return ENXIO;
178 }
179 
180 static void
181 mxge_enable_wc(mxge_softc_t *sc)
182 {
183 #if defined(__i386) || defined(__amd64)
184 	vm_offset_t len;
185 	int err;
186 
187 	sc->wc = 1;
188 	len = rman_get_size(sc->mem_res);
189 	err = pmap_change_attr((vm_offset_t) sc->sram,
190 			       len, PAT_WRITE_COMBINING);
191 	if (err != 0) {
192 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 			      err);
194 		sc->wc = 0;
195 	}
196 #endif
197 }
198 
199 
200 /* callback to get our DMA address */
201 static void
202 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
203 			 int error)
204 {
205 	if (error == 0) {
206 		*(bus_addr_t *) arg = segs->ds_addr;
207 	}
208 }
209 
210 static int
211 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
212 		   bus_size_t alignment)
213 {
214 	int err;
215 	device_t dev = sc->dev;
216 	bus_size_t boundary, maxsegsize;
217 
218 	if (bytes > 4096 && alignment == 4096) {
219 		boundary = 0;
220 		maxsegsize = bytes;
221 	} else {
222 		boundary = 4096;
223 		maxsegsize = 4096;
224 	}
225 
226 	/* allocate DMAable memory tags */
227 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
228 				 alignment,		/* alignment */
229 				 boundary,		/* boundary */
230 				 BUS_SPACE_MAXADDR,	/* low */
231 				 BUS_SPACE_MAXADDR,	/* high */
232 				 NULL, NULL,		/* filter */
233 				 bytes,			/* maxsize */
234 				 1,			/* num segs */
235 				 maxsegsize,		/* maxsegsize */
236 				 BUS_DMA_COHERENT,	/* flags */
237 				 NULL, NULL,		/* lock */
238 				 &dma->dmat);		/* tag */
239 	if (err != 0) {
240 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
241 		return err;
242 	}
243 
244 	/* allocate DMAable memory & map */
245 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
246 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
247 				| BUS_DMA_ZERO),  &dma->map);
248 	if (err != 0) {
249 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
250 		goto abort_with_dmat;
251 	}
252 
253 	/* load the memory */
254 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
255 			      mxge_dmamap_callback,
256 			      (void *)&dma->bus_addr, 0);
257 	if (err != 0) {
258 		device_printf(dev, "couldn't load map (err = %d)\n", err);
259 		goto abort_with_mem;
260 	}
261 	return 0;
262 
263 abort_with_mem:
264 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
265 abort_with_dmat:
266 	(void)bus_dma_tag_destroy(dma->dmat);
267 	return err;
268 }
269 
270 
271 static void
272 mxge_dma_free(mxge_dma_t *dma)
273 {
274 	bus_dmamap_unload(dma->dmat, dma->map);
275 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
276 	(void)bus_dma_tag_destroy(dma->dmat);
277 }
278 
279 /*
280  * The eeprom strings on the lanaiX have the format
281  * SN=x\0
282  * MAC=x:x:x:x:x:x\0
283  * PC=text\0
284  */
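/*
 * For example (illustrative values only), a raw strings blob might
 * look like:
 *
 *   "SN=123456\0MAC=00:60:dd:47:50:2a\0PC=M3F2-PCIXE-2\0\0"
 */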
285 
286 static int
287 mxge_parse_strings(mxge_softc_t *sc)
288 {
289 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)
290 
291 	char *ptr, *limit;
292 	int i, found_mac;
293 
294 	ptr = sc->eeprom_strings;
295 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
296 	found_mac = 0;
297 	while (ptr < limit && *ptr != '\0') {
298 		if (memcmp(ptr, "MAC=", 4) == 0) {
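			/* the +1 here, plus the "ptr += 3" at the top of
			   each loop pass below, lands ptr on the first
			   hex digit of each colon-separated octet */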
299 			ptr += 1;
300 			sc->mac_addr_string = ptr;
301 			for (i = 0; i < 6; i++) {
302 				ptr += 3;
303 				if ((ptr + 2) > limit)
304 					goto abort;
305 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
306 				found_mac = 1;
307 			}
308 		} else if (memcmp(ptr, "PC=", 3) == 0) {
309 			ptr += 3;
310 			strncpy(sc->product_code_string, ptr,
311 				sizeof (sc->product_code_string) - 1);
312 		} else if (memcmp(ptr, "SN=", 3) == 0) {
313 			ptr += 3;
314 			strncpy(sc->serial_number_string, ptr,
315 				sizeof (sc->serial_number_string) - 1);
316 		}
317 		MXGE_NEXT_STRING(ptr);
318 	}
319 
320 	if (found_mac)
321 		return 0;
322 
323  abort:
324 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
325 
326 	return ENXIO;
327 }
328 
329 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
330 static void
331 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
332 {
333 	uint32_t val;
334 	unsigned long base, off;
335 	char *va, *cfgptr;
336 	device_t pdev, mcp55;
337 	uint16_t vendor_id, device_id, word;
338 	uintptr_t bus, slot, func, ivend, idev;
339 	uint32_t *ptr32;
340 
341 
342 	if (!mxge_nvidia_ecrc_enable)
343 		return;
344 
345 	pdev = device_get_parent(device_get_parent(sc->dev));
346 	if (pdev == NULL) {
347 		device_printf(sc->dev, "could not find parent?\n");
348 		return;
349 	}
350 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
351 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
352 
353 	if (vendor_id != 0x10de)
354 		return;
355 
356 	base = 0;
357 
358 	if (device_id == 0x005d) {
359 		/* ck804, base address is magic */
360 		base = 0xe0000000UL;
361 	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
362 		/* mcp55, base address stored in chipset */
363 		mcp55 = pci_find_bsf(0, 0, 0);
364 		if (mcp55 &&
365 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
366 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
367 			word = pci_read_config(mcp55, 0x90, 2);
368 			base = ((unsigned long)word & 0x7ffeU) << 25;
369 		}
370 	}
371 	if (!base)
372 		return;
373 
374 	/* XXXX
375 	   Test below is commented because it is believed that doing
376 	   config read/write beyond 0xff will access the config space
377 	   for the next larger function.  Uncomment this and remove
378 	   the hacky pmap_mapdev() way of accessing config space when
379 	   FreeBSD grows support for extended pcie config space access
380 	*/
381 #if 0
382 	/* See if we can, by some miracle, access the extended
383 	   config space */
384 	val = pci_read_config(pdev, 0x178, 4);
385 	if (val != 0xffffffff) {
386 		val |= 0x40;
387 		pci_write_config(pdev, 0x178, val, 4);
388 		return;
389 	}
390 #endif
391 	/* Rather than using normal pci config space writes, we must
392 	 * map the Nvidia config space ourselves.  This is because on
393 	 * opteron/nvidia class machines the 0xe0000000 mapping is
394 	 * handled by the nvidia chipset; that means the internal PCI
395 	 * device (the on-chip northbridge), or the amd-8131 bridge
396 	 * and things behind them, are not visible via this method.
397 	 */
398 
399 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 		      PCI_IVAR_BUS, &bus);
401 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 		      PCI_IVAR_SLOT, &slot);
403 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 		      PCI_IVAR_FUNCTION, &func);
405 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
406 		      PCI_IVAR_VENDOR, &ivend);
407 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
408 		      PCI_IVAR_DEVICE, &idev);
409 
410 	off =  base
411 		+ 0x00100000UL * (unsigned long)bus
412 		+ 0x00001000UL * (unsigned long)(func
413 						 + 8 * slot);
414 
415 	/* map it into the kernel */
416 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
417 
418 
419 	if (va == NULL) {
420 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
421 		return;
422 	}
423 	/* get a pointer to the config space mapped into the kernel */
424 	cfgptr = va + (off & PAGE_MASK);
425 
426 	/* make sure that we can really access it */
427 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
428 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
429 	if (! (vendor_id == ivend && device_id == idev)) {
430 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
431 			      vendor_id, device_id);
432 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
433 		return;
434 	}
435 
436 	ptr32 = (uint32_t*)(cfgptr + 0x178);
437 	val = *ptr32;
438 
439 	if (val == 0xffffffff) {
440 		device_printf(sc->dev, "extended mapping failed\n");
441 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
442 		return;
443 	}
444 	*ptr32 = val | 0x40;
445 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
446 	if (mxge_verbose)
447 		device_printf(sc->dev,
448 			      "Enabled ECRC on upstream Nvidia bridge "
449 			      "at %d:%d:%d\n",
450 			      (int)bus, (int)slot, (int)func);
451 	return;
452 }
453 #else
454 static void
455 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
456 {
457 	device_printf(sc->dev,
458 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
459 	return;
460 }
461 #endif
462 
463 
464 static int
465 mxge_dma_test(mxge_softc_t *sc, int test_type)
466 {
467 	mxge_cmd_t cmd;
468 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
469 	int status;
470 	uint32_t len;
471 	char *test = " ";
472 
473 
474 	/* Run a small DMA test.
475 	 * The magic multipliers to the length tell the firmware
476 	 * to do DMA read, write, or read+write tests.  The
477 	 * results are returned in cmd.data0.  The upper 16
478 	 * bits of the return is the number of transfers completed.
479 	 * The lower 16 bits is the time in 0.5us ticks that the
480 	 * transfers took to complete.
481 	 */
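	/*
	 * Worked example (illustrative numbers): with len = 4096, a
	 * returned cmd.data0 of 0x00640190 means 0x64 = 100 transfers
	 * completed in 0x190 = 400 ticks (200us).  The read test below
	 * then computes (100 * 4096 * 2) / 400 = 2048, stored in
	 * sc->read_dma; the factor of 2 converts 0.5us ticks into
	 * bytes-per-us, i.e. roughly MB/s.
	 */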
482 
483 	len = sc->tx_boundary;
484 
485 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
486 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
487 	cmd.data2 = len * 0x10000;
488 	status = mxge_send_cmd(sc, test_type, &cmd);
489 	if (status != 0) {
490 		test = "read";
491 		goto abort;
492 	}
493 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
494 		(cmd.data0 & 0xffff);
495 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
496 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
497 	cmd.data2 = len * 0x1;
498 	status = mxge_send_cmd(sc, test_type, &cmd);
499 	if (status != 0) {
500 		test = "write";
501 		goto abort;
502 	}
503 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
504 		(cmd.data0 & 0xffff);
505 
506 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
507 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
508 	cmd.data2 = len * 0x10001;
509 	status = mxge_send_cmd(sc, test_type, &cmd);
510 	if (status != 0) {
511 		test = "read/write";
512 		goto abort;
513 	}
514 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
515 		(cmd.data0 & 0xffff);
516 
517 abort:
518 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
519 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
520 			      test, status);
521 
522 	return status;
523 }
524 
525 /*
526  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
527  * when the PCI-E Completion packets are aligned on an 8-byte
528  * boundary.  Some PCI-E chip sets always align Completion packets; on
529  * the ones that do not, the alignment can be enforced by enabling
530  * ECRC generation (if supported).
531  *
532  * When PCI-E Completion packets are not aligned, it is actually more
533  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
534  *
535  * If the driver can neither enable ECRC nor verify that it has
536  * already been enabled, then it must use a firmware image which works
537  * around unaligned completion packets (ethp_z8e.dat), and it should
538  * also ensure that it never gives the device a Read-DMA which is
539  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
540  * enabled, then the driver should use the aligned (eth_z8e.dat)
541  * firmware image, and set tx_boundary to 4KB.
542  */
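/*
 * In short:
 *   - completions known to be aligned (ECRC enabled, forced via the
 *     mxge_force_firmware tunable, or link width <= 4):
 *     use eth_z8e (mxge_fw_aligned) and tx_boundary = 4096
 *   - completions possibly unaligned:
 *     use ethp_z8e (mxge_fw_unaligned) and tx_boundary = 2048
 * mxge_select_firmware() below implements this decision.
 */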
543 
544 static int
545 mxge_firmware_probe(mxge_softc_t *sc)
546 {
547 	device_t dev = sc->dev;
548 	int reg, status;
549 	uint16_t pectl;
550 
551 	sc->tx_boundary = 4096;
552 	/*
553 	 * Verify the max read request size was set to 4KB
554 	 * before trying the test with 4KB.
555 	 */
556 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
557 		pectl = pci_read_config(dev, reg + 0x8, 2);
558 		if ((pectl & (5 << 12)) != (5 << 12)) {
559 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
560 				      pectl);
561 			sc->tx_boundary = 2048;
562 		}
563 	}
564 
565 	/*
566 	 * load the optimized firmware (which assumes aligned PCIe
567 	 * completions) in order to see if it works on this host.
568 	 */
569 	sc->fw_name = mxge_fw_aligned;
570 	status = mxge_load_firmware(sc, 1);
571 	if (status != 0) {
572 		return status;
573 	}
574 
575 	/*
576 	 * Enable ECRC if possible
577 	 */
578 	mxge_enable_nvidia_ecrc(sc);
579 
580 	/*
581 	 * Run a DMA test which watches for unaligned completions and
582 	 * aborts on the first one seen.
583 	 */
584 
585 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
586 	if (status == 0)
587 		return 0; /* keep the aligned firmware */
588 
589 	if (status != E2BIG)
590 		device_printf(dev, "DMA test failed: %d\n", status);
591 	if (status == ENOSYS)
592 		device_printf(dev, "Falling back to ethp! "
593 			      "Please install up to date fw\n");
594 	return status;
595 }
596 
597 static int
598 mxge_select_firmware(mxge_softc_t *sc)
599 {
600 	int aligned = 0;
601 	int force_firmware = mxge_force_firmware;
602 
603 	if (sc->throttle)
604 		force_firmware = sc->throttle;
605 
606 	if (force_firmware != 0) {
607 		if (force_firmware == 1)
608 			aligned = 1;
609 		else
610 			aligned = 0;
611 		if (mxge_verbose)
612 			device_printf(sc->dev,
613 				      "Assuming %s completions (forced)\n",
614 				      aligned ? "aligned" : "unaligned");
615 		goto abort;
616 	}
617 
618 	/* if the PCIe link width is 4 or less, we can use the aligned
619 	   firmware and skip any checks */
620 	if (sc->link_width != 0 && sc->link_width <= 4) {
621 		device_printf(sc->dev,
622 			      "PCIe x%d Link, expect reduced performance\n",
623 			      sc->link_width);
624 		aligned = 1;
625 		goto abort;
626 	}
627 
628 	if (0 == mxge_firmware_probe(sc))
629 		return 0;
630 
631 abort:
632 	if (aligned) {
633 		sc->fw_name = mxge_fw_aligned;
634 		sc->tx_boundary = 4096;
635 	} else {
636 		sc->fw_name = mxge_fw_unaligned;
637 		sc->tx_boundary = 2048;
638 	}
639 	return (mxge_load_firmware(sc, 0));
640 }
641 
642 union qualhack
643 {
644         const char *ro_char;
645         char *rw_char;
646 };
647 
648 static int
649 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
650 {
651 
652 
653 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
654 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
655 			      be32toh(hdr->mcp_type));
656 		return EIO;
657 	}
658 
659 	/* save firmware version for sysctl */
660 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
661 	if (mxge_verbose)
662 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
663 
664 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
665 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
666 
667 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
668 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
669 		device_printf(sc->dev, "Found firmware version %s\n",
670 			      sc->fw_version);
671 		device_printf(sc->dev, "Driver needs %d.%d\n",
672 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
673 		return EINVAL;
674 	}
675 	return 0;
676 
677 }
678 
679 static void *
680 z_alloc(void *nil, u_int items, u_int size)
681 {
682         void *ptr;
683 
684         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
685         return ptr;
686 }
687 
688 static void
689 z_free(void *nil, void *ptr)
690 {
691         free(ptr, M_TEMP);
692 }
693 
694 
695 static int
696 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
697 {
698 	z_stream zs;
699 	char *inflate_buffer;
700 	const struct firmware *fw;
701 	const mcp_gen_header_t *hdr;
702 	unsigned hdr_offset;
703 	int status;
704 	unsigned int i;
705 	char dummy;
706 	size_t fw_len;
707 
708 	fw = firmware_get(sc->fw_name);
709 	if (fw == NULL) {
710 		device_printf(sc->dev, "Could not find firmware image %s\n",
711 			      sc->fw_name);
712 		return ENOENT;
713 	}
714 
715 
716 
717 	/* setup zlib and decompress f/w */
718 	bzero(&zs, sizeof (zs));
719 	zs.zalloc = z_alloc;
720 	zs.zfree = z_free;
721 	status = inflateInit(&zs);
722 	if (status != Z_OK) {
723 		status = EIO;
724 		goto abort_with_fw;
725 	}
726 
727 	/* the uncompressed size is stored as the firmware version,
728 	   which would otherwise go unused */
729 	fw_len = (size_t) fw->version;
730 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
731 	if (inflate_buffer == NULL)
732 		goto abort_with_zs;
733 	zs.avail_in = fw->datasize;
734 	zs.next_in = __DECONST(char *, fw->data);
735 	zs.avail_out = fw_len;
736 	zs.next_out = inflate_buffer;
737 	status = inflate(&zs, Z_FINISH);
738 	if (status != Z_STREAM_END) {
739 		device_printf(sc->dev, "zlib %d\n", status);
740 		status = EIO;
741 		goto abort_with_buffer;
742 	}
743 
744 	/* check id */
745 	hdr_offset = htobe32(*(const uint32_t *)
746 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
747 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
748 		device_printf(sc->dev, "Bad firmware file");
749 		status = EIO;
750 		goto abort_with_buffer;
751 	}
752 	hdr = (const void*)(inflate_buffer + hdr_offset);
753 
754 	status = mxge_validate_firmware(sc, hdr);
755 	if (status != 0)
756 		goto abort_with_buffer;
757 
758 	/* Copy the inflated firmware to NIC SRAM. */
759 	for (i = 0; i < fw_len; i += 256) {
760 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
761 			      inflate_buffer + i,
762 			      min(256U, (unsigned)(fw_len - i)));
763 		wmb();
764 		dummy = *sc->sram;
765 		wmb();
766 	}
767 
768 	*limit = fw_len;
769 	status = 0;
770 abort_with_buffer:
771 	free(inflate_buffer, M_TEMP);
772 abort_with_zs:
773 	inflateEnd(&zs);
774 abort_with_fw:
775 	firmware_put(fw, FIRMWARE_UNLOAD);
776 	return status;
777 }
778 
779 /*
780  * Enable or disable periodic RDMAs from the host to make certain
781  * chipsets resend dropped PCIe messages
782  */
783 
784 static void
785 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
786 {
787 	char buf_bytes[72];
788 	volatile uint32_t *confirm;
789 	volatile char *submit;
790 	uint32_t *buf, dma_low, dma_high;
791 	int i;
792 
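	/* round buf_bytes up to the next 8-byte boundary: e.g. if it
	   starts at 0x1005, then 0x1005 + 7 = 0x100c and
	   0x100c & ~7 = 0x1008, the first aligned address inside the
	   72-byte scratch buffer */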
793 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
794 
795 	/* clear confirmation addr */
796 	confirm = (volatile uint32_t *)sc->cmd;
797 	*confirm = 0;
798 	wmb();
799 
800 	/* send an rdma command to the PCIe engine, and wait for the
801 	   response in the confirmation address.  The firmware should
802 	   write a -1 there to indicate it is alive and well
803 	*/
804 
805 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
806 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
807 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
808 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
809 	buf[2] = htobe32(0xffffffff);		/* confirm data */
810 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
811 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
812 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
813 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
814 	buf[5] = htobe32(enable);			/* enable? */
815 
816 
817 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
818 
819 	mxge_pio_copy(submit, buf, 64);
820 	wmb();
821 	DELAY(1000);
822 	wmb();
823 	i = 0;
824 	while (*confirm != 0xffffffff && i < 20) {
825 		DELAY(1000);
826 		i++;
827 	}
828 	if (*confirm != 0xffffffff) {
829 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
830 			      (enable ? "enable" : "disable"), confirm,
831 			      *confirm);
832 	}
833 	return;
834 }
835 
836 static int
837 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
838 {
839 	mcp_cmd_t *buf;
840 	char buf_bytes[sizeof(*buf) + 8];
841 	volatile mcp_cmd_response_t *response = sc->cmd;
842 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
843 	uint32_t dma_low, dma_high;
844 	int err, sleep_total = 0;
845 
846 	/* ensure buf is aligned to 8 bytes */
847 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
848 
849 	buf->data0 = htobe32(data->data0);
850 	buf->data1 = htobe32(data->data1);
851 	buf->data2 = htobe32(data->data2);
852 	buf->cmd = htobe32(cmd);
853 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
854 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
855 
856 	buf->response_addr.low = htobe32(dma_low);
857 	buf->response_addr.high = htobe32(dma_high);
858 	mtx_lock(&sc->cmd_mtx);
859 	response->result = 0xffffffff;
860 	wmb();
861 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
862 
863 	/* wait up to 20ms */
864 	err = EAGAIN;
865 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
866 		bus_dmamap_sync(sc->cmd_dma.dmat,
867 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
868 		wmb();
869 		switch (be32toh(response->result)) {
870 		case 0:
871 			data->data0 = be32toh(response->data);
872 			err = 0;
873 			break;
874 		case 0xffffffff:
875 			DELAY(1000);
876 			break;
877 		case MXGEFW_CMD_UNKNOWN:
878 			err = ENOSYS;
879 			break;
880 		case MXGEFW_CMD_ERROR_UNALIGNED:
881 			err = E2BIG;
882 			break;
883 		case MXGEFW_CMD_ERROR_BUSY:
884 			err = EBUSY;
885 			break;
886 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
887 			err = ENXIO;
888 			break;
889 		default:
890 			device_printf(sc->dev,
891 				      "mxge: command %d "
892 				      "failed, result = %d\n",
893 				      cmd, be32toh(response->result));
894 			err = ENXIO;
895 			break;
896 		}
897 		if (err != EAGAIN)
898 			break;
899 	}
900 	if (err == EAGAIN)
901 		device_printf(sc->dev, "mxge: command %d timed out"
902 			      "result = %d\n",
903 			      cmd, be32toh(response->result));
904 	mtx_unlock(&sc->cmd_mtx);
905 	return err;
906 }
907 
908 static int
909 mxge_adopt_running_firmware(mxge_softc_t *sc)
910 {
911 	struct mcp_gen_header *hdr;
912 	const size_t bytes = sizeof (struct mcp_gen_header);
913 	size_t hdr_offset;
914 	int status;
915 
916 	/* find running firmware header */
917 	hdr_offset = htobe32(*(volatile uint32_t *)
918 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
919 
920 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
921 		device_printf(sc->dev,
922 			      "Running firmware has bad header offset (%d)\n",
923 			      (int)hdr_offset);
924 		return EIO;
925 	}
926 
927 	/* copy header of running firmware from SRAM to host memory to
928 	 * validate firmware */
929 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
930 	if (hdr == NULL) {
931 		device_printf(sc->dev, "could not malloc firmware hdr\n");
932 		return ENOMEM;
933 	}
934 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
935 				rman_get_bushandle(sc->mem_res),
936 				hdr_offset, (char *)hdr, bytes);
937 	status = mxge_validate_firmware(sc, hdr);
938 	free(hdr, M_DEVBUF);
939 
940 	/*
941 	 * check to see if adopted firmware has bug where adopting
942 	 * it will cause broadcasts to be filtered unless the NIC
943 	 * is kept in ALLMULTI mode
944 	 */
945 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
946 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
947 		sc->adopted_rx_filter_bug = 1;
948 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
949 			      "working around rx filter bug\n",
950 			      sc->fw_ver_major, sc->fw_ver_minor,
951 			      sc->fw_ver_tiny);
952 	}
953 
954 	return status;
955 }
956 
957 
958 static int
959 mxge_load_firmware(mxge_softc_t *sc, int adopt)
960 {
961 	volatile uint32_t *confirm;
962 	volatile char *submit;
963 	char buf_bytes[72];
964 	uint32_t *buf, size, dma_low, dma_high;
965 	int status, i;
966 
967 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
968 
969 	size = sc->sram_size;
970 	status = mxge_load_firmware_helper(sc, &size);
971 	if (status) {
972 		if (!adopt)
973 			return status;
974 		/* Try to use the currently running firmware, if
975 		   it is new enough */
976 		status = mxge_adopt_running_firmware(sc);
977 		if (status) {
978 			device_printf(sc->dev,
979 				      "failed to adopt running firmware\n");
980 			return status;
981 		}
982 		device_printf(sc->dev,
983 			      "Successfully adopted running firmware\n");
984 		if (sc->tx_boundary == 4096) {
985 			device_printf(sc->dev,
986 				"Using firmware currently running on NIC"
987 				 ".  For optimal\n");
988 			device_printf(sc->dev,
989 				 "performance consider loading optimized "
990 				 "firmware\n");
991 		}
992 		sc->fw_name = mxge_fw_unaligned;
993 		sc->tx_boundary = 2048;
994 		return 0;
995 	}
996 	/* clear confirmation addr */
997 	confirm = (volatile uint32_t *)sc->cmd;
998 	*confirm = 0;
999 	wmb();
1000 	/* send a reload command to the bootstrap MCP, and wait for the
1001 	   response in the confirmation address.  The firmware should
1002 	   write a -1 there to indicate it is alive and well
1003 	*/
1004 
1005 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1006 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1007 
1008 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1009 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1010 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1011 
1012 	/* FIX: All newest firmware should un-protect the bottom of
1013 	   the sram before handoff. However, the very first interfaces
1014 	   do not. Therefore the handoff copy must skip the first 8 bytes
1015 	*/
1016 					/* where the code starts */
1017 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1018 	buf[4] = htobe32(size - 8); 	/* length of code */
1019 	buf[5] = htobe32(8);		/* where to copy to */
1020 	buf[6] = htobe32(0);		/* where to jump to */
1021 
1022 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1023 	mxge_pio_copy(submit, buf, 64);
1024 	wmb();
1025 	DELAY(1000);
1026 	wmb();
1027 	i = 0;
1028 	while (*confirm != 0xffffffff && i < 20) {
1029 		DELAY(1000*10);
1030 		i++;
1031 		bus_dmamap_sync(sc->cmd_dma.dmat,
1032 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1033 	}
1034 	if (*confirm != 0xffffffff) {
1035 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1036 			confirm, *confirm);
1037 
1038 		return ENXIO;
1039 	}
1040 	return 0;
1041 }
1042 
1043 static int
1044 mxge_update_mac_address(mxge_softc_t *sc)
1045 {
1046 	mxge_cmd_t cmd;
1047 	uint8_t *addr = sc->mac_addr;
1048 	int status;
1049 
1050 
1051 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1052 		     | (addr[2] << 8) | addr[3]);
1053 
1054 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1055 
1056 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1057 	return status;
1058 }
1059 
1060 static int
1061 mxge_change_pause(mxge_softc_t *sc, int pause)
1062 {
1063 	mxge_cmd_t cmd;
1064 	int status;
1065 
1066 	if (pause)
1067 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1068 				       &cmd);
1069 	else
1070 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1071 				       &cmd);
1072 
1073 	if (status) {
1074 		device_printf(sc->dev, "Failed to set flow control mode\n");
1075 		return ENXIO;
1076 	}
1077 	sc->pause = pause;
1078 	return 0;
1079 }
1080 
1081 static void
1082 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1083 {
1084 	mxge_cmd_t cmd;
1085 	int status;
1086 
1087 	if (mxge_always_promisc)
1088 		promisc = 1;
1089 
1090 	if (promisc)
1091 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1092 				       &cmd);
1093 	else
1094 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1095 				       &cmd);
1096 
1097 	if (status) {
1098 		device_printf(sc->dev, "Failed to set promisc mode\n");
1099 	}
1100 }
1101 
1102 static void
1103 mxge_set_multicast_list(mxge_softc_t *sc)
1104 {
1105 	mxge_cmd_t cmd;
1106 	struct ifmultiaddr *ifma;
1107 	struct ifnet *ifp = sc->ifp;
1108 	int err;
1109 
1110 	/* This firmware is known to not support multicast */
1111 	if (!sc->fw_multicast_support)
1112 		return;
1113 
1114 	/* Disable multicast filtering while we play with the lists */
1115 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1116 	if (err != 0) {
1117 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1118 		       " error status: %d\n", err);
1119 		return;
1120 	}
1121 
1122 	if (sc->adopted_rx_filter_bug)
1123 		return;
1124 
1125 	if (ifp->if_flags & IFF_ALLMULTI)
1126 		/* request to disable multicast filtering, so quit here */
1127 		return;
1128 
1129 	/* Flush all the filters */
1130 
1131 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1132 	if (err != 0) {
1133 		device_printf(sc->dev,
1134 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1135 			      ", error status: %d\n", err);
1136 		return;
1137 	}
1138 
1139 	/* Walk the multicast list, and add each address */
1140 
1141 	if_maddr_rlock(ifp);
1142 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1143 		if (ifma->ifma_addr->sa_family != AF_LINK)
1144 			continue;
1145 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1146 		      &cmd.data0, 4);
1147 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1148 		      &cmd.data1, 2);
1149 		cmd.data0 = htonl(cmd.data0);
1150 		cmd.data1 = htonl(cmd.data1);
1151 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1152 		if (err != 0) {
1153 			device_printf(sc->dev, "Failed "
1154 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1155 			       "%d\t", err);
1156 			/* abort, leaving multicast filtering off */
1157 			if_maddr_runlock(ifp);
1158 			return;
1159 		}
1160 	}
1161 	if_maddr_runlock(ifp);
1162 	/* Enable multicast filtering */
1163 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1164 	if (err != 0) {
1165 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1166 		       ", error status: %d\n", err);
1167 	}
1168 }
1169 
1170 static int
1171 mxge_max_mtu(mxge_softc_t *sc)
1172 {
1173 	mxge_cmd_t cmd;
1174 	int status;
1175 
1176 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1177 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1178 
1179 	/* try to set nbufs to see if we can
1180 	   use virtually contiguous jumbos */
1181 	cmd.data0 = 0;
1182 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1183 			       &cmd);
1184 	if (status == 0)
1185 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1186 
1187 	/* otherwise, we're limited to MJUMPAGESIZE */
1188 	return MJUMPAGESIZE - MXGEFW_PAD;
1189 }
1190 
1191 static int
1192 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1193 {
1194 	struct mxge_slice_state *ss;
1195 	mxge_rx_done_t *rx_done;
1196 	volatile uint32_t *irq_claim;
1197 	mxge_cmd_t cmd;
1198 	int slice, status;
1199 
1200 	/* try to send a reset command to the card to see if it
1201 	   is alive */
1202 	memset(&cmd, 0, sizeof (cmd));
1203 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1204 	if (status != 0) {
1205 		device_printf(sc->dev, "failed reset\n");
1206 		return ENXIO;
1207 	}
1208 
1209 	mxge_dummy_rdma(sc, 1);
1210 
1211 
1212 	/* set the intrq size */
1213 	cmd.data0 = sc->rx_ring_size;
1214 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1215 
1216 	/*
1217 	 * Even though we already know how many slices are supported
1218 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1219 	 * has magic side effects, and must be called after a reset.
1220 	 * It must be called prior to calling any RSS related cmds,
1221 	 * including assigning an interrupt queue for anything but
1222 	 * slice 0.  It must also be called *after*
1223 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1224 	 * the firmware to compute offsets.
1225 	 */
1226 
1227 	if (sc->num_slices > 1) {
1228 		/* ask the maximum number of slices it supports */
1229 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1230 					   &cmd);
1231 		if (status != 0) {
1232 			device_printf(sc->dev,
1233 				      "failed to get number of slices\n");
1234 			return status;
1235 		}
1236 		/*
1237 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1238 		 * to setting up the interrupt queue DMA
1239 		 */
1240 		cmd.data0 = sc->num_slices;
1241 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1242 #ifdef IFNET_BUF_RING
1243 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1244 #endif
1245 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1246 					   &cmd);
1247 		if (status != 0) {
1248 			device_printf(sc->dev,
1249 				      "failed to set number of slices\n");
1250 			return status;
1251 		}
1252 	}
1253 
1254 
1255 	if (interrupts_setup) {
1256 		/* Now exchange information about interrupts  */
1257 		for (slice = 0; slice < sc->num_slices; slice++) {
1258 			rx_done = &sc->ss[slice].rx_done;
1259 			memset(rx_done->entry, 0, sc->rx_ring_size);
1260 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1261 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1262 			cmd.data2 = slice;
1263 			status |= mxge_send_cmd(sc,
1264 						MXGEFW_CMD_SET_INTRQ_DMA,
1265 						&cmd);
1266 		}
1267 	}
1268 
1269 	status |= mxge_send_cmd(sc,
1270 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1271 
1272 
1273 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1274 
1275 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1276 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1277 
1278 
1279 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1280 				&cmd);
1281 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1282 	if (status != 0) {
1283 		device_printf(sc->dev, "failed set interrupt parameters\n");
1284 		return status;
1285 	}
1286 
1287 
1288 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1289 
1290 
1291 	/* run a DMA benchmark */
1292 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1293 
1294 	for (slice = 0; slice < sc->num_slices; slice++) {
1295 		ss = &sc->ss[slice];
1296 
1297 		ss->irq_claim = irq_claim + (2 * slice);
1298 		/* reset mcp/driver shared state back to 0 */
1299 		ss->rx_done.idx = 0;
1300 		ss->rx_done.cnt = 0;
1301 		ss->tx.req = 0;
1302 		ss->tx.done = 0;
1303 		ss->tx.pkt_done = 0;
1304 		ss->tx.queue_active = 0;
1305 		ss->tx.activate = 0;
1306 		ss->tx.deactivate = 0;
1307 		ss->tx.wake = 0;
1308 		ss->tx.defrag = 0;
1309 		ss->tx.stall = 0;
1310 		ss->rx_big.cnt = 0;
1311 		ss->rx_small.cnt = 0;
1312 		ss->lro_bad_csum = 0;
1313 		ss->lro_queued = 0;
1314 		ss->lro_flushed = 0;
1315 		if (ss->fw_stats != NULL) {
1316 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1317 		}
1318 	}
1319 	sc->rdma_tags_available = 15;
1320 	status = mxge_update_mac_address(sc);
1321 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1322 	mxge_change_pause(sc, sc->pause);
1323 	mxge_set_multicast_list(sc);
1324 	if (sc->throttle) {
1325 		cmd.data0 = sc->throttle;
1326 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1327 				  &cmd)) {
1328 			device_printf(sc->dev,
1329 				      "can't enable throttle\n");
1330 		}
1331 	}
1332 	return status;
1333 }
1334 
1335 static int
1336 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1337 {
1338 	mxge_cmd_t cmd;
1339 	mxge_softc_t *sc;
1340 	int err;
1341 	unsigned int throttle;
1342 
1343 	sc = arg1;
1344 	throttle = sc->throttle;
1345 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1346         if (err != 0) {
1347                 return err;
1348         }
1349 
1350 	if (throttle == sc->throttle)
1351 		return 0;
1352 
1353         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1354                 return EINVAL;
1355 
1356 	mtx_lock(&sc->driver_mtx);
1357 	cmd.data0 = throttle;
1358 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1359 	if (err == 0)
1360 		sc->throttle = throttle;
1361 	mtx_unlock(&sc->driver_mtx);
1362 	return err;
1363 }
1364 
1365 static int
1366 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1367 {
1368         mxge_softc_t *sc;
1369         unsigned int intr_coal_delay;
1370         int err;
1371 
1372         sc = arg1;
1373         intr_coal_delay = sc->intr_coal_delay;
1374         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1375         if (err != 0) {
1376                 return err;
1377         }
1378         if (intr_coal_delay == sc->intr_coal_delay)
1379                 return 0;
1380 
1381         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1382                 return EINVAL;
1383 
1384 	mtx_lock(&sc->driver_mtx);
1385 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1386 	sc->intr_coal_delay = intr_coal_delay;
1387 
1388 	mtx_unlock(&sc->driver_mtx);
1389         return err;
1390 }
1391 
1392 static int
1393 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1394 {
1395         mxge_softc_t *sc;
1396         unsigned int enabled;
1397         int err;
1398 
1399         sc = arg1;
1400         enabled = sc->pause;
1401         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1402         if (err != 0) {
1403                 return err;
1404         }
1405         if (enabled == sc->pause)
1406                 return 0;
1407 
1408 	mtx_lock(&sc->driver_mtx);
1409 	err = mxge_change_pause(sc, enabled);
1410 	mtx_unlock(&sc->driver_mtx);
1411         return err;
1412 }
1413 
1414 static int
1415 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1416 {
1417 	struct ifnet *ifp;
1418 	int err = 0;
1419 
1420 	ifp = sc->ifp;
1421 	if (lro_cnt == 0)
1422 		ifp->if_capenable &= ~IFCAP_LRO;
1423 	else
1424 		ifp->if_capenable |= IFCAP_LRO;
1425 	sc->lro_cnt = lro_cnt;
1426 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1427 		mxge_close(sc, 0);
1428 		err = mxge_open(sc);
1429 	}
1430 	return err;
1431 }
1432 
1433 static int
1434 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1435 {
1436 	mxge_softc_t *sc;
1437 	unsigned int lro_cnt;
1438 	int err;
1439 
1440 	sc = arg1;
1441 	lro_cnt = sc->lro_cnt;
1442 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1443 	if (err != 0)
1444 		return err;
1445 
1446 	if (lro_cnt == sc->lro_cnt)
1447 		return 0;
1448 
1449 	if (lro_cnt > 128)
1450 		return EINVAL;
1451 
1452 	mtx_lock(&sc->driver_mtx);
1453 	err = mxge_change_lro_locked(sc, lro_cnt);
1454 	mtx_unlock(&sc->driver_mtx);
1455 	return err;
1456 }
1457 
1458 static int
1459 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1460 {
1461         int err;
1462 
1463         if (arg1 == NULL)
1464                 return EFAULT;
1465         arg2 = be32toh(*(int *)arg1);
1466         arg1 = NULL;
1467         err = sysctl_handle_int(oidp, arg1, arg2, req);
1468 
1469         return err;
1470 }
1471 
1472 static void
1473 mxge_rem_sysctls(mxge_softc_t *sc)
1474 {
1475 	struct mxge_slice_state *ss;
1476 	int slice;
1477 
1478 	if (sc->slice_sysctl_tree == NULL)
1479 		return;
1480 
1481 	for (slice = 0; slice < sc->num_slices; slice++) {
1482 		ss = &sc->ss[slice];
1483 		if (ss == NULL || ss->sysctl_tree == NULL)
1484 			continue;
1485 		sysctl_ctx_free(&ss->sysctl_ctx);
1486 		ss->sysctl_tree = NULL;
1487 	}
1488 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1489 	sc->slice_sysctl_tree = NULL;
1490 }
1491 
1492 static void
1493 mxge_add_sysctls(mxge_softc_t *sc)
1494 {
1495 	struct sysctl_ctx_list *ctx;
1496 	struct sysctl_oid_list *children;
1497 	mcp_irq_data_t *fw;
1498 	struct mxge_slice_state *ss;
1499 	int slice;
1500 	char slice_num[8];
1501 
1502 	ctx = device_get_sysctl_ctx(sc->dev);
1503 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1504 	fw = sc->ss[0].fw_stats;
1505 
1506 	/* random information */
1507 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1508 		       "firmware_version",
1509 		       CTLFLAG_RD, &sc->fw_version,
1510 		       0, "firmware version");
1511 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1512 		       "serial_number",
1513 		       CTLFLAG_RD, &sc->serial_number_string,
1514 		       0, "serial number");
1515 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1516 		       "product_code",
1517 		       CTLFLAG_RD, &sc->product_code_string,
1518 		       0, "product_code");
1519 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520 		       "pcie_link_width",
1521 		       CTLFLAG_RD, &sc->link_width,
1522 		       0, "tx_boundary");
1523 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1524 		       "tx_boundary",
1525 		       CTLFLAG_RD, &sc->tx_boundary,
1526 		       0, "tx_boundary");
1527 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1528 		       "write_combine",
1529 		       CTLFLAG_RD, &sc->wc,
1530 		       0, "write combining PIO?");
1531 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532 		       "read_dma_MBs",
1533 		       CTLFLAG_RD, &sc->read_dma,
1534 		       0, "DMA Read speed in MB/s");
1535 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1536 		       "write_dma_MBs",
1537 		       CTLFLAG_RD, &sc->write_dma,
1538 		       0, "DMA Write speed in MB/s");
1539 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1540 		       "read_write_dma_MBs",
1541 		       CTLFLAG_RD, &sc->read_write_dma,
1542 		       0, "DMA concurrent Read/Write speed in MB/s");
1543 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1544 		       "watchdog_resets",
1545 		       CTLFLAG_RD, &sc->watchdog_resets,
1546 		       0, "Number of times NIC was reset");
1547 
1548 
1549 	/* performance related tunables */
1550 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1551 			"intr_coal_delay",
1552 			CTLTYPE_INT|CTLFLAG_RW, sc,
1553 			0, mxge_change_intr_coal,
1554 			"I", "interrupt coalescing delay in usecs");
1555 
1556 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557 			"throttle",
1558 			CTLTYPE_INT|CTLFLAG_RW, sc,
1559 			0, mxge_change_throttle,
1560 			"I", "transmit throttling");
1561 
1562 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1563 			"flow_control_enabled",
1564 			CTLTYPE_INT|CTLFLAG_RW, sc,
1565 			0, mxge_change_flow_control,
1566 			"I", "interrupt coalescing delay in usecs");
1567 
1568 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1569 		       "deassert_wait",
1570 		       CTLFLAG_RW, &mxge_deassert_wait,
1571 		       0, "Wait for IRQ line to go low in ihandler");
1572 
1573 	/* stats block from firmware is in network byte order.
1574 	   Need to swap it */
1575 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576 			"link_up",
1577 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1578 			0, mxge_handle_be32,
1579 			"I", "link up");
1580 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581 			"rdma_tags_available",
1582 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1583 			0, mxge_handle_be32,
1584 			"I", "rdma_tags_available");
1585 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1586 			"dropped_bad_crc32",
1587 			CTLTYPE_INT|CTLFLAG_RD,
1588 			&fw->dropped_bad_crc32,
1589 			0, mxge_handle_be32,
1590 			"I", "dropped_bad_crc32");
1591 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592 			"dropped_bad_phy",
1593 			CTLTYPE_INT|CTLFLAG_RD,
1594 			&fw->dropped_bad_phy,
1595 			0, mxge_handle_be32,
1596 			"I", "dropped_bad_phy");
1597 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598 			"dropped_link_error_or_filtered",
1599 			CTLTYPE_INT|CTLFLAG_RD,
1600 			&fw->dropped_link_error_or_filtered,
1601 			0, mxge_handle_be32,
1602 			"I", "dropped_link_error_or_filtered");
1603 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1604 			"dropped_link_overflow",
1605 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1606 			0, mxge_handle_be32,
1607 			"I", "dropped_link_overflow");
1608 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1609 			"dropped_multicast_filtered",
1610 			CTLTYPE_INT|CTLFLAG_RD,
1611 			&fw->dropped_multicast_filtered,
1612 			0, mxge_handle_be32,
1613 			"I", "dropped_multicast_filtered");
1614 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1615 			"dropped_no_big_buffer",
1616 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1617 			0, mxge_handle_be32,
1618 			"I", "dropped_no_big_buffer");
1619 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1620 			"dropped_no_small_buffer",
1621 			CTLTYPE_INT|CTLFLAG_RD,
1622 			&fw->dropped_no_small_buffer,
1623 			0, mxge_handle_be32,
1624 			"I", "dropped_no_small_buffer");
1625 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1626 			"dropped_overrun",
1627 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1628 			0, mxge_handle_be32,
1629 			"I", "dropped_overrun");
1630 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1631 			"dropped_pause",
1632 			CTLTYPE_INT|CTLFLAG_RD,
1633 			&fw->dropped_pause,
1634 			0, mxge_handle_be32,
1635 			"I", "dropped_pause");
1636 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1637 			"dropped_runt",
1638 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1639 			0, mxge_handle_be32,
1640 			"I", "dropped_runt");
1641 
1642 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1643 			"dropped_unicast_filtered",
1644 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1645 			0, mxge_handle_be32,
1646 			"I", "dropped_unicast_filtered");
1647 
1648 	/* verbose printing? */
1649 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1650 		       "verbose",
1651 		       CTLFLAG_RW, &mxge_verbose,
1652 		       0, "verbose printing");
1653 
1654 	/* lro */
1655 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1656 			"lro_cnt",
1657 			CTLTYPE_INT|CTLFLAG_RW, sc,
1658 			0, mxge_change_lro,
1659 			"I", "number of lro merge queues");
1660 
1661 
1662 	/* add counters exported for debugging from all slices */
1663 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1664 	sc->slice_sysctl_tree =
1665 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1666 				"slice", CTLFLAG_RD, 0, "");
1667 
1668 	for (slice = 0; slice < sc->num_slices; slice++) {
1669 		ss = &sc->ss[slice];
1670 		sysctl_ctx_init(&ss->sysctl_ctx);
1671 		ctx = &ss->sysctl_ctx;
1672 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1673 		sprintf(slice_num, "%d", slice);
1674 		ss->sysctl_tree =
1675 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1676 					CTLFLAG_RD, 0, "");
1677 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1678 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 			       "rx_small_cnt",
1680 			       CTLFLAG_RD, &ss->rx_small.cnt,
1681 			       0, "rx_small_cnt");
1682 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683 			       "rx_big_cnt",
1684 			       CTLFLAG_RD, &ss->rx_big.cnt,
1685 			       0, "rx_small_cnt");
1686 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1688 			       0, "number of lro merge queues flushed");
1689 
1690 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1692 			       0, "number of frames appended to lro merge"
1693 			       "queues");
1694 
1695 #ifndef IFNET_BUF_RING
1696 		/* only transmit from slice 0 for now */
1697 		if (slice > 0)
1698 			continue;
1699 #endif
1700 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1701 			       "tx_req",
1702 			       CTLFLAG_RD, &ss->tx.req,
1703 			       0, "tx_req");
1704 
1705 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1706 			       "tx_done",
1707 			       CTLFLAG_RD, &ss->tx.done,
1708 			       0, "tx_done");
1709 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1710 			       "tx_pkt_done",
1711 			       CTLFLAG_RD, &ss->tx.pkt_done,
1712 			       0, "tx_done");
1713 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1714 			       "tx_stall",
1715 			       CTLFLAG_RD, &ss->tx.stall,
1716 			       0, "tx_stall");
1717 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1718 			       "tx_wake",
1719 			       CTLFLAG_RD, &ss->tx.wake,
1720 			       0, "tx_wake");
1721 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1722 			       "tx_defrag",
1723 			       CTLFLAG_RD, &ss->tx.defrag,
1724 			       0, "tx_defrag");
1725 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1726 			       "tx_queue_active",
1727 			       CTLFLAG_RD, &ss->tx.queue_active,
1728 			       0, "tx_queue_active");
1729 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1730 			       "tx_activate",
1731 			       CTLFLAG_RD, &ss->tx.activate,
1732 			       0, "tx_activate");
1733 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1734 			       "tx_deactivate",
1735 			       CTLFLAG_RD, &ss->tx.deactivate,
1736 			       0, "tx_deactivate");
1737 	}
1738 }
1739 
1740 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1741    backwards one at a time and handle ring wraps */
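/* Writing the descriptors in reverse order matters here: combined with
   mxge_submit_req() re-writing the first descriptor's flags last, it
   ensures the NIC never sees a partially written chain. */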
1742 
1743 static inline void
1744 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1745 			    mcp_kreq_ether_send_t *src, int cnt)
1746 {
1747         int idx, starting_slot;
1748         starting_slot = tx->req;
1749         while (cnt > 1) {
1750                 cnt--;
1751                 idx = (starting_slot + cnt) & tx->mask;
1752                 mxge_pio_copy(&tx->lanai[idx],
1753 			      &src[cnt], sizeof(*src));
1754                 wmb();
1755         }
1756 }
1757 
1758 /*
1759  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1760  * at most 32 bytes at a time, so as to avoid involving the software
1761  * pio handler in the nic.   We re-write the first segment's flags
1762  * to mark them valid only after writing the entire chain
1763  */
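/* (each mcp_kreq_ether_send_t is 16 bytes, so the copy loop below
    moves two descriptors per 32-byte PIO write) */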
1764 
1765 static inline void
1766 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1767                   int cnt)
1768 {
1769         int idx, i;
1770         uint32_t *src_ints;
1771 	volatile uint32_t *dst_ints;
1772         mcp_kreq_ether_send_t *srcp;
1773 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1774 	uint8_t last_flags;
1775 
1776         idx = tx->req & tx->mask;
1777 
1778 	last_flags = src->flags;
1779 	src->flags = 0;
1780         wmb();
1781         dst = dstp = &tx->lanai[idx];
1782         srcp = src;
1783 
1784         if ((idx + cnt) < tx->mask) {
1785                 for (i = 0; i < (cnt - 1); i += 2) {
1786                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1787                         wmb(); /* force write every 32 bytes */
1788                         srcp += 2;
1789                         dstp += 2;
1790                 }
1791         } else {
1792                 /* submit all but the first request, and ensure
1793                    that it is submitted below */
1794                 mxge_submit_req_backwards(tx, src, cnt);
1795                 i = 0;
1796         }
1797         if (i < cnt) {
1798                 /* submit the first request */
1799                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1800                 wmb(); /* barrier before setting valid flag */
1801         }
1802 
1803         /* re-write the last 32-bits with the valid flags */
1804         src->flags = last_flags;
1805         src_ints = (uint32_t *)src;
1806         src_ints+=3;
1807         dst_ints = (volatile uint32_t *)dst;
1808         dst_ints+=3;
1809         *dst_ints =  *src_ints;
1810         tx->req += cnt;
1811         wmb();
1812 }
1813 
1814 #if IFCAP_TSO4
1815 
1816 static void
1817 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1818 	       int busdma_seg_cnt, int ip_off)
1819 {
1820 	mxge_tx_ring_t *tx;
1821 	mcp_kreq_ether_send_t *req;
1822 	bus_dma_segment_t *seg;
1823 	struct ip *ip;
1824 	struct tcphdr *tcp;
1825 	uint32_t low, high_swapped;
1826 	int len, seglen, cum_len, cum_len_next;
1827 	int next_is_first, chop, cnt, rdma_count, small;
1828 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1829 	uint8_t flags, flags_next;
1830 	static int once;
1831 
1832 	mss = m->m_pkthdr.tso_segsz;
1833 
1834 	/* negative cum_len signifies to the
1835 	 * send loop that we are still in the
1836 	 * header portion of the TSO packet.
1837 	 */
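	/* e.g. for a plain Ethernet + IPv4 + TCP frame with no options
	 * (ip_off = 14, ip_hl = 5, th_off = 5), cum_len starts at
	 * -(14 + 20 + 20) = -54 and first reaches zero exactly at the
	 * start of the TCP payload */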
1838 
1839 	/* ensure we have the ethernet, IP and TCP
1840 	   header together in the first mbuf, copy
1841 	   it to a scratch buffer if not */
1842 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1843 		m_copydata(m, 0, ip_off + sizeof (*ip),
1844 			   ss->scratch);
1845 		ip = (struct ip *)(ss->scratch + ip_off);
1846 	} else {
1847 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1848 	}
1849 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1850 			    + sizeof (*tcp))) {
1851 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1852 			   + sizeof (*tcp),  ss->scratch);
1853 		ip = (struct ip *)(ss->scratch + ip_off);
1854 	}
1855 
1856 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1857 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1858 
1859 	/* TSO implies checksum offload on this hardware */
1860 	cksum_offset = ip_off + (ip->ip_hl << 2);
1861 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1862 
1863 
1864 	/* for TSO, pseudo_hdr_offset holds mss.
1865 	 * The firmware figures out where to put
1866 	 * the checksum by parsing the header. */
1867 	pseudo_hdr_offset = htobe16(mss);
1868 
1869 	tx = &ss->tx;
1870 	req = tx->req_list;
1871 	seg = tx->seg_list;
1872 	cnt = 0;
1873 	rdma_count = 0;
1874 	/* "rdma_count" is the number of RDMAs belonging to the
1875 	 * current packet BEFORE the current send request. For
1876 	 * non-TSO packets, this is equal to "count".
1877 	 * For TSO packets, rdma_count needs to be reset
1878 	 * to 0 after a segment cut.
1879 	 *
1880 	 * The rdma_count field of the send request is
1881 	 * the number of RDMAs of the packet starting at
1882 	 * that request. For TSO send requests with one or more cuts
1883 	 * in the middle, this is the number of RDMAs starting
1884 	 * after the last cut in the request. All previous
1885 	 * segments before the last cut implicitly have 1 RDMA.
1886 	 *
1887 	 * Since the number of RDMAs is not known beforehand,
1888 	 * it must be filled-in retroactively - after each
1889 	 * segmentation cut or at the end of the entire packet.
1890 	 */
1891 
1892 	while (busdma_seg_cnt) {
1893 		/* Break the busdma segment up into pieces*/
1894 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1895 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1896 		len = seg->ds_len;
1897 
1898 		while (len) {
1899 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1900 			seglen = len;
1901 			cum_len_next = cum_len + seglen;
1902 			(req-rdma_count)->rdma_count = rdma_count + 1;
1903 			if (__predict_true(cum_len >= 0)) {
1904 				/* payload */
1905 				chop = (cum_len_next > mss);
1906 				cum_len_next = cum_len_next % mss;
1907 				next_is_first = (cum_len_next == 0);
1908 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1909 				flags_next |= next_is_first *
1910 					MXGEFW_FLAGS_FIRST;
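				/* branchless bookkeeping: a cut here
				 * (chop or next_is_first) forces
				 * rdma_count to -1; adding
				 * (chop & !next_is_first) leaves 0 for
				 * a mid-descriptor chop, so after the
				 * rdma_count++ below the new segment
				 * starts counting at 1 (or at 0 when
				 * the cut fell exactly on the
				 * boundary) */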
1911 				rdma_count |= -(chop | next_is_first);
1912 				rdma_count += chop & !next_is_first;
1913 			} else if (cum_len_next >= 0) {
1914 				/* header ends */
1915 				rdma_count = -1;
1916 				cum_len_next = 0;
1917 				seglen = -cum_len;
1918 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1919 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1920 					MXGEFW_FLAGS_FIRST |
1921 					(small * MXGEFW_FLAGS_SMALL);
1922 			}
1923 
1924 			req->addr_high = high_swapped;
1925 			req->addr_low = htobe32(low);
1926 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1927 			req->pad = 0;
1928 			req->rdma_count = 1;
1929 			req->length = htobe16(seglen);
1930 			req->cksum_offset = cksum_offset;
1931 			req->flags = flags | ((cum_len & 1) *
1932 					      MXGEFW_FLAGS_ALIGN_ODD);
1933 			low += seglen;
1934 			len -= seglen;
1935 			cum_len = cum_len_next;
1936 			flags = flags_next;
1937 			req++;
1938 			cnt++;
1939 			rdma_count++;
1940 			if (__predict_false(cksum_offset > seglen))
1941 				cksum_offset -= seglen;
1942 			else
1943 				cksum_offset = 0;
1944 			if (__predict_false(cnt > tx->max_desc))
1945 				goto drop;
1946 		}
1947 		busdma_seg_cnt--;
1948 		seg++;
1949 	}
1950 	(req-rdma_count)->rdma_count = rdma_count;
1951 
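	/* walk backwards, tagging each descriptor of the final TSO
	 * segment with TSO_LAST; stop at the descriptor that begins
	 * that segment (marked CHOP or FIRST) */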
1952 	do {
1953 		req--;
1954 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1955 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1956 
1957 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1958 	mxge_submit_req(tx, tx->req_list, cnt);
1959 #ifdef IFNET_BUF_RING
1960 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1961 		/* tell the NIC to start polling this slice */
1962 		*tx->send_go = 1;
1963 		tx->queue_active = 1;
1964 		tx->activate++;
1965 		wmb();
1966 	}
1967 #endif
1968 	return;
1969 
1970 drop:
1971 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1972 	m_freem(m);
1973 	ss->oerrors++;
1974 	if (!once) {
1975 		printf("tx->max_desc exceeded via TSO!\n");
1976 		printf("mss = %d, %ld, %d!\n", mss,
1977 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1978 		once = 1;
1979 	}
1980 	return;
1981 
1982 }
1983 
1984 #endif /* IFCAP_TSO4 */
1985 
1986 #ifdef MXGE_NEW_VLAN_API
1987 /*
1988  * We reproduce the software vlan tag insertion from
1989  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1990  * vlan tag insertion. We need to advertise this in order to have the
1991  * vlan interface respect our csum offload flags.
1992  */
1993 static struct mbuf *
1994 mxge_vlan_tag_insert(struct mbuf *m)
1995 {
1996 	struct ether_vlan_header *evl;
1997 
1998 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1999 	if (__predict_false(m == NULL))
2000 		return NULL;
2001 	if (m->m_len < sizeof(*evl)) {
2002 		m = m_pullup(m, sizeof(*evl));
2003 		if (__predict_false(m == NULL))
2004 			return NULL;
2005 	}
2006 	/*
2007 	 * Transform the Ethernet header into an Ethernet header
2008 	 * with 802.1Q encapsulation.
2009 	 */
2010 	evl = mtod(m, struct ether_vlan_header *);
2011 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2012 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2013 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2014 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2015 	m->m_flags &= ~M_VLANTAG;
2016 	return m;
2017 }
2018 #endif /* MXGE_NEW_VLAN_API */
2019 
2020 static void
2021 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2022 {
2023 	mxge_softc_t *sc;
2024 	mcp_kreq_ether_send_t *req;
2025 	bus_dma_segment_t *seg;
2026 	struct mbuf *m_tmp;
2027 	struct ifnet *ifp;
2028 	mxge_tx_ring_t *tx;
2029 	struct ip *ip;
2030 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2031 	uint16_t pseudo_hdr_offset;
2032         uint8_t flags, cksum_offset;
2033 
2034 
2035 	sc = ss->sc;
2036 	ifp = sc->ifp;
2037 	tx = &ss->tx;
2038 
2039 	ip_off = sizeof (struct ether_header);
2040 #ifdef MXGE_NEW_VLAN_API
2041 	if (m->m_flags & M_VLANTAG) {
2042 		m = mxge_vlan_tag_insert(m);
2043 		if (__predict_false(m == NULL))
2044 			goto drop;
2045 		ip_off += ETHER_VLAN_ENCAP_LEN;
2046 	}
2047 #endif
2048 	/* (try to) map the frame for DMA */
2049 	idx = tx->req & tx->mask;
2050 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2051 				      m, tx->seg_list, &cnt,
2052 				      BUS_DMA_NOWAIT);
2053 	if (__predict_false(err == EFBIG)) {
2054 		/* Too many segments in the chain.  Try
2055 		   to defrag */
2056 		m_tmp = m_defrag(m, M_NOWAIT);
2057 		if (m_tmp == NULL) {
2058 			goto drop;
2059 		}
2060 		ss->tx.defrag++;
2061 		m = m_tmp;
2062 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2063 					      tx->info[idx].map,
2064 					      m, tx->seg_list, &cnt,
2065 					      BUS_DMA_NOWAIT);
2066 	}
2067 	if (__predict_false(err != 0)) {
2068 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2069 			      " packet len = %d\n", err, m->m_pkthdr.len);
2070 		goto drop;
2071 	}
2072 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2073 			BUS_DMASYNC_PREWRITE);
2074 	tx->info[idx].m = m;
2075 
2076 #if IFCAP_TSO4
2077 	/* TSO is different enough, we handle it in another routine */
2078 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2079 		mxge_encap_tso(ss, m, cnt, ip_off);
2080 		return;
2081 	}
2082 #endif
2083 
2084 	req = tx->req_list;
2085 	cksum_offset = 0;
2086 	pseudo_hdr_offset = 0;
2087 	flags = MXGEFW_FLAGS_NO_TSO;
2088 
2089 	/* checksum offloading? */
2090 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2091 		/* ensure ip header is in first mbuf, copy
2092 		   it to a scratch buffer if not */
2093 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2094 			m_copydata(m, 0, ip_off + sizeof (*ip),
2095 				   ss->scratch);
2096 			ip = (struct ip *)(ss->scratch + ip_off);
2097 		} else {
2098 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2099 		}
2100 		cksum_offset = ip_off + (ip->ip_hl << 2);
2101 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2102 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2103 		req->cksum_offset = cksum_offset;
2104 		flags |= MXGEFW_FLAGS_CKSUM;
2105 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2106 	} else {
2107 		odd_flag = 0;
2108 	}
2109 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2110 		flags |= MXGEFW_FLAGS_SMALL;
2111 
2112 	/* convert segments into a request list */
2113 	cum_len = 0;
2114 	seg = tx->seg_list;
2115 	req->flags = MXGEFW_FLAGS_FIRST;
2116 	for (i = 0; i < cnt; i++) {
2117 		req->addr_low =
2118 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2119 		req->addr_high =
2120 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2121 		req->length = htobe16(seg->ds_len);
2122 		req->cksum_offset = cksum_offset;
2123 		if (cksum_offset > seg->ds_len)
2124 			cksum_offset -= seg->ds_len;
2125 		else
2126 			cksum_offset = 0;
2127 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2128 		req->pad = 0; /* complete solid 16-byte block */
2129 		req->rdma_count = 1;
2130 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2131 		cum_len += seg->ds_len;
2132 		seg++;
2133 		req++;
2134 		req->flags = 0;
2135 	}
2136 	req--;
2137 	/* pad runts to 60 bytes */
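	/* 60 is ETHER_MIN_LEN (64) minus the 4-byte CRC the NIC
	 * appends; the extra descriptor below points at zeropad_dma
	 * so the balance of the frame is read as zero padding */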
2138 	if (cum_len < 60) {
2139 		req++;
2140 		req->addr_low =
2141 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2142 		req->addr_high =
2143 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2144 		req->length = htobe16(60 - cum_len);
2145 		req->cksum_offset = 0;
2146 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2147 		req->pad = 0; /* complete solid 16-byte block */
2148 		req->rdma_count = 1;
2149 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2150 		cnt++;
2151 	}
2152 
2153 	tx->req_list[0].rdma_count = cnt;
2154 #if 0
2155 	/* print what the firmware will see */
2156 	for (i = 0; i < cnt; i++) {
2157 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2158 		    "cso:%d, flags:0x%x, rdma:%d\n",
2159 		    i, (int)ntohl(tx->req_list[i].addr_high),
2160 		    (int)ntohl(tx->req_list[i].addr_low),
2161 		    (int)ntohs(tx->req_list[i].length),
2162 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2163 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2164 		    tx->req_list[i].rdma_count);
2165 	}
2166 	printf("--------------\n");
2167 #endif
2168 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2169 	mxge_submit_req(tx, tx->req_list, cnt);
2170 #ifdef IFNET_BUF_RING
2171 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2172 		/* tell the NIC to start polling this slice */
2173 		*tx->send_go = 1;
2174 		tx->queue_active = 1;
2175 		tx->activate++;
2176 		wmb();
2177 	}
2178 #endif
2179 	return;
2180 
2181 drop:
2182 	m_freem(m);
2183 	ss->oerrors++;
2184 	return;
2185 }
2186 
2187 #ifdef IFNET_BUF_RING
2188 static void
2189 mxge_qflush(struct ifnet *ifp)
2190 {
2191 	mxge_softc_t *sc = ifp->if_softc;
2192 	mxge_tx_ring_t *tx;
2193 	struct mbuf *m;
2194 	int slice;
2195 
2196 	for (slice = 0; slice < sc->num_slices; slice++) {
2197 		tx = &sc->ss[slice].tx;
2198 		mtx_lock(&tx->mtx);
2199 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2200 			m_freem(m);
2201 		mtx_unlock(&tx->mtx);
2202 	}
2203 	if_qflush(ifp);
2204 }
2205 
2206 static inline void
2207 mxge_start_locked(struct mxge_slice_state *ss)
2208 {
2209 	mxge_softc_t *sc;
2210 	struct mbuf *m;
2211 	struct ifnet *ifp;
2212 	mxge_tx_ring_t *tx;
2213 
2214 	sc = ss->sc;
2215 	ifp = sc->ifp;
2216 	tx = &ss->tx;
2217 
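	/* free slots = mask - (req - done); keep dequeueing only while
	 * more than a worst-case packet (max_desc descriptors) still
	 * fits, so mxge_encap() can never overrun the ring */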
2218 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2219 		m = drbr_dequeue(ifp, tx->br);
2220 		if (m == NULL) {
2221 			return;
2222 		}
2223 		/* let BPF see it */
2224 		BPF_MTAP(ifp, m);
2225 
2226 		/* give it to the nic */
2227 		mxge_encap(ss, m);
2228 	}
2229 	/* ran out of transmit slots */
2230 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2231 	    && (!drbr_empty(ifp, tx->br))) {
2232 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2233 		tx->stall++;
2234 	}
2235 }
2236 
2237 static int
2238 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2239 {
2240 	mxge_softc_t *sc;
2241 	struct ifnet *ifp;
2242 	mxge_tx_ring_t *tx;
2243 	int err;
2244 
2245 	sc = ss->sc;
2246 	ifp = sc->ifp;
2247 	tx = &ss->tx;
2248 
2249 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2250 	    IFF_DRV_RUNNING) {
2251 		err = drbr_enqueue(ifp, tx->br, m);
2252 		return (err);
2253 	}
2254 
2255 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2256 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2257 		/* let BPF see it */
2258 		BPF_MTAP(ifp, m);
2259 		/* give it to the nic */
2260 		mxge_encap(ss, m);
2261 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2262 		return (err);
2263 	}
2264 	if (!drbr_empty(ifp, tx->br))
2265 		mxge_start_locked(ss);
2266 	return (0);
2267 }
2268 
2269 static int
2270 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2271 {
2272 	mxge_softc_t *sc = ifp->if_softc;
2273 	struct mxge_slice_state *ss;
2274 	mxge_tx_ring_t *tx;
2275 	int err = 0;
2276 	int slice;
2277 
2278 	slice = m->m_pkthdr.flowid;
2279 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2280 
2281 	ss = &sc->ss[slice];
2282 	tx = &ss->tx;
2283 
2284 	if (mtx_trylock(&tx->mtx)) {
2285 		err = mxge_transmit_locked(ss, m);
2286 		mtx_unlock(&tx->mtx);
2287 	} else {
2288 		err = drbr_enqueue(ifp, tx->br, m);
2289 	}
2290 
2291 	return (err);
2292 }
2293 
2294 #else
2295 
2296 static inline void
2297 mxge_start_locked(struct mxge_slice_state *ss)
2298 {
2299 	mxge_softc_t *sc;
2300 	struct mbuf *m;
2301 	struct ifnet *ifp;
2302 	mxge_tx_ring_t *tx;
2303 
2304 	sc = ss->sc;
2305 	ifp = sc->ifp;
2306 	tx = &ss->tx;
2307 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2308 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2309 		if (m == NULL) {
2310 			return;
2311 		}
2312 		/* let BPF see it */
2313 		BPF_MTAP(ifp, m);
2314 
2315 		/* give it to the nic */
2316 		mxge_encap(ss, m);
2317 	}
2318 	/* ran out of transmit slots */
2319 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2320 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2321 		tx->stall++;
2322 	}
2323 }
2324 #endif
2325 static void
2326 mxge_start(struct ifnet *ifp)
2327 {
2328 	mxge_softc_t *sc = ifp->if_softc;
2329 	struct mxge_slice_state *ss;
2330 
2331 	/* only use the first slice for now */
2332 	ss = &sc->ss[0];
2333 	mtx_lock(&ss->tx.mtx);
2334 	mxge_start_locked(ss);
2335 	mtx_unlock(&ss->tx.mtx);
2336 }
2337 
2338 /*
2339  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2340  * at most 32 bytes at a time, so as to avoid involving the software
2341  * pio handler in the nic.   We re-write the first segment's low
2342  * DMA address to mark it valid only after we write the entire chunk
2343  * in a burst
2344  */
2345 static inline void
2346 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2347 		mcp_kreq_ether_recv_t *src)
2348 {
2349 	uint32_t low;
2350 
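	/* stash the real address and write an invalid one in its
	 * place, so the firmware ignores the first descriptor until
	 * the entire batch of 8 has landed; restoring addr_low below
	 * hands the batch over atomically */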
2351 	low = src->addr_low;
2352 	src->addr_low = 0xffffffff;
2353 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2354 	wmb();
2355 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2356 	wmb();
2357 	src->addr_low = low;
2358 	dst->addr_low = low;
2359 	wmb();
2360 }
2361 
2362 static int
2363 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2364 {
2365 	bus_dma_segment_t seg;
2366 	struct mbuf *m;
2367 	mxge_rx_ring_t *rx = &ss->rx_small;
2368 	int cnt, err;
2369 
2370 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2371 	if (m == NULL) {
2372 		rx->alloc_fail++;
2373 		err = ENOBUFS;
2374 		goto done;
2375 	}
2376 	m->m_len = MHLEN;
2377 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2378 				      &seg, &cnt, BUS_DMA_NOWAIT);
2379 	if (err != 0) {
2380 		m_free(m);
2381 		goto done;
2382 	}
2383 	rx->info[idx].m = m;
2384 	rx->shadow[idx].addr_low =
2385 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2386 	rx->shadow[idx].addr_high =
2387 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2388 
2389 done:
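	/* receive buffers are handed to the NIC in batches of 8
	 * (8-byte descriptors, i.e. two 32-byte bursts in
	 * mxge_submit_8rx), so kick the hardware only on the 8th slot
	 * of a group; this runs even on the error path so the
	 * recycled buffer gets re-posted */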
2390 	if ((idx & 7) == 7)
2391 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2392 	return err;
2393 }
2394 
2395 static int
2396 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2397 {
2398 	bus_dma_segment_t seg[3];
2399 	struct mbuf *m;
2400 	mxge_rx_ring_t *rx = &ss->rx_big;
2401 	int cnt, err, i;
2402 
2403 	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2404 	if (m == NULL) {
2405 		rx->alloc_fail++;
2406 		err = ENOBUFS;
2407 		goto done;
2408 	}
2409 	m->m_len = rx->mlen;
2410 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2411 				      seg, &cnt, BUS_DMA_NOWAIT);
2412 	if (err != 0) {
2413 		m_free(m);
2414 		goto done;
2415 	}
2416 	rx->info[idx].m = m;
2417 	rx->shadow[idx].addr_low =
2418 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2419 	rx->shadow[idx].addr_high =
2420 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2421 
2422 #if MXGE_VIRT_JUMBOS
2423 	for (i = 1; i < cnt; i++) {
2424 		rx->shadow[idx + i].addr_low =
2425 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2426 		rx->shadow[idx + i].addr_high =
2427 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2428 	}
2429 #endif
2430 
2431 done:
2432 	for (i = 0; i < rx->nbufs; i++) {
2433 		if ((idx & 7) == 7) {
2434 			mxge_submit_8rx(&rx->lanai[idx - 7],
2435 					&rx->shadow[idx - 7]);
2436 		}
2437 		idx++;
2438 	}
2439 	return err;
2440 }
2441 
2442 /*
2443  *  Myri10GE hardware checksums are not valid if the sender
2444  *  padded the frame with non-zero padding.  This is because
2445  *  the firmware just does a simple 16-bit 1s complement
2446  *  checksum across the entire frame, excluding the first 14
2447  *  bytes.  It is best to simply check the checksum and
2448  *  tell the stack about it only if the checksum is good
2449  */
2450 
2451 static inline uint16_t
2452 mxge_rx_csum(struct mbuf *m, int csum)
2453 {
2454 	struct ether_header *eh;
2455 	struct ip *ip;
2456 	uint16_t c;
2457 
2458 	eh = mtod(m, struct ether_header *);
2459 
2460 	/* only deal with IPv4 TCP & UDP for now */
2461 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2462 		return 1;
2463 	ip = (struct ip *)(eh + 1);
2464 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2465 			    ip->ip_p != IPPROTO_UDP))
2466 		return 1;
2467 #ifdef INET
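	/* the firmware's sum covers everything past the 14-byte
	 * Ethernet header.  A correct IP header checksum sums to
	 * 0xffff (zero in one's complement), so the IP header drops
	 * out; in_pseudo() then adds the pseudo-header, with
	 * ip_len - (ip_hl << 2) supplying the TCP/UDP length.  A good
	 * checksum folds to 0xffff, which the xor below turns into 0
	 * for the caller */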
2468 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2469 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2470 			    (ip->ip_hl << 2) + ip->ip_p));
2471 #else
2472 	c = 1;
2473 #endif
2474 	c ^= 0xffff;
2475 	return (c);
2476 }
2477 
2478 static void
2479 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2480 {
2481 	struct ether_vlan_header *evl;
2482 	struct ether_header *eh;
2483 	uint32_t partial;
2484 
2485 	evl = mtod(m, struct ether_vlan_header *);
2486 	eh = mtod(m, struct ether_header *);
2487 
2488 	/*
2489 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2490 	 * after what the firmware thought was the end of the ethernet
2491 	 * header.
2492 	 */
2493 
2494 	/* put checksum into host byte order */
2495 	*csum = ntohs(*csum);
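	/* 'partial' is the 32-bit word just past what the firmware
	 * took to be the Ethernet header; once the 4-byte
	 * encapsulation is stripped below it is no longer inside the
	 * checksummed region, so subtract it in one's complement
	 * (add ~partial, then fold the carries back into 16 bits) */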
2496 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2497 	(*csum) += ~partial;
2498 	(*csum) +=  ((*csum) < ~partial);
2499 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2500 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2501 
2502 	/* restore checksum to network byte order;
2503 	   later consumers expect this */
2504 	*csum = htons(*csum);
2505 
2506 	/* save the tag */
2507 #ifdef MXGE_NEW_VLAN_API
2508 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2509 #else
2510 	{
2511 		struct m_tag *mtag;
2512 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2513 				   M_NOWAIT);
2514 		if (mtag == NULL)
2515 			return;
2516 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2517 		m_tag_prepend(m, mtag);
2518 	}
2519 
2520 #endif
2521 	m->m_flags |= M_VLANTAG;
2522 
2523 	/*
2524 	 * Remove the 802.1q header by copying the Ethernet
2525 	 * addresses over it and adjusting the beginning of
2526 	 * the data in the mbuf.  The encapsulated Ethernet
2527 	 * type field is already in place.
2528 	 */
2529 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2530 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2531 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2532 }
2533 
2534 
2535 static inline void
2536 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2537 {
2538 	mxge_softc_t *sc;
2539 	struct ifnet *ifp;
2540 	struct mbuf *m;
2541 	struct ether_header *eh;
2542 	mxge_rx_ring_t *rx;
2543 	bus_dmamap_t old_map;
2544 	int idx;
2545 	uint16_t tcpudp_csum;
2546 
2547 	sc = ss->sc;
2548 	ifp = sc->ifp;
2549 	rx = &ss->rx_big;
2550 	idx = rx->cnt & rx->mask;
2551 	rx->cnt += rx->nbufs;
2552 	/* save a pointer to the received mbuf */
2553 	m = rx->info[idx].m;
2554 	/* try to replace the received mbuf */
2555 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2556 		/* drop the frame -- the old mbuf is re-cycled */
2557 		ifp->if_ierrors++;
2558 		return;
2559 	}
2560 
2561 	/* unmap the received buffer */
2562 	old_map = rx->info[idx].map;
2563 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2564 	bus_dmamap_unload(rx->dmat, old_map);
2565 
2566 	/* swap the bus_dmamap_t's */
2567 	rx->info[idx].map = rx->extra_map;
2568 	rx->extra_map = old_map;
2569 
2570 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2571 	 * aligned */
2572 	m->m_data += MXGEFW_PAD;
2573 
2574 	m->m_pkthdr.rcvif = ifp;
2575 	m->m_len = m->m_pkthdr.len = len;
2576 	ss->ipackets++;
2577 	eh = mtod(m, struct ether_header *);
2578 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2579 		mxge_vlan_tag_remove(m, &csum);
2580 	}
2581 	/* if the checksum is valid, mark it in the mbuf header */
2582 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2583 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2584 			return;
2585 		/* otherwise, it was a UDP frame, or a TCP frame which
2586 		   we could not do LRO on.  Tell the stack that the
2587 		   checksum is good */
2588 		m->m_pkthdr.csum_data = 0xffff;
2589 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2590 	}
2591 	/* flowid only valid if RSS hashing is enabled */
2592 	if (sc->num_slices > 1) {
2593 		m->m_pkthdr.flowid = (ss - sc->ss);
2594 		m->m_flags |= M_FLOWID;
2595 	}
2596 	/* pass the frame up the stack */
2597 	(*ifp->if_input)(ifp, m);
2598 }
2599 
2600 static inline void
2601 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2602 {
2603 	mxge_softc_t *sc;
2604 	struct ifnet *ifp;
2605 	struct ether_header *eh;
2606 	struct mbuf *m;
2607 	mxge_rx_ring_t *rx;
2608 	bus_dmamap_t old_map;
2609 	int idx;
2610 	uint16_t tcpudp_csum;
2611 
2612 	sc = ss->sc;
2613 	ifp = sc->ifp;
2614 	rx = &ss->rx_small;
2615 	idx = rx->cnt & rx->mask;
2616 	rx->cnt++;
2617 	/* save a pointer to the received mbuf */
2618 	m = rx->info[idx].m;
2619 	/* try to replace the received mbuf */
2620 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621 		/* drop the frame -- the old mbuf is re-cycled */
2622 		ifp->if_ierrors++;
2623 		return;
2624 	}
2625 
2626 	/* unmap the received buffer */
2627 	old_map = rx->info[idx].map;
2628 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629 	bus_dmamap_unload(rx->dmat, old_map);
2630 
2631 	/* swap the bus_dmamap_t's */
2632 	rx->info[idx].map = rx->extra_map;
2633 	rx->extra_map = old_map;
2634 
2635 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2636 	 * aligned */
2637 	m->m_data += MXGEFW_PAD;
2638 
2639 	m->m_pkthdr.rcvif = ifp;
2640 	m->m_len = m->m_pkthdr.len = len;
2641 	ss->ipackets++;
2642 	eh = mtod(m, struct ether_header *);
2643 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644 		mxge_vlan_tag_remove(m, &csum);
2645 	}
2646 	/* if the checksum is valid, mark it in the mbuf header */
2647 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2649 			return;
2650 		/* otherwise, it was a UDP frame, or a TCP frame which
2651 		   we could not do LRO on.  Tell the stack that the
2652 		   checksum is good */
2653 		m->m_pkthdr.csum_data = 0xffff;
2654 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2655 	}
2656 	/* flowid only valid if RSS hashing is enabled */
2657 	if (sc->num_slices > 1) {
2658 		m->m_pkthdr.flowid = (ss - sc->ss);
2659 		m->m_flags |= M_FLOWID;
2660 	}
2661 	/* pass the frame up the stack */
2662 	(*ifp->if_input)(ifp, m);
2663 }
2664 
2665 static inline void
2666 mxge_clean_rx_done(struct mxge_slice_state *ss)
2667 {
2668 	mxge_rx_done_t *rx_done = &ss->rx_done;
2669 	int limit = 0;
2670 	uint16_t length;
2671 	uint16_t checksum;
2672 
2673 
2674 	while (rx_done->entry[rx_done->idx].length != 0) {
2675 		length = ntohs(rx_done->entry[rx_done->idx].length);
2676 		rx_done->entry[rx_done->idx].length = 0;
2677 		checksum = rx_done->entry[rx_done->idx].checksum;
2678 		if (length <= (MHLEN - MXGEFW_PAD))
2679 			mxge_rx_done_small(ss, length, checksum);
2680 		else
2681 			mxge_rx_done_big(ss, length, checksum);
2682 		rx_done->cnt++;
2683 		rx_done->idx = rx_done->cnt & rx_done->mask;
2684 
2685 		/* limit potential for livelock */
2686 		if (__predict_false(++limit > rx_done->mask / 2))
2687 			break;
2688 	}
2689 #ifdef INET
2690 	while (!SLIST_EMPTY(&ss->lro_active)) {
2691 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2692 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2693 		mxge_lro_flush(ss, lro);
2694 	}
2695 #endif
2696 }
2697 
2698 
2699 static inline void
2700 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2701 {
2702 	struct ifnet *ifp;
2703 	mxge_tx_ring_t *tx;
2704 	struct mbuf *m;
2705 	bus_dmamap_t map;
2706 	int idx;
2707 	int *flags;
2708 
2709 	tx = &ss->tx;
2710 	ifp = ss->sc->ifp;
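	/* mcp_idx is the firmware's count of completed sends.
	 * tx->done counts retired descriptors, while tx->pkt_done
	 * counts packets: it advances only when a descriptor with
	 * info[].flag set (the last one of its packet) is reclaimed */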
2711 	while (tx->pkt_done != mcp_idx) {
2712 		idx = tx->done & tx->mask;
2713 		tx->done++;
2714 		m = tx->info[idx].m;
2715 		/* mbuf and DMA map only attached to the first
2716 		   segment per-mbuf */
2717 		if (m != NULL) {
2718 			ss->obytes += m->m_pkthdr.len;
2719 			if (m->m_flags & M_MCAST)
2720 				ss->omcasts++;
2721 			ss->opackets++;
2722 			tx->info[idx].m = NULL;
2723 			map = tx->info[idx].map;
2724 			bus_dmamap_unload(tx->dmat, map);
2725 			m_freem(m);
2726 		}
2727 		if (tx->info[idx].flag) {
2728 			tx->info[idx].flag = 0;
2729 			tx->pkt_done++;
2730 		}
2731 	}
2732 
2733 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2734            it's OK to send packets */
2735 #ifdef IFNET_BUF_RING
2736 	flags = &ss->if_drv_flags;
2737 #else
2738 	flags = &ifp->if_drv_flags;
2739 #endif
2740 	mtx_lock(&ss->tx.mtx);
2741 	if ((*flags) & IFF_DRV_OACTIVE &&
2742 	    tx->req - tx->done < (tx->mask + 1)/4) {
2743 		*(flags) &= ~IFF_DRV_OACTIVE;
2744 		ss->tx.wake++;
2745 		mxge_start_locked(ss);
2746 	}
2747 #ifdef IFNET_BUF_RING
2748 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2749 		/* let the NIC stop polling this queue, since there
2750 		 * are no more transmits pending */
2752 		*tx->send_stop = 1;
2753 		tx->queue_active = 0;
2754 		tx->deactivate++;
2755 		wmb();
2757 	}
2758 #endif
2759 	mtx_unlock(&ss->tx.mtx);
2760 
2761 }
2762 
2763 static struct mxge_media_type mxge_xfp_media_types[] =
2764 {
2765 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2766 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2767 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2768 	{0,		(1 << 5),	"10GBASE-ER"},
2769 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2770 	{0,		(1 << 3),	"10GBASE-SW"},
2771 	{0,		(1 << 2),	"10GBASE-LW"},
2772 	{0,		(1 << 1),	"10GBASE-EW"},
2773 	{0,		(1 << 0),	"Reserved"}
2774 };
2775 static struct mxge_media_type mxge_sfp_media_types[] =
2776 {
2777 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2778 	{0,		(1 << 7),	"Reserved"},
2779 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2780 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2781 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2782 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2783 };
2784 
2785 static void
2786 mxge_media_set(mxge_softc_t *sc, int media_type)
2787 {
2788 
2789 
2790 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2791 		    0, NULL);
2792 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2793 	sc->current_media = media_type;
2794 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2795 }
2796 
2797 static void
2798 mxge_media_init(mxge_softc_t *sc)
2799 {
2800 	char *ptr;
2801 	int i;
2802 
2803 	ifmedia_removeall(&sc->media);
2804 	mxge_media_set(sc, IFM_AUTO);
2805 
2806 	/*
2807 	 * parse the product code to determine the interface type
2808 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2809 	 * after the 3rd dash in the driver's cached copy of the
2810 	 * EEPROM's product code string.
2811 	 */
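	/* e.g. for a hypothetical product code like "10G-PCIE-8A-R",
	 * the character after the third dash is 'R', selecting XFP */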
2812 	ptr = sc->product_code_string;
2813 	if (ptr == NULL) {
2814 		device_printf(sc->dev, "Missing product code\n");
2815 		return;
2816 	}
2817 
2818 	for (i = 0; i < 3; i++, ptr++) {
2819 		ptr = index(ptr, '-');
2820 		if (ptr == NULL) {
2821 			device_printf(sc->dev,
2822 				      "only %d dashes in PC?!?\n", i);
2823 			return;
2824 		}
2825 	}
2826 	if (*ptr == 'C') {
2827 		/* -C is CX4 */
2828 		sc->connector = MXGE_CX4;
2829 		mxge_media_set(sc, IFM_10G_CX4);
2830 	} else if (*ptr == 'Q') {
2831 		/* -Q is Quad Ribbon Fiber */
2832 		sc->connector = MXGE_QRF;
2833 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2834 		/* FreeBSD has no media type for Quad ribbon fiber */
2835 	} else if (*ptr == 'R') {
2836 		/* -R is XFP */
2837 		sc->connector = MXGE_XFP;
2838 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2839 		/* -S or -2S is SFP+ */
2840 		sc->connector = MXGE_SFP;
2841 	} else {
2842 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2843 	}
2844 }
2845 
2846 /*
2847  * Determine the media type for a NIC.  Some XFPs will identify
2848  * themselves only when their link is up, so this is initiated via a
2849  * link up interrupt.  However, this can potentially take up to
2850  * several milliseconds, so it is run via the watchdog routine, rather
2851  * than in the interrupt handler itself.
2852  */
2853 static void
2854 mxge_media_probe(mxge_softc_t *sc)
2855 {
2856 	mxge_cmd_t cmd;
2857 	char *cage_type;
2858 
2859 	struct mxge_media_type *mxge_media_types = NULL;
2860 	int i, err, ms, mxge_media_type_entries;
2861 	uint32_t byte;
2862 
2863 	sc->need_media_probe = 0;
2864 
2865 	if (sc->connector == MXGE_XFP) {
2866 		/* -R is XFP */
2867 		mxge_media_types = mxge_xfp_media_types;
2868 		mxge_media_type_entries =
2869 			sizeof (mxge_xfp_media_types) /
2870 			sizeof (mxge_xfp_media_types[0]);
2871 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2872 		cage_type = "XFP";
2873 	} else 	if (sc->connector == MXGE_SFP) {
2874 		/* -S or -2S is SFP+ */
2875 		mxge_media_types = mxge_sfp_media_types;
2876 		mxge_media_type_entries =
2877 			sizeof (mxge_sfp_media_types) /
2878 			sizeof (mxge_sfp_media_types[0]);
2879 		cage_type = "SFP+";
2880 		byte = 3;
2881 	} else {
2882 		/* nothing to do; media type cannot change */
2883 		return;
2884 	}
2885 
2886 	/*
2887 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2888 	 * now we try to determine what is in the cage by using the
2889 	 * firmware's I2C commands to read the module's 10GbE compliance
2890 	 * register.  We read just one byte, which may take over
2891 	 * a millisecond.
2892 	 */
2893 
2894 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2895 	cmd.data1 = byte;
2896 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2897 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2898 		device_printf(sc->dev, "failed to read XFP\n");
2899 	}
2900 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2901 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2902 	}
2903 	if (err != MXGEFW_CMD_OK) {
2904 		return;
2905 	}
2906 
2907 	/* now we wait for the data to be cached */
2908 	cmd.data0 = byte;
2909 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2910 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2911 		DELAY(1000);
2912 		cmd.data0 = byte;
2913 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2914 	}
2915 	if (err != MXGEFW_CMD_OK) {
2916 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2917 			      cage_type, err, ms);
2918 		return;
2919 	}
2920 
2921 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2922 		if (mxge_verbose)
2923 			device_printf(sc->dev, "%s:%s\n", cage_type,
2924 				      mxge_media_types[0].name);
2925 		if (sc->current_media != mxge_media_types[0].flag) {
2926 			mxge_media_init(sc);
2927 			mxge_media_set(sc, mxge_media_types[0].flag);
2928 		}
2929 		return;
2930 	}
2931 	for (i = 1; i < mxge_media_type_entries; i++) {
2932 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2933 			if (mxge_verbose)
2934 				device_printf(sc->dev, "%s:%s\n",
2935 					      cage_type,
2936 					      mxge_media_types[i].name);
2937 
2938 			if (sc->current_media != mxge_media_types[i].flag) {
2939 				mxge_media_init(sc);
2940 				mxge_media_set(sc, mxge_media_types[i].flag);
2941 			}
2942 			return;
2943 		}
2944 	}
2945 	if (mxge_verbose)
2946 		device_printf(sc->dev, "%s media 0x%x unknown\n",
2947 			      cage_type, cmd.data0);
2948 
2949 	return;
2950 }
2951 
2952 static void
2953 mxge_intr(void *arg)
2954 {
2955 	struct mxge_slice_state *ss = arg;
2956 	mxge_softc_t *sc = ss->sc;
2957 	mcp_irq_data_t *stats = ss->fw_stats;
2958 	mxge_tx_ring_t *tx = &ss->tx;
2959 	mxge_rx_done_t *rx_done = &ss->rx_done;
2960 	uint32_t send_done_count;
2961 	uint8_t valid;
2962 
2963 
2964 #ifndef IFNET_BUF_RING
2965 	/* an interrupt on a non-zero slice is implicitly valid
2966 	   since MSI-X irqs are not shared */
2967 	if (ss != sc->ss) {
2968 		mxge_clean_rx_done(ss);
2969 		*ss->irq_claim = be32toh(3);
2970 		return;
2971 	}
2972 #endif
2973 
2974 	/* make sure the DMA has finished */
2975 	if (!stats->valid) {
2976 		return;
2977 	}
2978 	valid = stats->valid;
2979 
2980 	if (sc->legacy_irq) {
2981 		/* lower legacy IRQ  */
2982 		*sc->irq_deassert = 0;
2983 		if (!mxge_deassert_wait)
2984 			/* don't wait for conf. that irq is low */
2985 			stats->valid = 0;
2986 	} else {
2987 		stats->valid = 0;
2988 	}
2989 
2990 	/* loop while waiting for legacy irq deassertion */
2991 	do {
2992 		/* check for transmit completes and receives */
2993 		send_done_count = be32toh(stats->send_done_count);
2994 		while ((send_done_count != tx->pkt_done) ||
2995 		       (rx_done->entry[rx_done->idx].length != 0)) {
2996 			if (send_done_count != tx->pkt_done)
2997 				mxge_tx_done(ss, (int)send_done_count);
2998 			mxge_clean_rx_done(ss);
2999 			send_done_count = be32toh(stats->send_done_count);
3000 		}
3001 		if (sc->legacy_irq && mxge_deassert_wait)
3002 			wmb();
3003 	} while (*((volatile uint8_t *) &stats->valid));
3004 
3005 	/* fw link & error stats meaningful only on the first slice */
3006 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3007 		if (sc->link_state != stats->link_up) {
3008 			sc->link_state = stats->link_up;
3009 			if (sc->link_state) {
3010 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3011 				sc->ifp->if_baudrate = IF_Gbps(10UL);
3012 				if (mxge_verbose)
3013 					device_printf(sc->dev, "link up\n");
3014 			} else {
3015 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3016 				sc->ifp->if_baudrate = 0;
3017 				if (mxge_verbose)
3018 					device_printf(sc->dev, "link down\n");
3019 			}
3020 			sc->need_media_probe = 1;
3021 		}
3022 		if (sc->rdma_tags_available !=
3023 		    be32toh(stats->rdma_tags_available)) {
3024 			sc->rdma_tags_available =
3025 				be32toh(stats->rdma_tags_available);
3026 			device_printf(sc->dev, "RDMA timed out! %d tags "
3027 				      "left\n", sc->rdma_tags_available);
3028 		}
3029 
3030 		if (stats->link_down) {
3031 			sc->down_cnt += stats->link_down;
3032 			sc->link_state = 0;
3033 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3034 		}
3035 	}
3036 
3037 	/* check to see if we have rx token to pass back */
3038 	if (valid & 0x1)
3039 	    *ss->irq_claim = be32toh(3);
3040 	*(ss->irq_claim + 1) = be32toh(3);
3041 }
3042 
3043 static void
3044 mxge_init(void *arg)
3045 {
3046 }
3047 
3048 
3049 
3050 static void
3051 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3052 {
3053 	struct lro_entry *lro_entry;
3054 	int i;
3055 
3056 	while (!SLIST_EMPTY(&ss->lro_free)) {
3057 		lro_entry = SLIST_FIRST(&ss->lro_free);
3058 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3059 		free(lro_entry, M_DEVBUF);
3060 	}
3061 
3062 	for (i = 0; i <= ss->rx_big.mask; i++) {
3063 		if (ss->rx_big.info[i].m == NULL)
3064 			continue;
3065 		bus_dmamap_unload(ss->rx_big.dmat,
3066 				  ss->rx_big.info[i].map);
3067 		m_freem(ss->rx_big.info[i].m);
3068 		ss->rx_big.info[i].m = NULL;
3069 	}
3070 
3071 	for (i = 0; i <= ss->rx_small.mask; i++) {
3072 		if (ss->rx_small.info[i].m == NULL)
3073 			continue;
3074 		bus_dmamap_unload(ss->rx_small.dmat,
3075 				  ss->rx_small.info[i].map);
3076 		m_freem(ss->rx_small.info[i].m);
3077 		ss->rx_small.info[i].m = NULL;
3078 	}
3079 
3080 	/* transmit ring used only on the first slice */
3081 	if (ss->tx.info == NULL)
3082 		return;
3083 
3084 	for (i = 0; i <= ss->tx.mask; i++) {
3085 		ss->tx.info[i].flag = 0;
3086 		if (ss->tx.info[i].m == NULL)
3087 			continue;
3088 		bus_dmamap_unload(ss->tx.dmat,
3089 				  ss->tx.info[i].map);
3090 		m_freem(ss->tx.info[i].m);
3091 		ss->tx.info[i].m = NULL;
3092 	}
3093 }
3094 
3095 static void
3096 mxge_free_mbufs(mxge_softc_t *sc)
3097 {
3098 	int slice;
3099 
3100 	for (slice = 0; slice < sc->num_slices; slice++)
3101 		mxge_free_slice_mbufs(&sc->ss[slice]);
3102 }
3103 
3104 static void
3105 mxge_free_slice_rings(struct mxge_slice_state *ss)
3106 {
3107 	int i;
3108 
3109 
3110 	if (ss->rx_done.entry != NULL)
3111 		mxge_dma_free(&ss->rx_done.dma);
3112 	ss->rx_done.entry = NULL;
3113 
3114 	if (ss->tx.req_bytes != NULL)
3115 		free(ss->tx.req_bytes, M_DEVBUF);
3116 	ss->tx.req_bytes = NULL;
3117 
3118 	if (ss->tx.seg_list != NULL)
3119 		free(ss->tx.seg_list, M_DEVBUF);
3120 	ss->tx.seg_list = NULL;
3121 
3122 	if (ss->rx_small.shadow != NULL)
3123 		free(ss->rx_small.shadow, M_DEVBUF);
3124 	ss->rx_small.shadow = NULL;
3125 
3126 	if (ss->rx_big.shadow != NULL)
3127 		free(ss->rx_big.shadow, M_DEVBUF);
3128 	ss->rx_big.shadow = NULL;
3129 
3130 	if (ss->tx.info != NULL) {
3131 		if (ss->tx.dmat != NULL) {
3132 			for (i = 0; i <= ss->tx.mask; i++) {
3133 				bus_dmamap_destroy(ss->tx.dmat,
3134 						   ss->tx.info[i].map);
3135 			}
3136 			bus_dma_tag_destroy(ss->tx.dmat);
3137 		}
3138 		free(ss->tx.info, M_DEVBUF);
3139 	}
3140 	ss->tx.info = NULL;
3141 
3142 	if (ss->rx_small.info != NULL) {
3143 		if (ss->rx_small.dmat != NULL) {
3144 			for (i = 0; i <= ss->rx_small.mask; i++) {
3145 				bus_dmamap_destroy(ss->rx_small.dmat,
3146 						   ss->rx_small.info[i].map);
3147 			}
3148 			bus_dmamap_destroy(ss->rx_small.dmat,
3149 					   ss->rx_small.extra_map);
3150 			bus_dma_tag_destroy(ss->rx_small.dmat);
3151 		}
3152 		free(ss->rx_small.info, M_DEVBUF);
3153 	}
3154 	ss->rx_small.info = NULL;
3155 
3156 	if (ss->rx_big.info != NULL) {
3157 		if (ss->rx_big.dmat != NULL) {
3158 			for (i = 0; i <= ss->rx_big.mask; i++) {
3159 				bus_dmamap_destroy(ss->rx_big.dmat,
3160 						   ss->rx_big.info[i].map);
3161 			}
3162 			bus_dmamap_destroy(ss->rx_big.dmat,
3163 					   ss->rx_big.extra_map);
3164 			bus_dma_tag_destroy(ss->rx_big.dmat);
3165 		}
3166 		free(ss->rx_big.info, M_DEVBUF);
3167 	}
3168 	ss->rx_big.info = NULL;
3169 }
3170 
3171 static void
3172 mxge_free_rings(mxge_softc_t *sc)
3173 {
3174 	int slice;
3175 
3176 	for (slice = 0; slice < sc->num_slices; slice++)
3177 		mxge_free_slice_rings(&sc->ss[slice]);
3178 }
3179 
3180 static int
3181 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3182 		       int tx_ring_entries)
3183 {
3184 	mxge_softc_t *sc = ss->sc;
3185 	size_t bytes;
3186 	int err, i;
3187 
3188 	err = ENOMEM;
3189 
3190 	/* allocate per-slice receive resources */
3191 
3192 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3193 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3194 
3195 	/* allocate the rx shadow rings */
3196 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3197 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3198 	if (ss->rx_small.shadow == NULL)
3199 		return err;
3200 
3201 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3202 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3203 	if (ss->rx_big.shadow == NULL)
3204 		return err;
3205 
3206 	/* allocate the rx host info rings */
3207 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3208 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3209 	if (ss->rx_small.info == NULL)
3210 		return err;
3211 
3212 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3213 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3214 	if (ss->rx_big.info == NULL)
3215 		return err;
3216 
3217 	/* allocate the rx busdma resources */
3218 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3219 				 1,			/* alignment */
3220 				 4096,			/* boundary */
3221 				 BUS_SPACE_MAXADDR,	/* low */
3222 				 BUS_SPACE_MAXADDR,	/* high */
3223 				 NULL, NULL,		/* filter */
3224 				 MHLEN,			/* maxsize */
3225 				 1,			/* num segs */
3226 				 MHLEN,			/* maxsegsize */
3227 				 BUS_DMA_ALLOCNOW,	/* flags */
3228 				 NULL, NULL,		/* lock */
3229 				 &ss->rx_small.dmat);	/* tag */
3230 	if (err != 0) {
3231 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3232 			      err);
3233 		return err;
3234 	}
3235 
3236 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3237 				 1,			/* alignment */
3238 #if MXGE_VIRT_JUMBOS
3239 				 4096,			/* boundary */
3240 #else
3241 				 0,			/* boundary */
3242 #endif
3243 				 BUS_SPACE_MAXADDR,	/* low */
3244 				 BUS_SPACE_MAXADDR,	/* high */
3245 				 NULL, NULL,		/* filter */
3246 				 3*4096,		/* maxsize */
3247 #if MXGE_VIRT_JUMBOS
3248 				 3,			/* num segs */
3249 				 4096,			/* maxsegsize*/
3250 #else
3251 				 1,			/* num segs */
3252 				 MJUM9BYTES,		/* maxsegsize*/
3253 #endif
3254 				 BUS_DMA_ALLOCNOW,	/* flags */
3255 				 NULL, NULL,		/* lock */
3256 				 &ss->rx_big.dmat);	/* tag */
3257 	if (err != 0) {
3258 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3259 			      err);
3260 		return err;
3261 	}
3262 	for (i = 0; i <= ss->rx_small.mask; i++) {
3263 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3264 					&ss->rx_small.info[i].map);
3265 		if (err != 0) {
3266 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3267 				      err);
3268 			return err;
3269 		}
3270 	}
3271 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3272 				&ss->rx_small.extra_map);
3273 	if (err != 0) {
3274 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3275 			      err);
3276 		return err;
3277 	}
3278 
3279 	for (i = 0; i <= ss->rx_big.mask; i++) {
3280 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3281 					&ss->rx_big.info[i].map);
3282 		if (err != 0) {
3283 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3284 				      err);
3285 			return err;
3286 		}
3287 	}
3288 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3289 				&ss->rx_big.extra_map);
3290 	if (err != 0) {
3291 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3292 			      err);
3293 		return err;
3294 	}
3295 
3296 	/* now allocate TX resources */
3297 
3298 #ifndef IFNET_BUF_RING
3299 	/* only use a single TX ring for now */
3300 	if (ss != ss->sc->ss)
3301 		return 0;
3302 #endif
3303 
3304 	ss->tx.mask = tx_ring_entries - 1;
3305 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3306 
3307 
3308 	/* allocate the tx request copy block */
3309 	bytes = 8 +
3310 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3311 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3312 	if (ss->tx.req_bytes == NULL)
3313 		return err;
3314 	/* ensure req_list entries are aligned to 8 bytes */
3315 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3316 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3317 
3318 	/* allocate the tx busdma segment list */
3319 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3320 	ss->tx.seg_list = (bus_dma_segment_t *)
3321 		malloc(bytes, M_DEVBUF, M_WAITOK);
3322 	if (ss->tx.seg_list == NULL)
3323 		return err;
3324 
3325 	/* allocate the tx host info ring */
3326 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3327 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3328 	if (ss->tx.info == NULL)
3329 		return err;
3330 
3331 	/* allocate the tx busdma resources */
3332 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3333 				 1,			/* alignment */
3334 				 sc->tx_boundary,	/* boundary */
3335 				 BUS_SPACE_MAXADDR,	/* low */
3336 				 BUS_SPACE_MAXADDR,	/* high */
3337 				 NULL, NULL,		/* filter */
3338 				 65536 + 256,		/* maxsize */
3339 				 ss->tx.max_desc - 2,	/* num segs */
3340 				 sc->tx_boundary,	/* maxsegsz */
3341 				 BUS_DMA_ALLOCNOW,	/* flags */
3342 				 NULL, NULL,		/* lock */
3343 				 &ss->tx.dmat);		/* tag */
3344 
3345 	if (err != 0) {
3346 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3347 			      err);
3348 		return err;
3349 	}
3350 
3351 	/* now use these tags to setup dmamaps for each slot
3352 	   in the ring */
3353 	for (i = 0; i <= ss->tx.mask; i++) {
3354 		err = bus_dmamap_create(ss->tx.dmat, 0,
3355 					&ss->tx.info[i].map);
3356 		if (err != 0) {
3357 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3358 				      err);
3359 			return err;
3360 		}
3361 	}
3362 	return 0;
3363 
3364 }
3365 
3366 static int
3367 mxge_alloc_rings(mxge_softc_t *sc)
3368 {
3369 	mxge_cmd_t cmd;
3370 	int tx_ring_size;
3371 	int tx_ring_entries, rx_ring_entries;
3372 	int err, slice;
3373 
3374 	/* get ring sizes */
3375 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3376 	tx_ring_size = cmd.data0;
3377 	if (err != 0) {
3378 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3379 		goto abort;
3380 	}
3381 
3382 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3383 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3384 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3385 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3386 	IFQ_SET_READY(&sc->ifp->if_snd);
3387 
3388 	for (slice = 0; slice < sc->num_slices; slice++) {
3389 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3390 					     rx_ring_entries,
3391 					     tx_ring_entries);
3392 		if (err != 0)
3393 			goto abort;
3394 	}
3395 	return 0;
3396 
3397 abort:
3398 	mxge_free_rings(sc);
3399 	return err;
3400 
3401 }
3402 
3403 
3404 static void
3405 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3406 {
3407 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3408 
3409 	if (bufsize < MCLBYTES) {
3410 		/* easy, everything fits in a single buffer */
3411 		*big_buf_size = MCLBYTES;
3412 		*cl_size = MCLBYTES;
3413 		*nbufs = 1;
3414 		return;
3415 	}
3416 
3417 	if (bufsize < MJUMPAGESIZE) {
3418 		/* still easy, everything still fits in a single buffer */
3419 		*big_buf_size = MJUMPAGESIZE;
3420 		*cl_size = MJUMPAGESIZE;
3421 		*nbufs = 1;
3422 		return;
3423 	}
3424 #if MXGE_VIRT_JUMBOS
3425 	/* now we need to use virtually contiguous buffers */
3426 	*cl_size = MJUM9BYTES;
3427 	*big_buf_size = 4096;
3428 	*nbufs = mtu / 4096 + 1;
3429 	/* needs to be a power of two, so round up */
3430 	if (*nbufs == 3)
3431 		*nbufs = 4;
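	/* e.g. a 9000 byte MTU gives bufsize 9020 (9000 + 14 + 4 + 2),
	 * so nbufs = 9000/4096 + 1 = 3, rounded up to 4 */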
3432 #else
3433 	*cl_size = MJUM9BYTES;
3434 	*big_buf_size = MJUM9BYTES;
3435 	*nbufs = 1;
3436 #endif
3437 }
3438 
3439 static int
3440 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3441 {
3442 	mxge_softc_t *sc;
3443 	mxge_cmd_t cmd;
3444 	bus_dmamap_t map;
3445 	struct lro_entry *lro_entry;
3446 	int err, i, slice;
3447 
3448 
3449 	sc = ss->sc;
3450 	slice = ss - sc->ss;
3451 
3452 	SLIST_INIT(&ss->lro_free);
3453 	SLIST_INIT(&ss->lro_active);
3454 
3455 	for (i = 0; i < sc->lro_cnt; i++) {
3456 		lro_entry = (struct lro_entry *)
3457 			malloc(sizeof (*lro_entry), M_DEVBUF,
3458 			       M_NOWAIT | M_ZERO);
3459 		if (lro_entry == NULL) {
3460 			sc->lro_cnt = i;
3461 			break;
3462 		}
3463 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3464 	}
3465 	/* get the lanai pointers to the send and receive rings */
3466 
3467 	err = 0;
3468 #ifndef IFNET_BUF_RING
3469 	/* We currently only send from the first slice */
3470 	if (slice == 0) {
3471 #endif
3472 		cmd.data0 = slice;
3473 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3474 		ss->tx.lanai =
3475 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3476 		ss->tx.send_go = (volatile uint32_t *)
3477 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3478 		ss->tx.send_stop = (volatile uint32_t *)
3479 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3480 #ifndef IFNET_BUF_RING
3481 	}
3482 #endif
3483 	cmd.data0 = slice;
3484 	err |= mxge_send_cmd(sc,
3485 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3486 	ss->rx_small.lanai =
3487 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3488 	cmd.data0 = slice;
3489 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3490 	ss->rx_big.lanai =
3491 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3492 
3493 	if (err != 0) {
3494 		device_printf(sc->dev,
3495 			      "failed to get ring sizes or locations\n");
3496 		return EIO;
3497 	}
3498 
3499 	/* stock receive rings */
3500 	for (i = 0; i <= ss->rx_small.mask; i++) {
3501 		map = ss->rx_small.info[i].map;
3502 		err = mxge_get_buf_small(ss, map, i);
3503 		if (err) {
3504 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3505 				      i, ss->rx_small.mask + 1);
3506 			return ENOMEM;
3507 		}
3508 	}
3509 	for (i = 0; i <= ss->rx_big.mask; i++) {
3510 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3511 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3512 	}
3513 	ss->rx_big.nbufs = nbufs;
3514 	ss->rx_big.cl_size = cl_size;
3515 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3516 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3517 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3518 		map = ss->rx_big.info[i].map;
3519 		err = mxge_get_buf_big(ss, map, i);
3520 		if (err) {
3521 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3522 				      i, ss->rx_big.mask + 1);
3523 			return ENOMEM;
3524 		}
3525 	}
3526 	return 0;
3527 }
3528 
3529 static int
3530 mxge_open(mxge_softc_t *sc)
3531 {
3532 	mxge_cmd_t cmd;
3533 	int err, big_bytes, nbufs, slice, cl_size, i;
3534 	bus_addr_t bus;
3535 	volatile uint8_t *itable;
3536 	struct mxge_slice_state *ss;
3537 
3538 	/* Copy the MAC address in case it was overridden */
3539 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3540 
3541 	err = mxge_reset(sc, 1);
3542 	if (err != 0) {
3543 		device_printf(sc->dev, "failed to reset\n");
3544 		return EIO;
3545 	}
3546 
3547 	if (sc->num_slices > 1) {
3548 		/* setup the indirection table */
3549 		cmd.data0 = sc->num_slices;
3550 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3551 				    &cmd);
3552 
3553 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3554 				     &cmd);
3555 		if (err != 0) {
3556 			device_printf(sc->dev,
3557 				      "failed to setup rss tables\n");
3558 			return err;
3559 		}
3560 
3561 		/* just enable an identity mapping */
3562 		itable = sc->sram + cmd.data0;
3563 		for (i = 0; i < sc->num_slices; i++)
3564 			itable[i] = (uint8_t)i;
3565 
3566 		cmd.data0 = 1;
3567 		cmd.data1 = mxge_rss_hash_type;
3568 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3569 		if (err != 0) {
3570 			device_printf(sc->dev, "failed to enable slices\n");
3571 			return err;
3572 		}
3573 	}
3574 
3575 
3576 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3577 
3578 	cmd.data0 = nbufs;
3579 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3580 			    &cmd);
3581 	/* error is only meaningful if we're trying to set
3582 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3583 	if (err && nbufs > 1) {
3584 		device_printf(sc->dev,
3585 			      "Failed to set alway-use-n to %d\n",
3586 			      nbufs);
3587 		return EIO;
3588 	}
3589 	/* Give the firmware the mtu and the big and small buffer
3590 	   sizes.  The firmware wants the big buf size to be a power
3591 	   of two. Luckily, FreeBSD's clusters are powers of two */
3592 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3593 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3594 	cmd.data0 = MHLEN - MXGEFW_PAD;
3595 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3596 			     &cmd);
3597 	cmd.data0 = big_bytes;
3598 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3599 
3600 	if (err != 0) {
3601 		device_printf(sc->dev, "failed to setup params\n");
3602 		goto abort;
3603 	}
3604 
3605 	/* Now give the firmware the pointer to the stats block */
3606 	for (slice = 0;
3607 #ifdef IFNET_BUF_RING
3608 	     slice < sc->num_slices;
3609 #else
3610 	     slice < 1;
3611 #endif
3612 	     slice++) {
3613 		ss = &sc->ss[slice];
3614 		cmd.data0 =
3615 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3616 		cmd.data1 =
3617 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3618 		cmd.data2 = sizeof(struct mcp_irq_data);
3619 		cmd.data2 |= (slice << 16);
3620 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3621 	}
3622 
3623 	if (err != 0) {
3624 		bus = sc->ss->fw_stats_dma.bus_addr;
3625 		bus += offsetof(struct mcp_irq_data, send_done_count);
3626 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3627 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3628 		err = mxge_send_cmd(sc,
3629 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3630 				    &cmd);
3631 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3632 		sc->fw_multicast_support = 0;
3633 	} else {
3634 		sc->fw_multicast_support = 1;
3635 	}
3636 
3637 	if (err != 0) {
3638 		device_printf(sc->dev, "failed to setup params\n");
3639 		goto abort;
3640 	}
3641 
3642 	for (slice = 0; slice < sc->num_slices; slice++) {
3643 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3644 		if (err != 0) {
3645 			device_printf(sc->dev, "couldn't open slice %d\n",
3646 				      slice);
3647 			goto abort;
3648 		}
3649 	}
3650 
3651 	/* Finally, start the firmware running */
3652 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3653 	if (err) {
3654 		device_printf(sc->dev, "Couldn't bring up link\n");
3655 		goto abort;
3656 	}
3657 #ifdef IFNET_BUF_RING
3658 	for (slice = 0; slice < sc->num_slices; slice++) {
3659 		ss = &sc->ss[slice];
3660 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3661 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3662 	}
3663 #endif
3664 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3665 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3666 
3667 	return 0;
3668 
3669 
3670 abort:
3671 	mxge_free_mbufs(sc);
3672 
3673 	return err;
3674 }
3675 
3676 static int
3677 mxge_close(mxge_softc_t *sc, int down)
3678 {
3679 	mxge_cmd_t cmd;
3680 	int err, old_down_cnt;
3681 #ifdef IFNET_BUF_RING
3682 	struct mxge_slice_state *ss;
3683 	int slice;
3684 #endif
3685 
3686 #ifdef IFNET_BUF_RING
3687 	for (slice = 0; slice < sc->num_slices; slice++) {
3688 		ss = &sc->ss[slice];
3689 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3690 	}
3691 #endif
3692 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3693 	if (!down) {
3694 		old_down_cnt = sc->down_cnt;
3695 		wmb();
3696 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3697 		if (err) {
3698 			device_printf(sc->dev,
3699 				      "Couldn't bring down link\n");
3700 		}
3701 		if (old_down_cnt == sc->down_cnt) {
3702 			/* wait for down irq */
3703 			DELAY(10 * sc->intr_coal_delay);
3704 		}
3705 		wmb();
3706 		if (old_down_cnt == sc->down_cnt) {
3707 			device_printf(sc->dev, "never got down irq\n");
3708 		}
3709 	}
3710 	mxge_free_mbufs(sc);
3711 
3712 	return 0;
3713 }
3714 
3715 static void
3716 mxge_setup_cfg_space(mxge_softc_t *sc)
3717 {
3718 	device_t dev = sc->dev;
3719 	int reg;
3720 	uint16_t cmd, lnk, pectl;
3721 
3722 	/* find the PCIe link width and set max read request to 4KB*/
3723 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3724 		lnk = pci_read_config(dev, reg + 0x12, 2);
3725 		sc->link_width = (lnk >> 4) & 0x3f;
3726 
3727 		if (sc->pectl == 0) {
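			/* bits 14:12 of the PCIe device control
			 * register encode the max read request size;
			 * the value 5 selects 4096 bytes */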
3728 			pectl = pci_read_config(dev, reg + 0x8, 2);
3729 			pectl = (pectl & ~0x7000) | (5 << 12);
3730 			pci_write_config(dev, reg + 0x8, pectl, 2);
3731 			sc->pectl = pectl;
3732 		} else {
3733 			/* restore saved pectl after watchdog reset */
3734 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3735 		}
3736 	}
3737 
3738 	/* Enable DMA and Memory space access */
3739 	pci_enable_busmaster(dev);
3740 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3741 	cmd |= PCIM_CMD_MEMEN;
3742 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3743 }
3744 
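/*
 * Read the NIC's reboot status register through the indirect
 * access window in the vendor-specific PCI capability.
 */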
3745 static uint32_t
3746 mxge_read_reboot(mxge_softc_t *sc)
3747 {
3748 	device_t dev = sc->dev;
3749 	uint32_t vs;
3750 
3751 	/* find the vendor specific offset */
3752 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3753 		device_printf(sc->dev,
3754 			      "could not find vendor specific offset\n");
3755 		return (uint32_t)-1;
3756 	}
3757 	/* enable read32 mode */
3758 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3759 	/* tell NIC which register to read */
3760 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3761 	return (pci_read_config(dev, vs + 0x14, 4));
3762 }
3763 
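/*
 * Handle a watchdog-detected hang.  If the NIC rebooted (config
 * space reads back as all-ones, or busmastering is disabled), we
 * quiesce TX, restore PCI config space, reload the firmware and
 * reopen the interface; otherwise we leave the NIC alone.
 */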
3764 static void
3765 mxge_watchdog_reset(mxge_softc_t *sc)
3766 {
3767 	struct pci_devinfo *dinfo;
3768 	struct mxge_slice_state *ss;
3769 	int err, running, s, num_tx_slices = 1;
3770 	uint32_t reboot;
3771 	uint16_t cmd;
3772 
3773 	err = ENXIO;
3774 
3775 	device_printf(sc->dev, "Watchdog reset!\n");
3776 
3777 	/*
3778 	 * check to see if the NIC rebooted.  If it did, then all of
3779 	 * PCI config space has been reset, and things like the
3780 	 * busmaster bit will be zero.  If this is the case, then we
3781 	 * must restore PCI config space before the NIC can be used
3782 	 * again
3783 	 */
3784 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3785 	if (cmd == 0xffff) {
3786 		/*
3787 		 * maybe the watchdog caught the NIC rebooting; wait
3788 		 * up to 100ms for it to finish.  If it does not come
3789 		 * back, then give up
3790 		 */
3791 		DELAY(1000*100);
3792 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3793 		if (cmd == 0xffff) {
3794 			device_printf(sc->dev, "NIC disappeared!\n");
3795 		}
3796 	}
3797 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3798 		/* print the reboot status */
3799 		reboot = mxge_read_reboot(sc);
3800 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3801 			      reboot);
3802 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3803 		if (running) {
3804 
3805 			/*
3806 			 * quiesce NIC so that TX routines will not try to
3807 			 * xmit after restoration of BAR
3808 			 */
3809 
3810 			/* Mark the link as down */
3811 			if (sc->link_state) {
3812 				sc->link_state = 0;
3813 				if_link_state_change(sc->ifp,
3814 						     LINK_STATE_DOWN);
3815 			}
3816 #ifdef IFNET_BUF_RING
3817 			num_tx_slices = sc->num_slices;
3818 #endif
3819 			/* grab all TX locks to ensure no transmits occur */
3820 			for (s = 0; s < num_tx_slices; s++) {
3821 				ss = &sc->ss[s];
3822 				mtx_lock(&ss->tx.mtx);
3823 			}
3824 			mxge_close(sc, 1);
3825 		}
3826 		/* restore PCI configuration space */
3827 		dinfo = device_get_ivars(sc->dev);
3828 		pci_cfg_restore(sc->dev, dinfo);
3829 
3830 		/* and redo any changes we made to our config space */
3831 		mxge_setup_cfg_space(sc);
3832 
3833 		/* reload f/w */
3834 		err = mxge_load_firmware(sc, 0);
3835 		if (err) {
3836 			device_printf(sc->dev,
3837 				      "Unable to re-load f/w\n");
3838 		}
3839 		if (running) {
3840 			if (!err)
3841 				err = mxge_open(sc);
3842 			/* release all TX locks */
3843 			for (s = 0; s < num_tx_slices; s++) {
3844 				ss = &sc->ss[s];
3845 #ifdef IFNET_BUF_RING
3846 				mxge_start_locked(ss);
3847 #endif
3848 				mtx_unlock(&ss->tx.mtx);
3849 			}
3850 		}
3851 		sc->watchdog_resets++;
3852 	} else {
3853 		device_printf(sc->dev,
3854 			      "NIC did not reboot, not resetting\n");
3855 		err = 0;
3856 	}
3857 	if (err) {
3858 		device_printf(sc->dev, "watchdog reset failed\n");
3859 	} else {
3860 		if (sc->dying == 2)
3861 			sc->dying = 0;
3862 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3863 	}
3864 }
3865 
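/* Taskqueue wrapper: run the watchdog reset under the driver lock */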
3866 static void
3867 mxge_watchdog_task(void *arg, int pending)
3868 {
3869 	mxge_softc_t *sc = arg;
3870 
3871 
3872 	mtx_lock(&sc->driver_mtx);
3873 	mxge_watchdog_reset(sc);
3874 	mtx_unlock(&sc->driver_mtx);
3875 }
3876 
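/* Dump the state of a TX ring which appears to be stuck */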
3877 static void
3878 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3879 {
3880 	tx = &sc->ss[slice].tx;
3881 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3882 	device_printf(sc->dev,
3883 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3884 		      tx->req, tx->done, tx->queue_active);
3885 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3886 			      tx->activate, tx->deactivate);
3887 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3888 		      tx->pkt_done,
3889 		      be32toh(sc->ss->fw_stats->send_done_count));
3890 }
3891 
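/*
 * Called from mxge_tick: a slice with transmits outstanding that
 * made no progress in the last interval is either blocked by flow
 * control (the dropped_pause counter advanced) or wedged, in which
 * case the watchdog reset task is queued.
 */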
3892 static int
3893 mxge_watchdog(mxge_softc_t *sc)
3894 {
3895 	mxge_tx_ring_t *tx;
3896 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3897 	int i, err = 0;
3898 
3899 	/* see if we have outstanding transmits, which
3900 	   have been pending for more than mxge_ticks */
3901 	for (i = 0;
3902 #ifdef IFNET_BUF_RING
3903 	     (i < sc->num_slices) && (err == 0);
3904 #else
3905 	     (i < 1) && (err == 0);
3906 #endif
3907 	     i++) {
3908 		tx = &sc->ss[i].tx;
3909 		if (tx->req != tx->done &&
3910 		    tx->watchdog_req != tx->watchdog_done &&
3911 		    tx->done == tx->watchdog_done) {
3912 			/* check for pause blocking before resetting */
3913 			if (tx->watchdog_rx_pause == rx_pause) {
3914 				mxge_warn_stuck(sc, tx, i);
3915 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3916 				return (ENXIO);
3917 			}
3918 			else
3919 				device_printf(sc->dev, "Flow control blocking "
3920 					      "xmits, check link partner\n");
3921 		}
3922 
3923 		tx->watchdog_req = tx->req;
3924 		tx->watchdog_done = tx->done;
3925 		tx->watchdog_rx_pause = rx_pause;
3926 	}
3927 
3928 	if (sc->need_media_probe)
3929 		mxge_media_probe(sc);
3930 	return (err);
3931 }
3932 
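/*
 * Fold the per-slice counters into the ifnet statistics and return
 * the number of packets moved since the last call, so the caller
 * can tell whether the NIC has been idle.
 */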
3933 static u_long
3934 mxge_update_stats(mxge_softc_t *sc)
3935 {
3936 	struct mxge_slice_state *ss;
3937 	u_long pkts = 0;
3938 	u_long ipackets = 0;
3939 	u_long opackets = 0;
3940 #ifdef IFNET_BUF_RING
3941 	u_long obytes = 0;
3942 	u_long omcasts = 0;
3943 	u_long odrops = 0;
3944 #endif
3945 	u_long oerrors = 0;
3946 	int slice;
3947 
3948 	for (slice = 0; slice < sc->num_slices; slice++) {
3949 		ss = &sc->ss[slice];
3950 		ipackets += ss->ipackets;
3951 		opackets += ss->opackets;
3952 #ifdef IFNET_BUF_RING
3953 		obytes += ss->obytes;
3954 		omcasts += ss->omcasts;
3955 		odrops += ss->tx.br->br_drops;
3956 #endif
3957 		oerrors += ss->oerrors;
3958 	}
3959 	pkts = (ipackets - sc->ifp->if_ipackets);
3960 	pkts += (opackets - sc->ifp->if_opackets);
3961 	sc->ifp->if_ipackets = ipackets;
3962 	sc->ifp->if_opackets = opackets;
3963 #ifdef IFNET_BUF_RING
3964 	sc->ifp->if_obytes = obytes;
3965 	sc->ifp->if_omcasts = omcasts;
3966 	sc->ifp->if_snd.ifq_drops = odrops;
3967 #endif
3968 	sc->ifp->if_oerrors = oerrors;
3969 	return pkts;
3970 }
3971 
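/*
 * Periodic housekeeping: refresh the interface statistics, run the
 * TX watchdog, make sure an idle NIC has not fallen off the bus,
 * and poll four times less often while there is no traffic.
 */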
3972 static void
3973 mxge_tick(void *arg)
3974 {
3975 	mxge_softc_t *sc = arg;
3976 	u_long pkts = 0;
3977 	int err = 0;
3978 	int running, ticks;
3979 	uint16_t cmd;
3980 
3981 	ticks = mxge_ticks;
3982 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3983 	if (running) {
3984 		/* aggregate stats from different slices */
3985 		pkts = mxge_update_stats(sc);
3986 		if (!sc->watchdog_countdown) {
3987 			err = mxge_watchdog(sc);
3988 			sc->watchdog_countdown = 4;
3989 		}
3990 		sc->watchdog_countdown--;
3991 	}
3992 	if (pkts == 0) {
3993 		/* ensure NIC did not suffer h/w fault while idle */
3994 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3995 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3996 			sc->dying = 2;
3997 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3998 			err = ENXIO;
3999 		}
4000 		/* look less often if NIC is idle */
4001 		ticks *= 4;
4002 	}
4003 
4004 	if (err == 0)
4005 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4006 
4007 }
4008 
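/* The media on this NIC is fixed; changing it is not supported */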
4009 static int
4010 mxge_media_change(struct ifnet *ifp)
4011 {
4012 	return EINVAL;
4013 }
4014 
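/*
 * Validate the requested MTU against the firmware's limit; if the
 * interface is running, close and reopen it so the rings are sized
 * for the new MTU, reverting to the old MTU if the reopen fails.
 */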
4015 static int
4016 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4017 {
4018 	struct ifnet *ifp = sc->ifp;
4019 	int real_mtu, old_mtu;
4020 	int err = 0;
4021 
4022 
4023 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4024 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4025 		return EINVAL;
4026 	mtx_lock(&sc->driver_mtx);
4027 	old_mtu = ifp->if_mtu;
4028 	ifp->if_mtu = mtu;
4029 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4030 		mxge_close(sc, 0);
4031 		err = mxge_open(sc);
4032 		if (err != 0) {
4033 			ifp->if_mtu = old_mtu;
4034 			mxge_close(sc, 0);
4035 			(void) mxge_open(sc);
4036 		}
4037 	}
4038 	mtx_unlock(&sc->driver_mtx);
4039 	return err;
4040 }
4041 
4042 static void
4043 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4044 {
4045 	mxge_softc_t *sc = ifp->if_softc;
4046 
4047 
4048 	if (sc == NULL)
4049 		return;
4050 	ifmr->ifm_status = IFM_AVALID;
4051 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4052 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4053 	ifmr->ifm_active |= sc->current_media;
4054 }
4055 
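/*
 * Interface ioctl handler: addresses, MTU, interface flags,
 * multicast filters, capability toggles and media queries.
 */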
4056 static int
4057 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4058 {
4059 	mxge_softc_t *sc = ifp->if_softc;
4060 	struct ifreq *ifr = (struct ifreq *)data;
4061 	int err, mask;
4062 
4063 	err = 0;
4064 	switch (command) {
4065 	case SIOCSIFADDR:
4066 	case SIOCGIFADDR:
4067 		err = ether_ioctl(ifp, command, data);
4068 		break;
4069 
4070 	case SIOCSIFMTU:
4071 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4072 		break;
4073 
4074 	case SIOCSIFFLAGS:
4075 		mtx_lock(&sc->driver_mtx);
4076 		if (sc->dying) {
4077 			mtx_unlock(&sc->driver_mtx);
4078 			return EINVAL;
4079 		}
4080 		if (ifp->if_flags & IFF_UP) {
4081 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4082 				err = mxge_open(sc);
4083 			} else {
4084 				/* take care of promisc and allmulti
4085 				   flag changes */
4086 				mxge_change_promisc(sc,
4087 						    ifp->if_flags & IFF_PROMISC);
4088 				mxge_set_multicast_list(sc);
4089 			}
4090 		} else {
4091 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4092 				mxge_close(sc, 0);
4093 			}
4094 		}
4095 		mtx_unlock(&sc->driver_mtx);
4096 		break;
4097 
4098 	case SIOCADDMULTI:
4099 	case SIOCDELMULTI:
4100 		mtx_lock(&sc->driver_mtx);
4101 		mxge_set_multicast_list(sc);
4102 		mtx_unlock(&sc->driver_mtx);
4103 		break;
4104 
4105 	case SIOCSIFCAP:
4106 		mtx_lock(&sc->driver_mtx);
4107 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4108 		if (mask & IFCAP_TXCSUM) {
4109 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4110 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4111 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4112 						      | CSUM_TSO);
4113 			} else {
4114 				ifp->if_capenable |= IFCAP_TXCSUM;
4115 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4116 			}
4117 		} else if (mask & IFCAP_RXCSUM) {
4118 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4119 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4120 				sc->csum_flag = 0;
4121 			} else {
4122 				ifp->if_capenable |= IFCAP_RXCSUM;
4123 				sc->csum_flag = 1;
4124 			}
4125 		}
4126 		if (mask & IFCAP_TSO4) {
4127 			if (IFCAP_TSO4 & ifp->if_capenable) {
4128 				ifp->if_capenable &= ~IFCAP_TSO4;
4129 				ifp->if_hwassist &= ~CSUM_TSO;
4130 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4131 				ifp->if_capenable |= IFCAP_TSO4;
4132 				ifp->if_hwassist |= CSUM_TSO;
4133 			} else {
4134 				printf("mxge requires tx checksum offload"
4135 				       " be enabled to use TSO\n");
4136 				err = EINVAL;
4137 			}
4138 		}
4139 		if (mask & IFCAP_LRO) {
4140 			if (IFCAP_LRO & ifp->if_capenable)
4141 				err = mxge_change_lro_locked(sc, 0);
4142 			else
4143 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4144 		}
4145 		if (mask & IFCAP_VLAN_HWTAGGING)
4146 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4147 		if (mask & IFCAP_VLAN_HWTSO)
4148 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4149 
4150 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4151 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4152 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4153 
4154 		mtx_unlock(&sc->driver_mtx);
4155 		VLAN_CAPABILITIES(ifp);
4156 
4157 		break;
4158 
4159 	case SIOCGIFMEDIA:
4160 		mtx_lock(&sc->driver_mtx);
4161 		mxge_media_probe(sc);
4162 		mtx_unlock(&sc->driver_mtx);
4163 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4164 				    &sc->media, command);
4165 		break;
4166 
4167 	default:
4168 		err = ENOTTY;
4169 	}
4170 	return err;
4171 }
4172 
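/*
 * Fetch the hw.mxge.* loader tunables and clamp them to sane
 * ranges before they are used.
 */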
4173 static void
4174 mxge_fetch_tunables(mxge_softc_t *sc)
4175 {
4176 
4177 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4178 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4179 			  &mxge_flow_control);
4180 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4181 			  &mxge_intr_coal_delay);
4182 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4183 			  &mxge_nvidia_ecrc_enable);
4184 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4185 			  &mxge_force_firmware);
4186 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4187 			  &mxge_deassert_wait);
4188 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4189 			  &mxge_verbose);
4190 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4191 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4192 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4193 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4194 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4195 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4196 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4197 	if (sc->lro_cnt != 0)
4198 		mxge_lro_cnt = sc->lro_cnt;
4199 
4200 	if (bootverbose)
4201 		mxge_verbose = 1;
4202 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4203 		mxge_intr_coal_delay = 30;
4204 	if (mxge_ticks == 0)
4205 		mxge_ticks = hz / 2;
4206 	sc->pause = mxge_flow_control;
4207 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4208 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4209 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4210 	}
4211 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4212 	    mxge_initial_mtu < ETHER_MIN_LEN)
4213 		mxge_initial_mtu = ETHERMTU_JUMBO;
4214 
4215 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4216 		mxge_throttle = MXGE_MAX_THROTTLE;
4217 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4218 		mxge_throttle = MXGE_MIN_THROTTLE;
4219 	sc->throttle = mxge_throttle;
4220 }
4221 
4222 
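/*
 * Release all per-slice state: firmware stats DMA, buf_rings,
 * TX mutexes and rx completion queues.
 */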
4223 static void
4224 mxge_free_slices(mxge_softc_t *sc)
4225 {
4226 	struct mxge_slice_state *ss;
4227 	int i;
4228 
4229 
4230 	if (sc->ss == NULL)
4231 		return;
4232 
4233 	for (i = 0; i < sc->num_slices; i++) {
4234 		ss = &sc->ss[i];
4235 		if (ss->fw_stats != NULL) {
4236 			mxge_dma_free(&ss->fw_stats_dma);
4237 			ss->fw_stats = NULL;
4238 #ifdef IFNET_BUF_RING
4239 			if (ss->tx.br != NULL) {
4240 				drbr_free(ss->tx.br, M_DEVBUF);
4241 				ss->tx.br = NULL;
4242 			}
4243 #endif
4244 			mtx_destroy(&ss->tx.mtx);
4245 		}
4246 		if (ss->rx_done.entry != NULL) {
4247 			mxge_dma_free(&ss->rx_done.dma);
4248 			ss->rx_done.entry = NULL;
4249 		}
4250 	}
4251 	free(sc->ss, M_DEVBUF);
4252 	sc->ss = NULL;
4253 }
4254 
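/*
 * Allocate per-slice state: an rx completion queue sized from the
 * firmware's rx ring size plus, for each slice which transmits,
 * the firmware stats block, TX mutex and buf_ring.
 */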
4255 static int
4256 mxge_alloc_slices(mxge_softc_t *sc)
4257 {
4258 	mxge_cmd_t cmd;
4259 	struct mxge_slice_state *ss;
4260 	size_t bytes;
4261 	int err, i, max_intr_slots;
4262 
4263 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4264 	if (err != 0) {
4265 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4266 		return err;
4267 	}
4268 	sc->rx_ring_size = cmd.data0;
4269 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4270 
4271 	bytes = sizeof (*sc->ss) * sc->num_slices;
4272 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4273 	if (sc->ss == NULL)
4274 		return (ENOMEM);
4275 	for (i = 0; i < sc->num_slices; i++) {
4276 		ss = &sc->ss[i];
4277 
4278 		ss->sc = sc;
4279 
4280 		/* allocate per-slice rx interrupt queues */
4281 
4282 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4283 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4284 		if (err != 0)
4285 			goto abort;
4286 		ss->rx_done.entry = ss->rx_done.dma.addr;
4287 		bzero(ss->rx_done.entry, bytes);
4288 
4289 		/*
4290 		 * allocate the per-slice firmware stats; stats
4291 		 * (including tx) are used only on the first
4292 		 * slice for now
4293 		 */
4294 #ifndef IFNET_BUF_RING
4295 		if (i > 0)
4296 			continue;
4297 #endif
4298 
4299 		bytes = sizeof (*ss->fw_stats);
4300 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4301 				     sizeof (*ss->fw_stats), 64);
4302 		if (err != 0)
4303 			goto abort;
4304 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4305 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4306 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4307 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4308 #ifdef IFNET_BUF_RING
4309 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4310 					   &ss->tx.mtx);
4311 #endif
4312 	}
4313 
4314 	return (0);
4315 
4316 abort:
4317 	mxge_free_slices(sc);
4318 	return (ENOMEM);
4319 }
4320 
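/*
 * Decide how many slices to use.  Multiple slices require SMP, at
 * least two MSI-X vectors and the RSS firmware; the count is capped
 * by the firmware's queue limit, the MSI-X vector count, the CPU
 * count and the tunable, then rounded down to a power of two.
 */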
4321 static void
4322 mxge_slice_probe(mxge_softc_t *sc)
4323 {
4324 	mxge_cmd_t cmd;
4325 	char *old_fw;
4326 	int msix_cnt, status, max_intr_slots;
4327 
4328 	sc->num_slices = 1;
4329 	/*
4330 	 *  don't enable multiple slices if the tunable disables
4331 	 *  them, or if this is not an SMP system
4332 	 */
4333 
4334 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4335 		return;
4336 
4337 	/* see how many MSI-X interrupts are available */
4338 	msix_cnt = pci_msix_count(sc->dev);
4339 	if (msix_cnt < 2)
4340 		return;
4341 
4342 	/* now load the slice-aware firmware to see what it supports */
4343 	old_fw = sc->fw_name;
4344 	if (old_fw == mxge_fw_aligned)
4345 		sc->fw_name = mxge_fw_rss_aligned;
4346 	else
4347 		sc->fw_name = mxge_fw_rss_unaligned;
4348 	status = mxge_load_firmware(sc, 0);
4349 	if (status != 0) {
4350 		device_printf(sc->dev, "Falling back to a single slice\n");
4351 		return;
4352 	}
4353 
4354 	/* try to send a reset command to the card to see if it
4355 	   is alive */
4356 	memset(&cmd, 0, sizeof (cmd));
4357 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4358 	if (status != 0) {
4359 		device_printf(sc->dev, "failed reset\n");
4360 		goto abort_with_fw;
4361 	}
4362 
4363 	/* get rx ring size */
4364 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4365 	if (status != 0) {
4366 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4367 		goto abort_with_fw;
4368 	}
4369 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4370 
4371 	/* tell it the size of the interrupt queues */
4372 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4373 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4374 	if (status != 0) {
4375 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4376 		goto abort_with_fw;
4377 	}
4378 
4379 	/* ask the maximum number of slices it supports */
4380 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4381 	if (status != 0) {
4382 		device_printf(sc->dev,
4383 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4384 		goto abort_with_fw;
4385 	}
4386 	sc->num_slices = cmd.data0;
4387 	if (sc->num_slices > msix_cnt)
4388 		sc->num_slices = msix_cnt;
4389 
4390 	if (mxge_max_slices == -1) {
4391 		/* cap to number of CPUs in system */
4392 		if (sc->num_slices > mp_ncpus)
4393 			sc->num_slices = mp_ncpus;
4394 	} else {
4395 		if (sc->num_slices > mxge_max_slices)
4396 			sc->num_slices = mxge_max_slices;
4397 	}
4398 	/* make sure it is a power of two */
4399 	while (sc->num_slices & (sc->num_slices - 1))
4400 		sc->num_slices--;
4401 
4402 	if (mxge_verbose)
4403 		device_printf(sc->dev, "using %d slices\n",
4404 			      sc->num_slices);
4405 
4406 	return;
4407 
4408 abort_with_fw:
4409 	sc->fw_name = old_fw;
4410 	(void) mxge_load_firmware(sc, 0);
4411 }
4412 
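/*
 * Allocate the MSI-X table BAR and one vector per slice, then hook
 * each vector to mxge_intr with its slice as the argument.
 */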
4413 static int
4414 mxge_add_msix_irqs(mxge_softc_t *sc)
4415 {
4416 	size_t bytes;
4417 	int count, err, i, rid;
4418 
4419 	rid = PCIR_BAR(2);
4420 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4421 						    &rid, RF_ACTIVE);
4422 
4423 	if (sc->msix_table_res == NULL) {
4424 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4425 		return ENXIO;
4426 	}
4427 
4428 	count = sc->num_slices;
4429 	err = pci_alloc_msix(sc->dev, &count);
4430 	if (err != 0) {
4431 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4432 			      "err = %d \n", sc->num_slices, err);
4433 		goto abort_with_msix_table;
4434 	}
4435 	if (count < sc->num_slices) {
4436 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4437 			      sc->num_slices, count);
4438 		device_printf(sc->dev,
4439 			      "Try setting hw.mxge.max_slices to %d\n",
4440 			      count);
4441 		err = ENOSPC;
4442 		goto abort_with_msix;
4443 	}
4444 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4445 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4446 	if (sc->msix_irq_res == NULL) {
4447 		err = ENOMEM;
4448 		goto abort_with_msix;
4449 	}
4450 
4451 	for (i = 0; i < sc->num_slices; i++) {
4452 		rid = i + 1;
4453 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4454 							  SYS_RES_IRQ,
4455 							  &rid, RF_ACTIVE);
4456 		if (sc->msix_irq_res[i] == NULL) {
4457 			device_printf(sc->dev, "couldn't allocate IRQ res"
4458 				      " for message %d\n", i);
4459 			err = ENXIO;
4460 			goto abort_with_res;
4461 		}
4462 	}
4463 
4464 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4465 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4466 
4467 	for (i = 0; i < sc->num_slices; i++) {
4468 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4469 				     INTR_TYPE_NET | INTR_MPSAFE,
4470 #if __FreeBSD_version > 700030
4471 				     NULL,
4472 #endif
4473 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4474 		if (err != 0) {
4475 			device_printf(sc->dev, "couldn't setup intr for "
4476 				      "message %d\n", i);
4477 			goto abort_with_intr;
4478 		}
4479 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4480 				  sc->msix_ih[i], "s%d", i);
4481 	}
4482 
4483 	if (mxge_verbose) {
4484 		device_printf(sc->dev, "using %d msix IRQs:",
4485 			      sc->num_slices);
4486 		for (i = 0; i < sc->num_slices; i++)
4487 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4488 		printf("\n");
4489 	}
4490 	return (0);
4491 
4492 abort_with_intr:
4493 	for (i = 0; i < sc->num_slices; i++) {
4494 		if (sc->msix_ih[i] != NULL) {
4495 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4496 					  sc->msix_ih[i]);
4497 			sc->msix_ih[i] = NULL;
4498 		}
4499 	}
4500 	free(sc->msix_ih, M_DEVBUF);
4501 
4502 
4503 abort_with_res:
4504 	for (i = 0; i < sc->num_slices; i++) {
4505 		rid = i + 1;
4506 		if (sc->msix_irq_res[i] != NULL)
4507 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4508 					     sc->msix_irq_res[i]);
4509 		sc->msix_irq_res[i] = NULL;
4510 	}
4511 	free(sc->msix_irq_res, M_DEVBUF);
4512 
4513 
4514 abort_with_msix:
4515 	pci_release_msi(sc->dev);
4516 
4517 abort_with_msix_table:
4518 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4519 			     sc->msix_table_res);
4520 
4521 	return err;
4522 }
4523 
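/* Fall back to a single MSI, or failing that legacy INTx, for slice 0 */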
4524 static int
4525 mxge_add_single_irq(mxge_softc_t *sc)
4526 {
4527 	int count, err, rid;
4528 
4529 	count = pci_msi_count(sc->dev);
4530 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4531 		rid = 1;
4532 	} else {
4533 		rid = 0;
4534 		sc->legacy_irq = 1;
4535 	}
4536 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4537 					 1, RF_SHAREABLE | RF_ACTIVE);
4538 	if (sc->irq_res == NULL) {
4539 		device_printf(sc->dev, "could not alloc interrupt\n");
4540 		return ENXIO;
4541 	}
4542 	if (mxge_verbose)
4543 		device_printf(sc->dev, "using %s irq %ld\n",
4544 			      sc->legacy_irq ? "INTx" : "MSI",
4545 			      rman_get_start(sc->irq_res));
4546 	err = bus_setup_intr(sc->dev, sc->irq_res,
4547 			     INTR_TYPE_NET | INTR_MPSAFE,
4548 #if __FreeBSD_version > 700030
4549 			     NULL,
4550 #endif
4551 			     mxge_intr, &sc->ss[0], &sc->ih);
4552 	if (err != 0) {
4553 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4554 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4555 		if (!sc->legacy_irq)
4556 			pci_release_msi(sc->dev);
4557 	}
4558 	return err;
4559 }
4560 
4561 static void
4562 mxge_rem_msix_irqs(mxge_softc_t *sc)
4563 {
4564 	int i, rid;
4565 
4566 	for (i = 0; i < sc->num_slices; i++) {
4567 		if (sc->msix_ih[i] != NULL) {
4568 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4569 					  sc->msix_ih[i]);
4570 			sc->msix_ih[i] = NULL;
4571 		}
4572 	}
4573 	free(sc->msix_ih, M_DEVBUF);
4574 
4575 	for (i = 0; i < sc->num_slices; i++) {
4576 		rid = i + 1;
4577 		if (sc->msix_irq_res[i] != NULL)
4578 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4579 					     sc->msix_irq_res[i]);
4580 		sc->msix_irq_res[i] = NULL;
4581 	}
4582 	free(sc->msix_irq_res, M_DEVBUF);
4583 
4584 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4585 			     sc->msix_table_res);
4586 
4587 	pci_release_msi(sc->dev);
4588 	return;
4589 }
4590 
4591 static void
4592 mxge_rem_single_irq(mxge_softc_t *sc)
4593 {
4594 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4595 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4596 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4597 	if (!sc->legacy_irq)
4598 		pci_release_msi(sc->dev);
4599 }
4600 
4601 static void
4602 mxge_rem_irq(mxge_softc_t *sc)
4603 {
4604 	if (sc->num_slices > 1)
4605 		mxge_rem_msix_irqs(sc);
4606 	else
4607 		mxge_rem_single_irq(sc);
4608 }
4609 
4610 static int
4611 mxge_add_irq(mxge_softc_t *sc)
4612 {
4613 	int err;
4614 
4615 	if (sc->num_slices > 1)
4616 		err = mxge_add_msix_irqs(sc);
4617 	else
4618 		err = mxge_add_single_irq(sc);
4619 
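	/* disabled debug path: tears down and re-adds the MSI-X vectors */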
4620 	if (0 && err == 0 && sc->num_slices > 1) {
4621 		mxge_rem_msix_irqs(sc);
4622 		err = mxge_add_msix_irqs(sc);
4623 	}
4624 	return err;
4625 }
4626 
4627 
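/*
 * Attach: map the board, parse the EEPROM strings, load firmware,
 * probe and allocate the slices, rings and interrupts, and attach
 * the ifnet.
 */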
4628 static int
4629 mxge_attach(device_t dev)
4630 {
4631 	mxge_softc_t *sc = device_get_softc(dev);
4632 	struct ifnet *ifp;
4633 	int err, rid;
4634 
4635 	sc->dev = dev;
4636 	mxge_fetch_tunables(sc);
4637 
4638 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4639 	sc->tq = taskqueue_create_fast("mxge_taskq", M_WAITOK,
4640 				       taskqueue_thread_enqueue,
4641 				       &sc->tq);
4642 	if (sc->tq == NULL) {
4643 		err = ENOMEM;
4644 		goto abort_with_nothing;
4645 	}
4646 
4647 	err = bus_dma_tag_create(NULL,			/* parent */
4648 				 1,			/* alignment */
4649 				 0,			/* boundary */
4650 				 BUS_SPACE_MAXADDR,	/* low */
4651 				 BUS_SPACE_MAXADDR,	/* high */
4652 				 NULL, NULL,		/* filter */
4653 				 65536 + 256,		/* maxsize */
4654 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4655 				 65536,			/* maxsegsize */
4656 				 0,			/* flags */
4657 				 NULL, NULL,		/* lock */
4658 				 &sc->parent_dmat);	/* tag */
4659 
4660 	if (err != 0) {
4661 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4662 			      err);
4663 		goto abort_with_tq;
4664 	}
4665 
4666 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4667 	if (ifp == NULL) {
4668 		device_printf(dev, "can not if_alloc()\n");
4669 		err = ENOSPC;
4670 		goto abort_with_parent_dmat;
4671 	}
4672 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4673 
4674 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4675 		 device_get_nameunit(dev));
4676 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4677 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4678 		 "%s:drv", device_get_nameunit(dev));
4679 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4680 		 MTX_NETWORK_LOCK, MTX_DEF);
4681 
4682 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4683 
4684 	mxge_setup_cfg_space(sc);
4685 
4686 	/* Map the board into the kernel */
4687 	rid = PCIR_BARS;
4688 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4689 					 ~0, 1, RF_ACTIVE);
4690 	if (sc->mem_res == NULL) {
4691 		device_printf(dev, "could not map memory\n");
4692 		err = ENXIO;
4693 		goto abort_with_lock;
4694 	}
4695 	sc->sram = rman_get_virtual(sc->mem_res);
4696 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4697 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4698 		device_printf(dev, "impossible memory region size %ld\n",
4699 			      rman_get_size(sc->mem_res));
4700 		err = ENXIO;
4701 		goto abort_with_mem_res;
4702 	}
4703 
4704 	/* make a NULL-terminated copy of the EEPROM strings section of
4705 	   the lanai SRAM */
4706 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4707 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4708 				rman_get_bushandle(sc->mem_res),
4709 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4710 				sc->eeprom_strings,
4711 				MXGE_EEPROM_STRINGS_SIZE - 2);
4712 	err = mxge_parse_strings(sc);
4713 	if (err != 0)
4714 		goto abort_with_mem_res;
4715 
4716 	/* Enable write combining for efficient use of PCIe bus */
4717 	mxge_enable_wc(sc);
4718 
4719 	/* Allocate the out of band dma memory */
4720 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4721 			     sizeof (mxge_cmd_t), 64);
4722 	if (err != 0)
4723 		goto abort_with_mem_res;
4724 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4725 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4726 	if (err != 0)
4727 		goto abort_with_cmd_dma;
4728 
4729 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4730 	if (err != 0)
4731 		goto abort_with_zeropad_dma;
4732 
4733 	/* select & load the firmware */
4734 	err = mxge_select_firmware(sc);
4735 	if (err != 0)
4736 		goto abort_with_dmabench;
4737 	sc->intr_coal_delay = mxge_intr_coal_delay;
4738 
4739 	mxge_slice_probe(sc);
4740 	err = mxge_alloc_slices(sc);
4741 	if (err != 0)
4742 		goto abort_with_dmabench;
4743 
4744 	err = mxge_reset(sc, 0);
4745 	if (err != 0)
4746 		goto abort_with_slices;
4747 
4748 	err = mxge_alloc_rings(sc);
4749 	if (err != 0) {
4750 		device_printf(sc->dev, "failed to allocate rings\n");
4751 		goto abort_with_slices;
4752 	}
4753 
4754 	err = mxge_add_irq(sc);
4755 	if (err != 0) {
4756 		device_printf(sc->dev, "failed to add irq\n");
4757 		goto abort_with_rings;
4758 	}
4759 
4760 	ifp->if_baudrate = IF_Gbps(10UL);
4761 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4762 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
4763 #ifdef INET
4764 	ifp->if_capabilities |= IFCAP_LRO;
4765 #endif
4766 
4767 #ifdef MXGE_NEW_VLAN_API
4768 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4769 
4770 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4771 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4772 	    sc->fw_ver_tiny >= 32)
4773 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4774 #endif
4775 
4776 	sc->max_mtu = mxge_max_mtu(sc);
4777 	if (sc->max_mtu >= 9000)
4778 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4779 	else
4780 		device_printf(dev, "MTU limited to %d.  Install "
4781 			      "latest firmware for 9000 byte jumbo support\n",
4782 			      sc->max_mtu - ETHER_HDR_LEN);
4783 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4784 	ifp->if_capenable = ifp->if_capabilities;
4785 	if (sc->lro_cnt == 0)
4786 		ifp->if_capenable &= ~IFCAP_LRO;
4787 	sc->csum_flag = 1;
4788 	ifp->if_init = mxge_init;
4789 	ifp->if_softc = sc;
4790 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4791 	ifp->if_ioctl = mxge_ioctl;
4792 	ifp->if_start = mxge_start;
4793 	/* Initialise the ifmedia structure */
4794 	ifmedia_init(&sc->media, 0, mxge_media_change,
4795 		     mxge_media_status);
4796 	mxge_media_init(sc);
4797 	mxge_media_probe(sc);
4798 	sc->dying = 0;
4799 	ether_ifattach(ifp, sc->mac_addr);
4800 	/* ether_ifattach sets mtu to ETHERMTU */
4801 	if (mxge_initial_mtu != ETHERMTU)
4802 		mxge_change_mtu(sc, mxge_initial_mtu);
4803 
4804 	mxge_add_sysctls(sc);
4805 #ifdef IFNET_BUF_RING
4806 	ifp->if_transmit = mxge_transmit;
4807 	ifp->if_qflush = mxge_qflush;
4808 #endif
4809 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4810 				device_get_nameunit(sc->dev));
4811 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4812 	return 0;
4813 
4814 abort_with_rings:
4815 	mxge_free_rings(sc);
4816 abort_with_slices:
4817 	mxge_free_slices(sc);
4818 abort_with_dmabench:
4819 	mxge_dma_free(&sc->dmabench_dma);
4820 abort_with_zeropad_dma:
4821 	mxge_dma_free(&sc->zeropad_dma);
4822 abort_with_cmd_dma:
4823 	mxge_dma_free(&sc->cmd_dma);
4824 abort_with_mem_res:
4825 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4826 abort_with_lock:
4827 	pci_disable_busmaster(dev);
4828 	mtx_destroy(&sc->cmd_mtx);
4829 	mtx_destroy(&sc->driver_mtx);
4830 	if_free(ifp);
4831 abort_with_parent_dmat:
4832 	bus_dma_tag_destroy(sc->parent_dmat);
4833 abort_with_tq:
4834 	if (sc->tq != NULL) {
4835 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4836 		taskqueue_free(sc->tq);
4837 		sc->tq = NULL;
4838 	}
4839 abort_with_nothing:
4840 	return err;
4841 }
4842 
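/*
 * Detach: refuse while VLANs are configured on the interface;
 * otherwise close it and release resources in reverse attach order.
 */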
4843 static int
4844 mxge_detach(device_t dev)
4845 {
4846 	mxge_softc_t *sc = device_get_softc(dev);
4847 
4848 	if (mxge_vlans_active(sc)) {
4849 		device_printf(sc->dev,
4850 			      "Detach vlans before removing module\n");
4851 		return EBUSY;
4852 	}
4853 	mtx_lock(&sc->driver_mtx);
4854 	sc->dying = 1;
4855 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4856 		mxge_close(sc, 0);
4857 	mtx_unlock(&sc->driver_mtx);
4858 	ether_ifdetach(sc->ifp);
4859 	if (sc->tq != NULL) {
4860 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4861 		taskqueue_free(sc->tq);
4862 		sc->tq = NULL;
4863 	}
4864 	callout_drain(&sc->co_hdl);
4865 	ifmedia_removeall(&sc->media);
4866 	mxge_dummy_rdma(sc, 0);
4867 	mxge_rem_sysctls(sc);
4868 	mxge_rem_irq(sc);
4869 	mxge_free_rings(sc);
4870 	mxge_free_slices(sc);
4871 	mxge_dma_free(&sc->dmabench_dma);
4872 	mxge_dma_free(&sc->zeropad_dma);
4873 	mxge_dma_free(&sc->cmd_dma);
4874 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4875 	pci_disable_busmaster(dev);
4876 	mtx_destroy(&sc->cmd_mtx);
4877 	mtx_destroy(&sc->driver_mtx);
4878 	if_free(sc->ifp);
4879 	bus_dma_tag_destroy(sc->parent_dmat);
4880 	return 0;
4881 }
4882 
4883 static int
4884 mxge_shutdown(device_t dev)
4885 {
4886 	return 0;
4887 }
4888 
4889 /*
4890   This file uses Myri10GE driver indentation.
4891 
4892   Local Variables:
4893   c-file-style:"linux"
4894   tab-width:8
4895   End:
4896 */
4897