/******************************************************************************
SPDX-License-Identifier: BSD-2-Clause-FreeBSD

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <contrib/zlib/zlib.h>
#include <dev/zlib/zcalloc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

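	/* page-aligned allocations larger than a page may span 4KB
	   boundaries within a single segment; anything smaller must
	   fit within one 4KB boundary */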
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
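		/* skip past the NUL to the start of the next record */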
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe0000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

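	/* extended config space is memory-mapped: 1MB per bus and
	   4KB per function, with 8 functions per slot */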
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
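	/* Max Read Request Size lives in bits 14:12 of the PCIe
	   Device Control register (capability offset 0x8); the
	   encoding 5 means 4KB */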
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = zcalloc_nowait;
	zs.zfree = zcfree;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL)
		goto abort_with_zs;
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
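		/* the readback forces the preceding posted PIO writes
		   to complete before the next chunk is copied */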
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

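	/* align the scratch buffer on an 8-byte boundary */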
	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

struct mxge_add_maddr_ctx {
	mxge_softc_t *sc;
	int error;
};

static u_int
mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
{
	struct mxge_add_maddr_ctx *ctx = arg;
	mxge_cmd_t cmd;

	if (ctx->error != 0)
		return (0);
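	/* pack the first 4 bytes of the link-level address into data0
	   and the remaining 2 bytes into data1 */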
	bcopy(LLADDR(sdl), &cmd.data0, 4);
	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
	cmd.data0 = htonl(cmd.data0);
	cmd.data1 = htonl(cmd.data1);

	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);

	return (1);
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	struct mxge_add_maddr_ctx ctx;
	struct ifnet *ifp = sc->ifp;
	mxge_cmd_t cmd;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */
	ctx.sc = sc;
	ctx.error = 0;
	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
	if (ctx.error != 0) {
		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
		    "error status: %d\n", ctx.error);
		/* abort, leaving multicast filtering off */
		return;
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

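/* report a big-endian firmware counter through sysctl by swapping it
   to host byte order and handing it to sysctl_handle_int() read-only */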
static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    sc, 0, mxge_change_intr_coal, "I",
	    "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    mxge_change_throttle, "I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "flow_control_enabled",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    mxge_change_flow_control, "I",
	    "flow control enabled");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
	    "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
	    "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_link_error_or_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
	    "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_link_overflow",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
	    "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_multicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
	    "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_no_big_buffer",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
	    "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_no_small_buffer",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
	    "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_overrun",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
	    "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_unicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
	    "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
			       0, "number of bad csums preventing LRO");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

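	/* common case: the chain does not wrap the ring, so copy the
	   requests two at a time (32 bytes per PIO write) */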
1701 		for (i = 0; i < (cnt - 1); i += 2) {
1702 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1703 			wmb(); /* force write every 32 bytes */
1704 			srcp += 2;
1705 			dstp += 2;
1706 		}
1707 	} else {
1708 		/* submit all but the first request, and ensure
1709 		   that it is submitted below */
1710 		mxge_submit_req_backwards(tx, src, cnt);
1711 		i = 0;
1712 	}
1713 	if (i < cnt) {
1714 		/* submit the first request */
1715 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1716 		wmb(); /* barrier before setting valid flag */
1717 	}
1718 
1719 	/* re-write the last 32-bits with the valid flags */
1720 	src->flags = last_flags;
1721 	src_ints = (uint32_t *)src;
1722 	src_ints+=3;
1723 	dst_ints = (volatile uint32_t *)dst;
1724 	dst_ints+=3;
1725 	*dst_ints =  *src_ints;
1726 	tx->req += cnt;
1727 	wmb();
1728 }
1729 
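/* parse the ethernet/IP/TCP headers of an outgoing frame, copying
   them into the per-slice scratch buffer when they straddle the first
   mbuf, and record the offsets needed for checksum and TSO offload */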
static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->ip_hlen = pi->ip->ip_hl << 2;
		if (!tso)
			return 0;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		nxt = 0;
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}

#if IFCAP_TSO4

1812 static void
1813 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1814 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1815 {
1816 	mxge_tx_ring_t *tx;
1817 	mcp_kreq_ether_send_t *req;
1818 	bus_dma_segment_t *seg;
1819 	uint32_t low, high_swapped;
1820 	int len, seglen, cum_len, cum_len_next;
1821 	int next_is_first, chop, cnt, rdma_count, small;
1822 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1823 	uint8_t flags, flags_next;
1824 	static int once;
1825 
1826 	mss = m->m_pkthdr.tso_segsz;
1827 
1828 	/* negative cum_len signifies to the
1829 	 * send loop that we are still in the
1830 	 * header portion of the TSO packet.
1831 	 */
1832 
1833 	cksum_offset = pi->ip_off + pi->ip_hlen;
1834 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
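	/*
	 * Illustrative numbers (editor's addition): for a plain IPv4/TCP
	 * frame with no options, ip_off = 14, ip_hlen = 20 and
	 * th_off = 5 words, so cum_len starts at -(14 + 20 + 20) = -54
	 * and crosses zero exactly where the TSO payload begins.
	 */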
1835 
1836 	/* TSO implies checksum offload on this hardware */
1837 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1838 		/*
1839 		 * If packet has full TCP csum, replace it with pseudo hdr
1840 		 * sum that the NIC expects, otherwise the NIC will emit
1841 		 * packets with bad TCP checksums.
1842 		 */
1843 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1844 		if (pi->ip6) {
1845 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1846 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1847 			sum = in6_cksum_pseudo(pi->ip6,
1848 			    m->m_pkthdr.len - cksum_offset,
1849 			    IPPROTO_TCP, 0);
1850 #endif
1851 		} else {
1852 #ifdef INET
1853 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1854 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1855 			    pi->ip->ip_dst.s_addr,
1856 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1857 				    cksum_offset)));
1858 #endif
1859 		}
1860 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1861 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1862 	}
1863 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1864 
1865 	/* for TSO, pseudo_hdr_offset holds mss.
1866 	 * The firmware figures out where to put
1867 	 * the checksum by parsing the header. */
1868 	pseudo_hdr_offset = htobe16(mss);
1869 
1870 	if (pi->ip6) {
1871 		/*
1872 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1873 		 * to store the TCP header len
1874 		 */
1875 		cksum_offset = (pi->tcp->th_off << 2);
1876 	}
1877 
1878 	tx = &ss->tx;
1879 	req = tx->req_list;
1880 	seg = tx->seg_list;
1881 	cnt = 0;
1882 	rdma_count = 0;
1883 	/* "rdma_count" is the number of RDMAs belonging to the
1884 	 * current packet BEFORE the current send request. For
1885 	 * non-TSO packets, this is equal to "count".
1886 	 * For TSO packets, rdma_count needs to be reset
1887 	 * to 0 after a segment cut.
1888 	 *
1889 	 * The rdma_count field of the send request is
1890 	 * the number of RDMAs of the packet starting at
1891 	 * that request. For TSO send requests with one or more cuts
1892 	 * in the middle, this is the number of RDMAs starting
1893 	 * after the last cut in the request. All previous
1894 	 * segments before the last cut implicitly have 1 RDMA.
1895 	 *
1896 	 * Since the number of RDMAs is not known beforehand,
1897 	 * it must be filled-in retroactively - after each
1898 	 * segmentation cut or at the end of the entire packet.
1899 	 */
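	/*
	 * Worked example (editor's illustration, not in the original):
	 * suppose a segment cut falls after the third descriptor of a
	 * request list.  The descriptors before the cut implicitly keep
	 * rdma_count = 1; when the cut is taken, rdma_count is reset and
	 * the "(req-rdma_count)->rdma_count = rdma_count + 1" store below
	 * back-fills the descriptor that started the current RDMA run.
	 * The final "(req-rdma_count)->rdma_count = rdma_count" after the
	 * loop closes out the last run at the end of the packet.
	 */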
1900 
1901 	while (busdma_seg_cnt) {
1902 		/* Break the busdma segment up into pieces */
1903 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1904 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1905 		len = seg->ds_len;
1906 
1907 		while (len) {
1908 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1909 			seglen = len;
1910 			cum_len_next = cum_len + seglen;
1911 			(req-rdma_count)->rdma_count = rdma_count + 1;
1912 			if (__predict_true(cum_len >= 0)) {
1913 				/* payload */
1914 				chop = (cum_len_next > mss);
1915 				cum_len_next = cum_len_next % mss;
1916 				next_is_first = (cum_len_next == 0);
1917 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1918 				flags_next |= next_is_first *
1919 					MXGEFW_FLAGS_FIRST;
1920 				rdma_count |= -(chop | next_is_first);
1921 				rdma_count += chop & !next_is_first;
1922 			} else if (cum_len_next >= 0) {
1923 				/* header ends */
1924 				rdma_count = -1;
1925 				cum_len_next = 0;
1926 				seglen = -cum_len;
1927 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1928 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1929 					MXGEFW_FLAGS_FIRST |
1930 					(small * MXGEFW_FLAGS_SMALL);
1931 			}
1932 
1933 			req->addr_high = high_swapped;
1934 			req->addr_low = htobe32(low);
1935 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1936 			req->pad = 0;
1937 			req->rdma_count = 1;
1938 			req->length = htobe16(seglen);
1939 			req->cksum_offset = cksum_offset;
1940 			req->flags = flags | ((cum_len & 1) *
1941 					      MXGEFW_FLAGS_ALIGN_ODD);
1942 			low += seglen;
1943 			len -= seglen;
1944 			cum_len = cum_len_next;
1945 			flags = flags_next;
1946 			req++;
1947 			cnt++;
1948 			rdma_count++;
1949 			if (cksum_offset != 0 && !pi->ip6) {
1950 				if (__predict_false(cksum_offset > seglen))
1951 					cksum_offset -= seglen;
1952 				else
1953 					cksum_offset = 0;
1954 			}
1955 			if (__predict_false(cnt > tx->max_desc))
1956 				goto drop;
1957 		}
1958 		busdma_seg_cnt--;
1959 		seg++;
1960 	}
1961 	(req-rdma_count)->rdma_count = rdma_count;
1962 
1963 	do {
1964 		req--;
1965 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1966 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1967 
1968 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1969 	mxge_submit_req(tx, tx->req_list, cnt);
1970 #ifdef IFNET_BUF_RING
1971 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1972 		/* tell the NIC to start polling this slice */
1973 		*tx->send_go = 1;
1974 		tx->queue_active = 1;
1975 		tx->activate++;
1976 		wmb();
1977 	}
1978 #endif
1979 	return;
1980 
1981 drop:
1982 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1983 	m_freem(m);
1984 	ss->oerrors++;
1985 	if (!once) {
1986 		printf("tx->max_desc exceeded via TSO!\n");
1987 		printf("mss = %d, %ld, %d!\n", mss,
1988 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1989 		once = 1;
1990 	}
1991 	return;
1993 }
1994 
1995 #endif /* IFCAP_TSO4 */
1996 
1997 #ifdef MXGE_NEW_VLAN_API
1998 /*
1999  * We reproduce the software vlan tag insertion from
2000  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2001  * vlan tag insertion. We need to advertise this in order to have the
2002  * vlan interface respect our csum offload flags.
2003  */
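/*
 * Frame layout for reference (editor's addition; standard 802.1Q, not
 * driver-specific):
 *   before: | dst(6) | src(6) |            type(2) | payload |
 *   after:  | dst(6) | src(6) | 0x8100(2) | tag(2) | type(2) | payload |
 * mxge_vlan_tag_insert() below makes room with M_PREPEND and slides the
 * two MAC addresses back by ETHER_VLAN_ENCAP_LEN (4) bytes.
 */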
2004 static struct mbuf *
2005 mxge_vlan_tag_insert(struct mbuf *m)
2006 {
2007 	struct ether_vlan_header *evl;
2008 
2009 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2010 	if (__predict_false(m == NULL))
2011 		return NULL;
2012 	if (m->m_len < sizeof(*evl)) {
2013 		m = m_pullup(m, sizeof(*evl));
2014 		if (__predict_false(m == NULL))
2015 			return NULL;
2016 	}
2017 	/*
2018 	 * Transform the Ethernet header into an Ethernet header
2019 	 * with 802.1Q encapsulation.
2020 	 */
2021 	evl = mtod(m, struct ether_vlan_header *);
2022 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2023 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2024 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2025 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2026 	m->m_flags &= ~M_VLANTAG;
2027 	return m;
2028 }
2029 #endif /* MXGE_NEW_VLAN_API */
2030 
2031 static void
2032 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2033 {
2034 	struct mxge_pkt_info pi = {0,0,0,0};
2035 	mxge_softc_t *sc;
2036 	mcp_kreq_ether_send_t *req;
2037 	bus_dma_segment_t *seg;
2038 	struct mbuf *m_tmp;
2039 	mxge_tx_ring_t *tx;
2040 	int cnt, cum_len, err, i, idx, odd_flag;
2041 	uint16_t pseudo_hdr_offset;
2042 	uint8_t flags, cksum_offset;
2043 
2044 	sc = ss->sc;
2045 	tx = &ss->tx;
2046 
2047 #ifdef MXGE_NEW_VLAN_API
2048 	if (m->m_flags & M_VLANTAG) {
2049 		m = mxge_vlan_tag_insert(m);
2050 		if (__predict_false(m == NULL))
2051 			goto drop_without_m;
2052 	}
2053 #endif
2054 	if (m->m_pkthdr.csum_flags &
2055 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2056 		if (mxge_parse_tx(ss, m, &pi))
2057 			goto drop;
2058 	}
2059 
2060 	/* (try to) map the frame for DMA */
2061 	idx = tx->req & tx->mask;
2062 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2063 				      m, tx->seg_list, &cnt,
2064 				      BUS_DMA_NOWAIT);
2065 	if (__predict_false(err == EFBIG)) {
2066 		/* Too many segments in the chain.  Try
2067 		   to defrag */
2068 		m_tmp = m_defrag(m, M_NOWAIT);
2069 		if (m_tmp == NULL) {
2070 			goto drop;
2071 		}
2072 		ss->tx.defrag++;
2073 		m = m_tmp;
2074 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2075 					      tx->info[idx].map,
2076 					      m, tx->seg_list, &cnt,
2077 					      BUS_DMA_NOWAIT);
2078 	}
2079 	if (__predict_false(err != 0)) {
2080 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2081 			      " packet len = %d\n", err, m->m_pkthdr.len);
2082 		goto drop;
2083 	}
2084 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2085 			BUS_DMASYNC_PREWRITE);
2086 	tx->info[idx].m = m;
2087 
2088 #if IFCAP_TSO4
2089 	/* TSO is different enough, we handle it in another routine */
2090 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2091 		mxge_encap_tso(ss, m, cnt, &pi);
2092 		return;
2093 	}
2094 #endif
2095 
2096 	req = tx->req_list;
2097 	cksum_offset = 0;
2098 	pseudo_hdr_offset = 0;
2099 	flags = MXGEFW_FLAGS_NO_TSO;
2100 
2101 	/* checksum offloading? */
2102 	if (m->m_pkthdr.csum_flags &
2103 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2104 		/* tell the firmware where the transport checksum
2105 		   field lives, relative to the frame start */
2106 		cksum_offset = pi.ip_off + pi.ip_hlen;
2107 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2108 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2109 		req->cksum_offset = cksum_offset;
2110 		flags |= MXGEFW_FLAGS_CKSUM;
2111 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2112 	} else {
2113 		odd_flag = 0;
2114 	}
2115 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2116 		flags |= MXGEFW_FLAGS_SMALL;
2117 
2118 	/* convert segments into a request list */
2119 	cum_len = 0;
2120 	seg = tx->seg_list;
2121 	req->flags = MXGEFW_FLAGS_FIRST;
2122 	for (i = 0; i < cnt; i++) {
2123 		req->addr_low =
2124 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2125 		req->addr_high =
2126 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2127 		req->length = htobe16(seg->ds_len);
2128 		req->cksum_offset = cksum_offset;
2129 		if (cksum_offset > seg->ds_len)
2130 			cksum_offset -= seg->ds_len;
2131 		else
2132 			cksum_offset = 0;
2133 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2134 		req->pad = 0; /* complete solid 16-byte block */
2135 		req->rdma_count = 1;
2136 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2137 		cum_len += seg->ds_len;
2138 		seg++;
2139 		req++;
2140 		req->flags = 0;
2141 	}
2142 	req--;
2143 	/* pad runts to 60 bytes */
2144 	if (cum_len < 60) {
2145 		req++;
2146 		req->addr_low =
2147 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2148 		req->addr_high =
2149 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2150 		req->length = htobe16(60 - cum_len);
2151 		req->cksum_offset = 0;
2152 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2153 		req->pad = 0; /* complete solid 16-byte block */
2154 		req->rdma_count = 1;
2155 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2156 		cnt++;
2157 	}
2158 
2159 	tx->req_list[0].rdma_count = cnt;
2160 #if 0
2161 	/* print what the firmware will see */
2162 	for (i = 0; i < cnt; i++) {
2163 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2164 		    "cso:%d, flags:0x%x, rdma:%d\n",
2165 		    i, (int)ntohl(tx->req_list[i].addr_high),
2166 		    (int)ntohl(tx->req_list[i].addr_low),
2167 		    (int)ntohs(tx->req_list[i].length),
2168 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2169 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2170 		    tx->req_list[i].rdma_count);
2171 	}
2172 	printf("--------------\n");
2173 #endif
2174 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2175 	mxge_submit_req(tx, tx->req_list, cnt);
2176 #ifdef IFNET_BUF_RING
2177 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2178 		/* tell the NIC to start polling this slice */
2179 		*tx->send_go = 1;
2180 		tx->queue_active = 1;
2181 		tx->activate++;
2182 		wmb();
2183 	}
2184 #endif
2185 	return;
2186 
2187 drop:
2188 	m_freem(m);
2189 drop_without_m:
2190 	ss->oerrors++;
2191 	return;
2192 }
2193 
2194 #ifdef IFNET_BUF_RING
2195 static void
2196 mxge_qflush(struct ifnet *ifp)
2197 {
2198 	mxge_softc_t *sc = ifp->if_softc;
2199 	mxge_tx_ring_t *tx;
2200 	struct mbuf *m;
2201 	int slice;
2202 
2203 	for (slice = 0; slice < sc->num_slices; slice++) {
2204 		tx = &sc->ss[slice].tx;
2205 		mtx_lock(&tx->mtx);
2206 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2207 			m_freem(m);
2208 		mtx_unlock(&tx->mtx);
2209 	}
2210 	if_qflush(ifp);
2211 }
2212 
2213 static inline void
2214 mxge_start_locked(struct mxge_slice_state *ss)
2215 {
2216 	mxge_softc_t *sc;
2217 	struct mbuf *m;
2218 	struct ifnet *ifp;
2219 	mxge_tx_ring_t *tx;
2220 
2221 	sc = ss->sc;
2222 	ifp = sc->ifp;
2223 	tx = &ss->tx;
2224 
2225 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2226 		m = drbr_dequeue(ifp, tx->br);
2227 		if (m == NULL) {
2228 			return;
2229 		}
2230 		/* let BPF see it */
2231 		BPF_MTAP(ifp, m);
2232 
2233 		/* give it to the nic */
2234 		mxge_encap(ss, m);
2235 	}
2236 	/* ran out of transmit slots */
2237 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2238 	    && (!drbr_empty(ifp, tx->br))) {
2239 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2240 		tx->stall++;
2241 	}
2242 }
2243 
2244 static int
2245 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2246 {
2247 	mxge_softc_t *sc;
2248 	struct ifnet *ifp;
2249 	mxge_tx_ring_t *tx;
2250 	int err;
2251 
2252 	sc = ss->sc;
2253 	ifp = sc->ifp;
2254 	tx = &ss->tx;
2255 
2256 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2257 	    IFF_DRV_RUNNING) {
2258 		err = drbr_enqueue(ifp, tx->br, m);
2259 		return (err);
2260 	}
2261 
2262 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2263 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2264 		/* let BPF see it */
2265 		BPF_MTAP(ifp, m);
2266 		/* give it to the nic */
2267 		mxge_encap(ss, m);
2268 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2269 		return (err);
2270 	}
2271 	if (!drbr_empty(ifp, tx->br))
2272 		mxge_start_locked(ss);
2273 	return (0);
2274 }
2275 
2276 static int
2277 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2278 {
2279 	mxge_softc_t *sc = ifp->if_softc;
2280 	struct mxge_slice_state *ss;
2281 	mxge_tx_ring_t *tx;
2282 	int err = 0;
2283 	int slice;
2284 
2285 	slice = m->m_pkthdr.flowid;
2286 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
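	/* e.g. with 4 slices, flowid 13 maps to slice 13 & 3 == 1
	   (editor's illustration) */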
2287 
2288 	ss = &sc->ss[slice];
2289 	tx = &ss->tx;
2290 
2291 	if (mtx_trylock(&tx->mtx)) {
2292 		err = mxge_transmit_locked(ss, m);
2293 		mtx_unlock(&tx->mtx);
2294 	} else {
2295 		err = drbr_enqueue(ifp, tx->br, m);
2296 	}
2297 
2298 	return (err);
2299 }
2300 
2301 #else
2302 
2303 static inline void
2304 mxge_start_locked(struct mxge_slice_state *ss)
2305 {
2306 	mxge_softc_t *sc;
2307 	struct mbuf *m;
2308 	struct ifnet *ifp;
2309 	mxge_tx_ring_t *tx;
2310 
2311 	sc = ss->sc;
2312 	ifp = sc->ifp;
2313 	tx = &ss->tx;
2314 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2315 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2316 		if (m == NULL) {
2317 			return;
2318 		}
2319 		/* let BPF see it */
2320 		BPF_MTAP(ifp, m);
2321 
2322 		/* give it to the nic */
2323 		mxge_encap(ss, m);
2324 	}
2325 	/* ran out of transmit slots */
2326 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2327 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2328 		tx->stall++;
2329 	}
2330 }
2331 #endif
2332 static void
2333 mxge_start(struct ifnet *ifp)
2334 {
2335 	mxge_softc_t *sc = ifp->if_softc;
2336 	struct mxge_slice_state *ss;
2337 
2338 	/* only use the first slice for now */
2339 	ss = &sc->ss[0];
2340 	mtx_lock(&ss->tx.mtx);
2341 	mxge_start_locked(ss);
2342 	mtx_unlock(&ss->tx.mtx);
2343 }
2344 
2345 /*
2346  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2347  * at most 32 bytes at a time, so as to avoid involving the software
2348  * pio handler in the nic.   We re-write the first segment's low
2349  * DMA address to mark it valid only after we write the entire chunk
2350  * in a burst
2351  */
2352 static inline void
2353 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2354 		mcp_kreq_ether_recv_t *src)
2355 {
2356 	uint32_t low;
2357 
2358 	low = src->addr_low;
2359 	src->addr_low = 0xffffffff;
2360 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2361 	wmb();
2362 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2363 	wmb();
2364 	src->addr_low = low;
2365 	dst->addr_low = low;
2366 	wmb();
2367 }
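
/*
 * Editor's sketch of how the receive rings use the helper above: buffers
 * are posted in batches of eight -- see the "(idx & 7) == 7" tests in
 * mxge_get_buf_small()/mxge_get_buf_big() below.  addr_low doubles as the
 * valid flag: it is parked at 0xffffffff for the burst and restored last,
 * so the NIC ignores the whole batch until that final 32-bit store.
 */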
2368 
2369 static int
2370 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2371 {
2372 	bus_dma_segment_t seg;
2373 	struct mbuf *m;
2374 	mxge_rx_ring_t *rx = &ss->rx_small;
2375 	int cnt, err;
2376 
2377 	m = m_gethdr(M_NOWAIT, MT_DATA);
2378 	if (m == NULL) {
2379 		rx->alloc_fail++;
2380 		err = ENOBUFS;
2381 		goto done;
2382 	}
2383 	m->m_len = MHLEN;
2384 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2385 				      &seg, &cnt, BUS_DMA_NOWAIT);
2386 	if (err != 0) {
2387 		m_free(m);
2388 		goto done;
2389 	}
2390 	rx->info[idx].m = m;
2391 	rx->shadow[idx].addr_low =
2392 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2393 	rx->shadow[idx].addr_high =
2394 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2395 
2396 done:
2397 	if ((idx & 7) == 7)
2398 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2399 	return err;
2400 }
2401 
2402 static int
2403 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2404 {
2405 	bus_dma_segment_t seg[3];
2406 	struct mbuf *m;
2407 	mxge_rx_ring_t *rx = &ss->rx_big;
2408 	int cnt, err, i;
2409 
2410 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2411 	if (m == NULL) {
2412 		rx->alloc_fail++;
2413 		err = ENOBUFS;
2414 		goto done;
2415 	}
2416 	m->m_len = rx->mlen;
2417 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2418 				      seg, &cnt, BUS_DMA_NOWAIT);
2419 	if (err != 0) {
2420 		m_free(m);
2421 		goto done;
2422 	}
2423 	rx->info[idx].m = m;
2424 	rx->shadow[idx].addr_low =
2425 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2426 	rx->shadow[idx].addr_high =
2427 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2428 
2429 #if MXGE_VIRT_JUMBOS
2430 	for (i = 1; i < cnt; i++) {
2431 		rx->shadow[idx + i].addr_low =
2432 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2433 		rx->shadow[idx + i].addr_high =
2434 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2435 	}
2436 #endif
2437 
2438 done:
2439 	for (i = 0; i < rx->nbufs; i++) {
2440 		if ((idx & 7) == 7) {
2441 			mxge_submit_8rx(&rx->lanai[idx - 7],
2442 					&rx->shadow[idx - 7]);
2443 		}
2444 		idx++;
2445 	}
2446 	return err;
2447 }
2448 
2449 #ifdef INET6
2450 
2451 static uint16_t
2452 mxge_csum_generic(uint16_t *raw, int len)
2453 {
2454 	uint32_t csum;
2455 
2456 	csum = 0;
2457 	while (len > 0) {
2458 		csum += *raw;
2459 		raw++;
2460 		len -= 2;
2461 	}
2462 	csum = (csum >> 16) + (csum & 0xffff);
2463 	csum = (csum >> 16) + (csum & 0xffff);
2464 	return (uint16_t)csum;
2465 }
2466 
2467 static inline uint16_t
2468 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2469 {
2470 	uint32_t partial;
2471 	int nxt, cksum_offset;
2472 	struct ip6_hdr *ip6 = p;
2473 	uint16_t c;
2474 
2475 	nxt = ip6->ip6_nxt;
2476 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2477 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2478 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2479 					   IPPROTO_IPV6, &nxt);
2480 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2481 			return (1);
2482 	}
2483 
2484 	/*
2485 	 * IPv6 headers do not contain a checksum, and hence
2486 	 * do not checksum to zero, so they don't "fall out"
2487 	 * of the partial checksum calculation like IPv4
2488 	 * headers do.  We need to fix the partial checksum by
2489 	 * subtracting the checksum of the IPv6 header.
2490 	 */
2491 
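	/*
	 * Ones-complement subtraction below, with made-up numbers for
	 * illustration (editor's addition): if the NIC reported
	 * csum = 0x1234 and the IPv6 header sums to partial = 0xf00d,
	 * then csum + ~partial (plus the end-around carry) leaves the
	 * checksum of everything except that header, which is what
	 * in6_cksum_pseudo() expects as its initial sum.
	 */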
2492 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2493 				    ETHER_HDR_LEN);
2494 	csum += ~partial;
2495 	csum +=	 (csum < ~partial);
2496 	csum = (csum >> 16) + (csum & 0xFFFF);
2497 	csum = (csum >> 16) + (csum & 0xFFFF);
2498 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2499 			     csum);
2500 	c ^= 0xffff;
2501 	return (c);
2502 }
2503 #endif /* INET6 */
2504 /*
2505  *  Myri10GE hardware checksums are not valid if the sender
2506  *  padded the frame with non-zero padding.  This is because
2507  *  the firmware just does a simple 16-bit 1s complement
2508  *  checksum across the entire frame, excluding the first 14
2509  *  bytes.  It is best to simply check the checksum and
2510  *  tell the stack about it only if the checksum is good.
2511  */
2512 
2513 static inline uint16_t
2514 mxge_rx_csum(struct mbuf *m, int csum)
2515 {
2516 	struct ether_header *eh;
2517 #ifdef INET
2518 	struct ip *ip;
2519 #endif
2520 #if defined(INET) || defined(INET6)
2521 	int cap = m->m_pkthdr.rcvif->if_capenable;
2522 #endif
2523 	uint16_t c, etype;
2524 
2525 	eh = mtod(m, struct ether_header *);
2526 	etype = ntohs(eh->ether_type);
2527 	switch (etype) {
2528 #ifdef INET
2529 	case ETHERTYPE_IP:
2530 		if ((cap & IFCAP_RXCSUM) == 0)
2531 			return (1);
2532 		ip = (struct ip *)(eh + 1);
2533 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2534 			return (1);
2535 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2536 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2537 				    (ip->ip_hl << 2) + ip->ip_p));
2538 		c ^= 0xffff;
2539 		break;
2540 #endif
2541 #ifdef INET6
2542 	case ETHERTYPE_IPV6:
2543 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2544 			return (1);
2545 		c = mxge_rx_csum6((eh + 1), m, csum);
2546 		break;
2547 #endif
2548 	default:
2549 		c = 1;
2550 	}
2551 	return (c);
2552 }
2553 
2554 static void
2555 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2556 {
2557 	struct ether_vlan_header *evl;
2558 	uint32_t partial;
2559 
2560 	evl = mtod(m, struct ether_vlan_header *);
2561 
2562 	/*
2563 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2564 	 * after what the firmware thought was the end of the ethernet
2565 	 * header.
2566 	 */
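	/*
	 * Editor's note: this is the same ones-complement trick as in the
	 * rx checksum path -- "partial" is the 32-bit chunk sitting just
	 * past the 14 bytes the firmware skipped (the VLAN tag plus the
	 * encapsulated type field), and adding its complement with
	 * end-around carry removes its contribution from the hardware
	 * checksum.
	 */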
2567 
2568 	/* put checksum into host byte order */
2569 	*csum = ntohs(*csum);
2570 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2571 	(*csum) += ~partial;
2572 	(*csum) +=  ((*csum) < ~partial);
2573 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2574 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2575 
2576 	/* restore checksum to network byte order;
2577 	   later consumers expect this */
2578 	*csum = htons(*csum);
2579 
2580 	/* save the tag */
2581 #ifdef MXGE_NEW_VLAN_API
2582 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2583 #else
2584 	{
2585 		struct m_tag *mtag;
2586 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2587 				   M_NOWAIT);
2588 		if (mtag == NULL)
2589 			return;
2590 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2591 		m_tag_prepend(m, mtag);
2592 	}
2593 
2594 #endif
2595 	m->m_flags |= M_VLANTAG;
2596 
2597 	/*
2598 	 * Remove the 802.1q header by copying the Ethernet
2599 	 * addresses over it and adjusting the beginning of
2600 	 * the data in the mbuf.  The encapsulated Ethernet
2601 	 * type field is already in place.
2602 	 */
2603 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2604 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2605 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2606 }
2607 
2608 static inline void
2609 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2610 		 uint32_t csum, int lro)
2611 {
2612 	mxge_softc_t *sc;
2613 	struct ifnet *ifp;
2614 	struct mbuf *m;
2615 	struct ether_header *eh;
2616 	mxge_rx_ring_t *rx;
2617 	bus_dmamap_t old_map;
2618 	int idx;
2619 
2620 	sc = ss->sc;
2621 	ifp = sc->ifp;
2622 	rx = &ss->rx_big;
2623 	idx = rx->cnt & rx->mask;
2624 	rx->cnt += rx->nbufs;
2625 	/* save a pointer to the received mbuf */
2626 	m = rx->info[idx].m;
2627 	/* try to replace the received mbuf */
2628 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2629 		/* drop the frame -- the old mbuf is recycled */
2630 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2631 		return;
2632 	}
2633 
2634 	/* unmap the received buffer */
2635 	old_map = rx->info[idx].map;
2636 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2637 	bus_dmamap_unload(rx->dmat, old_map);
2638 
2639 	/* swap the bus_dmamap_t's */
2640 	rx->info[idx].map = rx->extra_map;
2641 	rx->extra_map = old_map;
2642 
2643 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2644 	 * aligned */
2645 	m->m_data += MXGEFW_PAD;
2646 
2647 	m->m_pkthdr.rcvif = ifp;
2648 	m->m_len = m->m_pkthdr.len = len;
2649 	ss->ipackets++;
2650 	eh = mtod(m, struct ether_header *);
2651 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2652 		mxge_vlan_tag_remove(m, &csum);
2653 	}
2654 	/* flowid only valid if RSS hashing is enabled */
2655 	if (sc->num_slices > 1) {
2656 		m->m_pkthdr.flowid = (ss - sc->ss);
2657 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2658 	}
2659 	/* if the checksum is valid, mark it in the mbuf header */
2660 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2661 	    (0 == mxge_rx_csum(m, csum))) {
2662 		/* Tell the stack that the checksum is good */
2663 		m->m_pkthdr.csum_data = 0xffff;
2664 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2665 			CSUM_DATA_VALID;
2666 
2667 #if defined(INET) || defined (INET6)
2668 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2669 			return;
2670 #endif
2671 	}
2672 	/* pass the frame up the stack */
2673 	(*ifp->if_input)(ifp, m);
2674 }
2675 
2676 static inline void
2677 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2678 		   uint32_t csum, int lro)
2679 {
2680 	mxge_softc_t *sc;
2681 	struct ifnet *ifp;
2682 	struct ether_header *eh;
2683 	struct mbuf *m;
2684 	mxge_rx_ring_t *rx;
2685 	bus_dmamap_t old_map;
2686 	int idx;
2687 
2688 	sc = ss->sc;
2689 	ifp = sc->ifp;
2690 	rx = &ss->rx_small;
2691 	idx = rx->cnt & rx->mask;
2692 	rx->cnt++;
2693 	/* save a pointer to the received mbuf */
2694 	m = rx->info[idx].m;
2695 	/* try to replace the received mbuf */
2696 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2697 		/* drop the frame -- the old mbuf is recycled */
2698 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2699 		return;
2700 	}
2701 
2702 	/* unmap the received buffer */
2703 	old_map = rx->info[idx].map;
2704 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2705 	bus_dmamap_unload(rx->dmat, old_map);
2706 
2707 	/* swap the bus_dmamap_t's */
2708 	rx->info[idx].map = rx->extra_map;
2709 	rx->extra_map = old_map;
2710 
2711 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2712 	 * aligned */
2713 	m->m_data += MXGEFW_PAD;
2714 
2715 	m->m_pkthdr.rcvif = ifp;
2716 	m->m_len = m->m_pkthdr.len = len;
2717 	ss->ipackets++;
2718 	eh = mtod(m, struct ether_header *);
2719 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2720 		mxge_vlan_tag_remove(m, &csum);
2721 	}
2722 	/* flowid only valid if RSS hashing is enabled */
2723 	if (sc->num_slices > 1) {
2724 		m->m_pkthdr.flowid = (ss - sc->ss);
2725 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2726 	}
2727 	/* if the checksum is valid, mark it in the mbuf header */
2728 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2729 	    (0 == mxge_rx_csum(m, csum))) {
2730 		/* Tell the stack that the checksum is good */
2731 		m->m_pkthdr.csum_data = 0xffff;
2732 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2733 			CSUM_DATA_VALID;
2734 
2735 #if defined(INET) || defined (INET6)
2736 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2737 			return;
2738 #endif
2739 	}
2740 	/* pass the frame up the stack */
2741 	(*ifp->if_input)(ifp, m);
2742 }
2743 
2744 static inline void
2745 mxge_clean_rx_done(struct mxge_slice_state *ss)
2746 {
2747 	mxge_rx_done_t *rx_done = &ss->rx_done;
2748 	int limit = 0;
2749 	uint16_t length;
2750 	uint16_t checksum;
2751 	int lro;
2752 
2753 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2754 	while (rx_done->entry[rx_done->idx].length != 0) {
2755 		length = ntohs(rx_done->entry[rx_done->idx].length);
2756 		rx_done->entry[rx_done->idx].length = 0;
2757 		checksum = rx_done->entry[rx_done->idx].checksum;
2758 		if (length <= (MHLEN - MXGEFW_PAD))
2759 			mxge_rx_done_small(ss, length, checksum, lro);
2760 		else
2761 			mxge_rx_done_big(ss, length, checksum, lro);
2762 		rx_done->cnt++;
2763 		rx_done->idx = rx_done->cnt & rx_done->mask;
2764 
2765 		/* limit potential for livelock */
2766 		if (__predict_false(++limit > rx_done->mask / 2))
2767 			break;
2768 	}
2769 #if defined(INET)  || defined (INET6)
2770 	tcp_lro_flush_all(&ss->lc);
2771 #endif
2772 }
2773 
2774 static inline void
2775 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2776 {
2777 	struct ifnet *ifp __unused;
2778 	mxge_tx_ring_t *tx;
2779 	struct mbuf *m;
2780 	bus_dmamap_t map;
2781 	int idx;
2782 	int *flags;
2783 
2784 	tx = &ss->tx;
2785 	ifp = ss->sc->ifp;
2786 	while (tx->pkt_done != mcp_idx) {
2787 		idx = tx->done & tx->mask;
2788 		tx->done++;
2789 		m = tx->info[idx].m;
2790 		/* mbuf and DMA map only attached to the first
2791 		   segment per-mbuf */
2792 		if (m != NULL) {
2793 			ss->obytes += m->m_pkthdr.len;
2794 			if (m->m_flags & M_MCAST)
2795 				ss->omcasts++;
2796 			ss->opackets++;
2797 			tx->info[idx].m = NULL;
2798 			map = tx->info[idx].map;
2799 			bus_dmamap_unload(tx->dmat, map);
2800 			m_freem(m);
2801 		}
2802 		if (tx->info[idx].flag) {
2803 			tx->info[idx].flag = 0;
2804 			tx->pkt_done++;
2805 		}
2806 	}
2807 
2808 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2809 	   it's OK to send packets */
2810 #ifdef IFNET_BUF_RING
2811 	flags = &ss->if_drv_flags;
2812 #else
2813 	flags = &ifp->if_drv_flags;
2814 #endif
2815 	mtx_lock(&ss->tx.mtx);
2816 	if ((*flags) & IFF_DRV_OACTIVE &&
2817 	    tx->req - tx->done < (tx->mask + 1)/4) {
2818 		*(flags) &= ~IFF_DRV_OACTIVE;
2819 		ss->tx.wake++;
2820 		mxge_start_locked(ss);
2821 	}
2822 #ifdef IFNET_BUF_RING
2823 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2824 		/* let the NIC stop polling this queue, since there
2825 		 * are no more transmits pending */
2826 		*tx->send_stop = 1;
2827 		tx->queue_active = 0;
2828 		tx->deactivate++;
2829 		wmb();
2830 	}
2833 #endif
2834 	mtx_unlock(&ss->tx.mtx);
2836 }
2837 
2838 static struct mxge_media_type mxge_xfp_media_types[] =
2839 {
2840 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2841 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2842 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2843 	{0,		(1 << 5),	"10GBASE-ER"},
2844 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2845 	{0,		(1 << 3),	"10GBASE-SW"},
2846 	{0,		(1 << 2),	"10GBASE-LW"},
2847 	{0,		(1 << 1),	"10GBASE-EW"},
2848 	{0,		(1 << 0),	"Reserved"}
2849 };
2850 static struct mxge_media_type mxge_sfp_media_types[] =
2851 {
2852 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2853 	{0,		(1 << 7),	"Reserved"},
2854 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2855 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2856 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2857 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2858 };
2859 
2860 static void
2861 mxge_media_set(mxge_softc_t *sc, int media_type)
2862 {
2863 
2864 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2865 		    0, NULL);
2866 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2867 	sc->current_media = media_type;
2868 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2869 }
2870 
2871 static void
2872 mxge_media_init(mxge_softc_t *sc)
2873 {
2874 	char *ptr;
2875 	int i;
2876 
2877 	ifmedia_removeall(&sc->media);
2878 	mxge_media_set(sc, IFM_AUTO);
2879 
2880 	/*
2881 	 * parse the product code to determine the interface type
2882 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2883 	 * after the 3rd dash in the driver's cached copy of the
2884 	 * EEPROM's product code string.
2885 	 */
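	/*
	 * For example (hypothetical code string, editor's addition): in a
	 * product code like "10G-PCIE-8A-C", the character after the third
	 * dash is 'C', so the parser below selects CX4.
	 */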
2886 	ptr = sc->product_code_string;
2887 	if (ptr == NULL) {
2888 		device_printf(sc->dev, "Missing product code\n");
2889 		return;
2890 	}
2891 
2892 	for (i = 0; i < 3; i++, ptr++) {
2893 		ptr = strchr(ptr, '-');
2894 		if (ptr == NULL) {
2895 			device_printf(sc->dev,
2896 				      "only %d dashes in PC?!?\n", i);
2897 			return;
2898 		}
2899 	}
2900 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2901 		/* -C is CX4 */
2902 		sc->connector = MXGE_CX4;
2903 		mxge_media_set(sc, IFM_10G_CX4);
2904 	} else if (*ptr == 'Q') {
2905 		/* -Q is Quad Ribbon Fiber */
2906 		sc->connector = MXGE_QRF;
2907 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2908 		/* FreeBSD has no media type for Quad ribbon fiber */
2909 	} else if (*ptr == 'R') {
2910 		/* -R is XFP */
2911 		sc->connector = MXGE_XFP;
2912 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2913 		/* -S or -2S is SFP+ */
2914 		sc->connector = MXGE_SFP;
2915 	} else {
2916 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2917 	}
2918 }
2919 
2920 /*
2921  * Determine the media type for a NIC.  Some XFPs will identify
2922  * themselves only when their link is up, so this is initiated via a
2923  * link up interrupt.  However, this can potentially take up to
2924  * several milliseconds, so it is run via the watchdog routine, rather
2925  * than in the interrupt handler itself.
2926  */
2927 static void
2928 mxge_media_probe(mxge_softc_t *sc)
2929 {
2930 	mxge_cmd_t cmd;
2931 	char *cage_type;
2932 
2933 	struct mxge_media_type *mxge_media_types = NULL;
2934 	int i, err, ms, mxge_media_type_entries;
2935 	uint32_t byte;
2936 
2937 	sc->need_media_probe = 0;
2938 
2939 	if (sc->connector == MXGE_XFP) {
2940 		/* -R is XFP */
2941 		mxge_media_types = mxge_xfp_media_types;
2942 		mxge_media_type_entries =
2943 			nitems(mxge_xfp_media_types);
2944 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2945 		cage_type = "XFP";
2946 	} else if (sc->connector == MXGE_SFP) {
2947 		/* -S or -2S is SFP+ */
2948 		mxge_media_types = mxge_sfp_media_types;
2949 		mxge_media_type_entries =
2950 			nitems(mxge_sfp_media_types);
2951 		cage_type = "SFP+";
2952 		byte = 3;
2953 	} else {
2954 		/* nothing to do; media type cannot change */
2955 		return;
2956 	}
2957 
2958 	/*
2959 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2960 	 * now we try to determine what is in the cage by using the
2961 	 * firmware's I2C commands to read the module's 10GbE compliance
2962 	 * register.  We read just one byte, which may take over
2963 	 * a millisecond.
2964 	 */
2965 
2966 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2967 	cmd.data1 = byte;
2968 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2969 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2970 		device_printf(sc->dev, "failed to read XFP\n");
2971 	}
2972 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2973 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2974 	}
2975 	if (err != MXGEFW_CMD_OK) {
2976 		return;
2977 	}
2978 
2979 	/* now we wait for the data to be cached */
2980 	cmd.data0 = byte;
2981 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2982 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2983 		DELAY(1000);
2984 		cmd.data0 = byte;
2985 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2986 	}
2987 	if (err != MXGEFW_CMD_OK) {
2988 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2989 			      cage_type, err, ms);
2990 		return;
2991 	}
2992 
2993 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2994 		if (mxge_verbose)
2995 			device_printf(sc->dev, "%s:%s\n", cage_type,
2996 				      mxge_media_types[0].name);
2997 		if (sc->current_media != mxge_media_types[0].flag) {
2998 			mxge_media_init(sc);
2999 			mxge_media_set(sc, mxge_media_types[0].flag);
3000 		}
3001 		return;
3002 	}
3003 	for (i = 1; i < mxge_media_type_entries; i++) {
3004 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3005 			if (mxge_verbose)
3006 				device_printf(sc->dev, "%s:%s\n",
3007 					      cage_type,
3008 					      mxge_media_types[i].name);
3009 
3010 			if (sc->current_media != mxge_media_types[i].flag) {
3011 				mxge_media_init(sc);
3012 				mxge_media_set(sc, mxge_media_types[i].flag);
3013 			}
3014 			return;
3015 		}
3016 	}
3017 	if (mxge_verbose)
3018 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3019 			      cage_type, cmd.data0);
3020 
3021 	return;
3022 }
3023 
3024 static void
3025 mxge_intr(void *arg)
3026 {
3027 	struct mxge_slice_state *ss = arg;
3028 	mxge_softc_t *sc = ss->sc;
3029 	mcp_irq_data_t *stats = ss->fw_stats;
3030 	mxge_tx_ring_t *tx = &ss->tx;
3031 	mxge_rx_done_t *rx_done = &ss->rx_done;
3032 	uint32_t send_done_count;
3033 	uint8_t valid;
3034 
3035 #ifndef IFNET_BUF_RING
3036 	/* an interrupt on a non-zero slice is implicitly valid
3037 	   since MSI-X irqs are not shared */
3038 	if (ss != sc->ss) {
3039 		mxge_clean_rx_done(ss);
3040 		*ss->irq_claim = be32toh(3);
3041 		return;
3042 	}
3043 #endif
3044 
3045 	/* make sure the DMA has finished */
3046 	if (!stats->valid) {
3047 		return;
3048 	}
3049 	valid = stats->valid;
3050 
3051 	if (sc->legacy_irq) {
3052 		/* lower legacy IRQ  */
3053 		*sc->irq_deassert = 0;
3054 		if (!mxge_deassert_wait)
3055 			/* don't wait for conf. that irq is low */
3056 			stats->valid = 0;
3057 	} else {
3058 		stats->valid = 0;
3059 	}
3060 
3061 	/* loop while waiting for legacy irq deassertion */
3062 	do {
3063 		/* check for transmit completes and receives */
3064 		send_done_count = be32toh(stats->send_done_count);
3065 		while ((send_done_count != tx->pkt_done) ||
3066 		       (rx_done->entry[rx_done->idx].length != 0)) {
3067 			if (send_done_count != tx->pkt_done)
3068 				mxge_tx_done(ss, (int)send_done_count);
3069 			mxge_clean_rx_done(ss);
3070 			send_done_count = be32toh(stats->send_done_count);
3071 		}
3072 		if (sc->legacy_irq && mxge_deassert_wait)
3073 			wmb();
3074 	} while (*((volatile uint8_t *) &stats->valid));
3075 
3076 	/* fw link & error stats meaningful only on the first slice */
3077 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3078 		if (sc->link_state != stats->link_up) {
3079 			sc->link_state = stats->link_up;
3080 			if (sc->link_state) {
3081 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3082 				if (mxge_verbose)
3083 					device_printf(sc->dev, "link up\n");
3084 			} else {
3085 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3086 				if (mxge_verbose)
3087 					device_printf(sc->dev, "link down\n");
3088 			}
3089 			sc->need_media_probe = 1;
3090 		}
3091 		if (sc->rdma_tags_available !=
3092 		    be32toh(stats->rdma_tags_available)) {
3093 			sc->rdma_tags_available =
3094 				be32toh(stats->rdma_tags_available);
3095 			device_printf(sc->dev, "RDMA timed out! %d tags "
3096 				      "left\n", sc->rdma_tags_available);
3097 		}
3098 
3099 		if (stats->link_down) {
3100 			sc->down_cnt += stats->link_down;
3101 			sc->link_state = 0;
3102 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3103 		}
3104 	}
3105 
3106 	/* check to see if we have rx token to pass back */
3107 	if (valid & 0x1)
3108 	    *ss->irq_claim = be32toh(3);
3109 	*(ss->irq_claim + 1) = be32toh(3);
3110 }
3111 
3112 static void
3113 mxge_init(void *arg)
3114 {
3115 	mxge_softc_t *sc = arg;
3116 	struct ifnet *ifp = sc->ifp;
3117 
3118 	mtx_lock(&sc->driver_mtx);
3119 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3120 		(void) mxge_open(sc);
3121 	mtx_unlock(&sc->driver_mtx);
3122 }
3123 
3124 static void
3125 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3126 {
3127 	int i;
3128 
3129 #if defined(INET) || defined(INET6)
3130 	tcp_lro_free(&ss->lc);
3131 #endif
3132 	for (i = 0; i <= ss->rx_big.mask; i++) {
3133 		if (ss->rx_big.info[i].m == NULL)
3134 			continue;
3135 		bus_dmamap_unload(ss->rx_big.dmat,
3136 				  ss->rx_big.info[i].map);
3137 		m_freem(ss->rx_big.info[i].m);
3138 		ss->rx_big.info[i].m = NULL;
3139 	}
3140 
3141 	for (i = 0; i <= ss->rx_small.mask; i++) {
3142 		if (ss->rx_small.info[i].m == NULL)
3143 			continue;
3144 		bus_dmamap_unload(ss->rx_small.dmat,
3145 				  ss->rx_small.info[i].map);
3146 		m_freem(ss->rx_small.info[i].m);
3147 		ss->rx_small.info[i].m = NULL;
3148 	}
3149 
3150 	/* transmit ring used only on the first slice */
3151 	if (ss->tx.info == NULL)
3152 		return;
3153 
3154 	for (i = 0; i <= ss->tx.mask; i++) {
3155 		ss->tx.info[i].flag = 0;
3156 		if (ss->tx.info[i].m == NULL)
3157 			continue;
3158 		bus_dmamap_unload(ss->tx.dmat,
3159 				  ss->tx.info[i].map);
3160 		m_freem(ss->tx.info[i].m);
3161 		ss->tx.info[i].m = NULL;
3162 	}
3163 }
3164 
3165 static void
3166 mxge_free_mbufs(mxge_softc_t *sc)
3167 {
3168 	int slice;
3169 
3170 	for (slice = 0; slice < sc->num_slices; slice++)
3171 		mxge_free_slice_mbufs(&sc->ss[slice]);
3172 }
3173 
3174 static void
3175 mxge_free_slice_rings(struct mxge_slice_state *ss)
3176 {
3177 	int i;
3178 
3179 	if (ss->rx_done.entry != NULL)
3180 		mxge_dma_free(&ss->rx_done.dma);
3181 	ss->rx_done.entry = NULL;
3182 
3183 	if (ss->tx.req_bytes != NULL)
3184 		free(ss->tx.req_bytes, M_DEVBUF);
3185 	ss->tx.req_bytes = NULL;
3186 
3187 	if (ss->tx.seg_list != NULL)
3188 		free(ss->tx.seg_list, M_DEVBUF);
3189 	ss->tx.seg_list = NULL;
3190 
3191 	if (ss->rx_small.shadow != NULL)
3192 		free(ss->rx_small.shadow, M_DEVBUF);
3193 	ss->rx_small.shadow = NULL;
3194 
3195 	if (ss->rx_big.shadow != NULL)
3196 		free(ss->rx_big.shadow, M_DEVBUF);
3197 	ss->rx_big.shadow = NULL;
3198 
3199 	if (ss->tx.info != NULL) {
3200 		if (ss->tx.dmat != NULL) {
3201 			for (i = 0; i <= ss->tx.mask; i++) {
3202 				bus_dmamap_destroy(ss->tx.dmat,
3203 						   ss->tx.info[i].map);
3204 			}
3205 			bus_dma_tag_destroy(ss->tx.dmat);
3206 		}
3207 		free(ss->tx.info, M_DEVBUF);
3208 	}
3209 	ss->tx.info = NULL;
3210 
3211 	if (ss->rx_small.info != NULL) {
3212 		if (ss->rx_small.dmat != NULL) {
3213 			for (i = 0; i <= ss->rx_small.mask; i++) {
3214 				bus_dmamap_destroy(ss->rx_small.dmat,
3215 						   ss->rx_small.info[i].map);
3216 			}
3217 			bus_dmamap_destroy(ss->rx_small.dmat,
3218 					   ss->rx_small.extra_map);
3219 			bus_dma_tag_destroy(ss->rx_small.dmat);
3220 		}
3221 		free(ss->rx_small.info, M_DEVBUF);
3222 	}
3223 	ss->rx_small.info = NULL;
3224 
3225 	if (ss->rx_big.info != NULL) {
3226 		if (ss->rx_big.dmat != NULL) {
3227 			for (i = 0; i <= ss->rx_big.mask; i++) {
3228 				bus_dmamap_destroy(ss->rx_big.dmat,
3229 						   ss->rx_big.info[i].map);
3230 			}
3231 			bus_dmamap_destroy(ss->rx_big.dmat,
3232 					   ss->rx_big.extra_map);
3233 			bus_dma_tag_destroy(ss->rx_big.dmat);
3234 		}
3235 		free(ss->rx_big.info, M_DEVBUF);
3236 	}
3237 	ss->rx_big.info = NULL;
3238 }
3239 
3240 static void
3241 mxge_free_rings(mxge_softc_t *sc)
3242 {
3243 	int slice;
3244 
3245 	for (slice = 0; slice < sc->num_slices; slice++)
3246 		mxge_free_slice_rings(&sc->ss[slice]);
3247 }
3248 
3249 static int
3250 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3251 		       int tx_ring_entries)
3252 {
3253 	mxge_softc_t *sc = ss->sc;
3254 	size_t bytes;
3255 	int err, i;
3256 
3257 	/* allocate per-slice receive resources */
3258 
3259 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3260 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3261 
3262 	/* allocate the rx shadow rings */
3263 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3264 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3265 
3266 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3267 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3268 
3269 	/* allocate the rx host info rings */
3270 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3271 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3272 
3273 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3274 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3275 
3276 	/* allocate the rx busdma resources */
3277 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3278 				 1,			/* alignment */
3279 				 4096,			/* boundary */
3280 				 BUS_SPACE_MAXADDR,	/* low */
3281 				 BUS_SPACE_MAXADDR,	/* high */
3282 				 NULL, NULL,		/* filter */
3283 				 MHLEN,			/* maxsize */
3284 				 1,			/* num segs */
3285 				 MHLEN,			/* maxsegsize */
3286 				 BUS_DMA_ALLOCNOW,	/* flags */
3287 				 NULL, NULL,		/* lock */
3288 				 &ss->rx_small.dmat);	/* tag */
3289 	if (err != 0) {
3290 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3291 			      err);
3292 		return err;
3293 	}
3294 
3295 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3296 				 1,			/* alignment */
3297 #if MXGE_VIRT_JUMBOS
3298 				 4096,			/* boundary */
3299 #else
3300 				 0,			/* boundary */
3301 #endif
3302 				 BUS_SPACE_MAXADDR,	/* low */
3303 				 BUS_SPACE_MAXADDR,	/* high */
3304 				 NULL, NULL,		/* filter */
3305 				 3*4096,		/* maxsize */
3306 #if MXGE_VIRT_JUMBOS
3307 				 3,			/* num segs */
3308 				 4096,			/* maxsegsize*/
3309 #else
3310 				 1,			/* num segs */
3311 				 MJUM9BYTES,		/* maxsegsize*/
3312 #endif
3313 				 BUS_DMA_ALLOCNOW,	/* flags */
3314 				 NULL, NULL,		/* lock */
3315 				 &ss->rx_big.dmat);	/* tag */
3316 	if (err != 0) {
3317 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3318 			      err);
3319 		return err;
3320 	}
3321 	for (i = 0; i <= ss->rx_small.mask; i++) {
3322 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3323 					&ss->rx_small.info[i].map);
3324 		if (err != 0) {
3325 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3326 				      err);
3327 			return err;
3328 		}
3329 	}
3330 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3331 				&ss->rx_small.extra_map);
3332 	if (err != 0) {
3333 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3334 			      err);
3335 		return err;
3336 	}
3337 
3338 	for (i = 0; i <= ss->rx_big.mask; i++) {
3339 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3340 					&ss->rx_big.info[i].map);
3341 		if (err != 0) {
3342 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3343 				      err);
3344 			return err;
3345 		}
3346 	}
3347 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3348 				&ss->rx_big.extra_map);
3349 	if (err != 0) {
3350 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3351 			      err);
3352 		return err;
3353 	}
3354 
3355 	/* now allocate TX resources */
3356 
3357 #ifndef IFNET_BUF_RING
3358 	/* only use a single TX ring for now */
3359 	if (ss != ss->sc->ss)
3360 		return 0;
3361 #endif
3362 
3363 	ss->tx.mask = tx_ring_entries - 1;
3364 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3365 
3366 	/* allocate the tx request copy block */
3367 	bytes = 8 +
3368 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3369 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3370 	/* ensure req_list entries are aligned to 8 bytes */
3371 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3372 		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
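	/* e.g. if req_bytes ends in 0x5, req_list is rounded up to end in
	   0x8 -- editor's illustration of the (x + 7) & ~7 round-up above */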
3373 
3374 	/* allocate the tx busdma segment list */
3375 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3376 	ss->tx.seg_list = (bus_dma_segment_t *)
3377 		malloc(bytes, M_DEVBUF, M_WAITOK);
3378 
3379 	/* allocate the tx host info ring */
3380 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3381 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3382 
3383 	/* allocate the tx busdma resources */
3384 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3385 				 1,			/* alignment */
3386 				 sc->tx_boundary,	/* boundary */
3387 				 BUS_SPACE_MAXADDR,	/* low */
3388 				 BUS_SPACE_MAXADDR,	/* high */
3389 				 NULL, NULL,		/* filter */
3390 				 65536 + 256,		/* maxsize */
3391 				 ss->tx.max_desc - 2,	/* num segs */
3392 				 sc->tx_boundary,	/* maxsegsz */
3393 				 BUS_DMA_ALLOCNOW,	/* flags */
3394 				 NULL, NULL,		/* lock */
3395 				 &ss->tx.dmat);		/* tag */
3396 
3397 	if (err != 0) {
3398 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3399 			      err);
3400 		return err;
3401 	}
3402 
3403 	/* now use these tags to set up dmamaps for each slot
3404 	   in the ring */
3405 	for (i = 0; i <= ss->tx.mask; i++) {
3406 		err = bus_dmamap_create(ss->tx.dmat, 0,
3407 					&ss->tx.info[i].map);
3408 		if (err != 0) {
3409 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3410 				      err);
3411 			return err;
3412 		}
3413 	}
3414 	return 0;
3416 }
3417 
3418 static int
3419 mxge_alloc_rings(mxge_softc_t *sc)
3420 {
3421 	mxge_cmd_t cmd;
3422 	int tx_ring_size;
3423 	int tx_ring_entries, rx_ring_entries;
3424 	int err, slice;
3425 
3426 	/* get ring sizes */
3427 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3428 	tx_ring_size = cmd.data0;
3429 	if (err != 0) {
3430 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3431 		goto abort;
3432 	}
3433 
3434 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3435 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3436 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3437 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3438 	IFQ_SET_READY(&sc->ifp->if_snd);
3439 
3440 	for (slice = 0; slice < sc->num_slices; slice++) {
3441 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3442 					     rx_ring_entries,
3443 					     tx_ring_entries);
3444 		if (err != 0)
3445 			goto abort;
3446 	}
3447 	return 0;
3448 
3449 abort:
3450 	mxge_free_rings(sc);
3451 	return err;
3453 }
3454 
3455 static void
3456 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3457 {
3458 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3459 
3460 	if (bufsize < MCLBYTES) {
3461 		/* easy, everything fits in a single buffer */
3462 		*big_buf_size = MCLBYTES;
3463 		*cl_size = MCLBYTES;
3464 		*nbufs = 1;
3465 		return;
3466 	}
3467 
3468 	if (bufsize < MJUMPAGESIZE) {
3469 		/* still easy, everything still fits in a single buffer */
3470 		*big_buf_size = MJUMPAGESIZE;
3471 		*cl_size = MJUMPAGESIZE;
3472 		*nbufs = 1;
3473 		return;
3474 	}
3475 #if MXGE_VIRT_JUMBOS
3476 	/* now we need to use virtually contiguous buffers */
3477 	*cl_size = MJUM9BYTES;
3478 	*big_buf_size = 4096;
3479 	*nbufs = mtu / 4096 + 1;
3480 	/* needs to be a power of two, so round up */
3481 	if (*nbufs == 3)
3482 		*nbufs = 4;
3483 #else
3484 	*cl_size = MJUM9BYTES;
3485 	*big_buf_size = MJUM9BYTES;
3486 	*nbufs = 1;
3487 #endif
3488 }
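
/*
 * Editor's worked example for mxge_choose_params() (standard MTUs, not
 * from the original source): a 1500-byte MTU needs 1500 + 14 + 4 + 2 =
 * 1520 bytes, which fits a 2KB (MCLBYTES) cluster; a 9000-byte jumbo MTU
 * needs 9020 bytes and, without MXGE_VIRT_JUMBOS, lands in a single
 * MJUM9BYTES (9KB) cluster.
 */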
3489 
3490 static int
3491 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3492 {
3493 	mxge_softc_t *sc;
3494 	mxge_cmd_t cmd;
3495 	bus_dmamap_t map;
3496 	int err, i, slice;
3497 
3498 	sc = ss->sc;
3499 	slice = ss - sc->ss;
3500 
3501 #if defined(INET) || defined(INET6)
3502 	(void)tcp_lro_init(&ss->lc);
3503 #endif
3504 	ss->lc.ifp = sc->ifp;
3505 
3506 	/* get the lanai pointers to the send and receive rings */
3507 
3508 	err = 0;
3509 #ifndef IFNET_BUF_RING
3510 	/* We currently only send from the first slice */
3511 	if (slice == 0) {
3512 #endif
3513 		cmd.data0 = slice;
3514 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3515 		ss->tx.lanai =
3516 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3517 		ss->tx.send_go = (volatile uint32_t *)
3518 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3519 		ss->tx.send_stop = (volatile uint32_t *)
3520 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3521 #ifndef IFNET_BUF_RING
3522 	}
3523 #endif
3524 	cmd.data0 = slice;
3525 	err |= mxge_send_cmd(sc,
3526 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3527 	ss->rx_small.lanai =
3528 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3529 	cmd.data0 = slice;
3530 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3531 	ss->rx_big.lanai =
3532 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3533 
3534 	if (err != 0) {
3535 		device_printf(sc->dev,
3536 			      "failed to get ring sizes or locations\n");
3537 		return EIO;
3538 	}
3539 
3540 	/* stock receive rings */
3541 	for (i = 0; i <= ss->rx_small.mask; i++) {
3542 		map = ss->rx_small.info[i].map;
3543 		err = mxge_get_buf_small(ss, map, i);
3544 		if (err) {
3545 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3546 				      i, ss->rx_small.mask + 1);
3547 			return ENOMEM;
3548 		}
3549 	}
3550 	for (i = 0; i <= ss->rx_big.mask; i++) {
3551 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3552 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3553 	}
3554 	ss->rx_big.nbufs = nbufs;
3555 	ss->rx_big.cl_size = cl_size;
3556 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3557 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3558 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3559 		map = ss->rx_big.info[i].map;
3560 		err = mxge_get_buf_big(ss, map, i);
3561 		if (err) {
3562 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3563 				      i, ss->rx_big.mask + 1);
3564 			return ENOMEM;
3565 		}
3566 	}
3567 	return 0;
3568 }
3569 
3570 static int
3571 mxge_open(mxge_softc_t *sc)
3572 {
3573 	mxge_cmd_t cmd;
3574 	int err, big_bytes, nbufs, slice, cl_size, i;
3575 	bus_addr_t bus;
3576 	volatile uint8_t *itable;
3577 	struct mxge_slice_state *ss;
3578 
3579 	/* Copy the MAC address in case it was overridden */
3580 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3581 
3582 	err = mxge_reset(sc, 1);
3583 	if (err != 0) {
3584 		device_printf(sc->dev, "failed to reset\n");
3585 		return EIO;
3586 	}
3587 
3588 	if (sc->num_slices > 1) {
3589 		/* setup the indirection table */
3590 		cmd.data0 = sc->num_slices;
3591 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3592 				    &cmd);
3593 
3594 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3595 				     &cmd);
3596 		if (err != 0) {
3597 			device_printf(sc->dev,
3598 				      "failed to setup rss tables\n");
3599 			return err;
3600 		}
3601 
3602 		/* just enable an identity mapping */
3603 		itable = sc->sram + cmd.data0;
3604 		for (i = 0; i < sc->num_slices; i++)
3605 			itable[i] = (uint8_t)i;
3606 
3607 		cmd.data0 = 1;
3608 		cmd.data1 = mxge_rss_hash_type;
3609 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3610 		if (err != 0) {
3611 			device_printf(sc->dev, "failed to enable slices\n");
3612 			return err;
3613 		}
3614 	}
3615 
3616 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3617 
3618 	cmd.data0 = nbufs;
3619 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3620 			    &cmd);
3621 	/* error is only meaningful if we're trying to set
3622 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3623 	if (err && nbufs > 1) {
3624 		device_printf(sc->dev,
3625 			      "Failed to set always-use-n to %d\n",
3626 			      nbufs);
3627 		return EIO;
3628 	}
3629 	/* Give the firmware the mtu and the big and small buffer
3630 	   sizes.  The firmware wants the big buf size to be a power
3631 	   of two. Luckily, FreeBSD's clusters are powers of two */
3632 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3633 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3634 	cmd.data0 = MHLEN - MXGEFW_PAD;
3635 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3636 			     &cmd);
3637 	cmd.data0 = big_bytes;
3638 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3639 
3640 	if (err != 0) {
3641 		device_printf(sc->dev, "failed to setup params\n");
3642 		goto abort;
3643 	}
3644 
3645 	/* Now give the firmware the pointer to the stats block */
3646 	for (slice = 0;
3647 #ifdef IFNET_BUF_RING
3648 	     slice < sc->num_slices;
3649 #else
3650 	     slice < 1;
3651 #endif
3652 	     slice++) {
3653 		ss = &sc->ss[slice];
3654 		cmd.data0 =
3655 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3656 		cmd.data1 =
3657 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3658 		cmd.data2 = sizeof(struct mcp_irq_data);
3659 		cmd.data2 |= (slice << 16);
3660 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3661 	}
3662 
3663 	if (err != 0) {
3664 		bus = sc->ss->fw_stats_dma.bus_addr;
3665 		bus += offsetof(struct mcp_irq_data, send_done_count);
3666 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3667 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3668 		err = mxge_send_cmd(sc,
3669 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3670 				    &cmd);
3671 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3672 		sc->fw_multicast_support = 0;
3673 	} else {
3674 		sc->fw_multicast_support = 1;
3675 	}
3676 
3677 	if (err != 0) {
3678 		device_printf(sc->dev, "failed to setup params\n");
3679 		goto abort;
3680 	}
3681 
3682 	for (slice = 0; slice < sc->num_slices; slice++) {
3683 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3684 		if (err != 0) {
3685 			device_printf(sc->dev, "couldn't open slice %d\n",
3686 				      slice);
3687 			goto abort;
3688 		}
3689 	}
3690 
3691 	/* Finally, start the firmware running */
3692 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3693 	if (err) {
3694 		device_printf(sc->dev, "Couldn't bring up link\n");
3695 		goto abort;
3696 	}
3697 #ifdef IFNET_BUF_RING
3698 	for (slice = 0; slice < sc->num_slices; slice++) {
3699 		ss = &sc->ss[slice];
3700 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3701 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3702 	}
3703 #endif
3704 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3705 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3706 
3707 	return 0;
3708 
3709 abort:
3710 	mxge_free_mbufs(sc);
3711 
3712 	return err;
3713 }
3714 
3715 static int
3716 mxge_close(mxge_softc_t *sc, int down)
3717 {
3718 	mxge_cmd_t cmd;
3719 	int err, old_down_cnt;
3720 #ifdef IFNET_BUF_RING
3721 	struct mxge_slice_state *ss;
3722 	int slice;
3723 #endif
3724 
3725 #ifdef IFNET_BUF_RING
3726 	for (slice = 0; slice < sc->num_slices; slice++) {
3727 		ss = &sc->ss[slice];
3728 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3729 	}
3730 #endif
3731 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
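	/*
	 * Ask the firmware to bring the link down, then wait up to ten
	 * interrupt-coalescing intervals for the "down" interrupt to
	 * bump sc->down_cnt before giving up.
	 */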
3732 	if (!down) {
3733 		old_down_cnt = sc->down_cnt;
3734 		wmb();
3735 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3736 		if (err) {
3737 			device_printf(sc->dev,
3738 				      "Couldn't bring down link\n");
3739 		}
3740 		if (old_down_cnt == sc->down_cnt) {
3741 			/* wait for down irq */
3742 			DELAY(10 * sc->intr_coal_delay);
3743 		}
3744 		wmb();
3745 		if (old_down_cnt == sc->down_cnt) {
3746 			device_printf(sc->dev, "never got down irq\n");
3747 		}
3748 	}
3749 	mxge_free_mbufs(sc);
3750 
3751 	return 0;
3752 }
3753 
3754 static void
3755 mxge_setup_cfg_space(mxge_softc_t *sc)
3756 {
3757 	device_t dev = sc->dev;
3758 	int reg;
3759 	uint16_t lnk, pectl;
3760 
3761 	/* find the PCIe link width and set max read request to 4KB */
3762 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3763 		lnk = pci_read_config(dev, reg + 0x12, 2);
3764 		sc->link_width = (lnk >> 4) & 0x3f;
3765 
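		/*
		 * Offset 0x8 into the PCIe capability is the Device
		 * Control register; bits 14:12 encode the max read
		 * request size, and the value 5 selects 4096 bytes.
		 */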
3766 		if (sc->pectl == 0) {
3767 			pectl = pci_read_config(dev, reg + 0x8, 2);
3768 			pectl = (pectl & ~0x7000) | (5 << 12);
3769 			pci_write_config(dev, reg + 0x8, pectl, 2);
3770 			sc->pectl = pectl;
3771 		} else {
3772 			/* restore saved pectl after watchdog reset */
3773 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3774 		}
3775 	}
3776 
3777 	/* Enable DMA and Memory space access */
3778 	pci_enable_busmaster(dev);
3779 }
3780 
3781 static uint32_t
3782 mxge_read_reboot(mxge_softc_t *sc)
3783 {
3784 	device_t dev = sc->dev;
3785 	uint32_t vs;
3786 
3787 	/* find the vendor specific offset */
3788 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3789 		device_printf(sc->dev,
3790 			      "could not find vendor specific offset\n");
3791 		return (uint32_t)-1;
3792 	}
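	/*
	 * The vendor-specific capability provides a window into NIC
	 * internal registers; 0xfffffff0 is (apparently) the address
	 * of the reboot-status register.
	 */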
3793 	/* enable read32 mode */
3794 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3795 	/* tell NIC which register to read */
3796 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3797 	return (pci_read_config(dev, vs + 0x14, 4));
3798 }
3799 
3800 static void
3801 mxge_watchdog_reset(mxge_softc_t *sc)
3802 {
3803 	struct pci_devinfo *dinfo;
3804 	struct mxge_slice_state *ss;
3805 	int err, running, s, num_tx_slices = 1;
3806 	uint32_t reboot;
3807 	uint16_t cmd;
3808 
3809 	err = ENXIO;
3810 
3811 	device_printf(sc->dev, "Watchdog reset!\n");
3812 
3813 	/*
3814 	 * check to see if the NIC rebooted.  If it did, then all of
3815 	 * PCI config space has been reset, and things like the
3816 	 * busmaster bit will be zero.  If this is the case, then we
3817 	 * must restore PCI config space before the NIC can be used
3818 	 * again
3819 	 */
3820 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3821 	if (cmd == 0xffff) {
3822 		/*
3823 		 * maybe the watchdog caught the NIC rebooting; wait
3824 		 * up to 100ms for it to finish.  If it does not come
3825 		 * back, then give up
3826 		 */
3827 		DELAY(1000*100);
3828 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3829 		if (cmd == 0xffff) {
3830 			device_printf(sc->dev, "NIC disappeared!\n");
3831 		}
3832 	}
3833 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3834 		/* print the reboot status */
3835 		reboot = mxge_read_reboot(sc);
3836 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3837 			      reboot);
3838 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3839 		if (running) {
3840 			/*
3841 			 * quiesce NIC so that TX routines will not try to
3842 			 * xmit after restoration of BAR
3843 			 */
3844 
3845 			/* Mark the link as down */
3846 			if (sc->link_state) {
3847 				sc->link_state = 0;
3848 				if_link_state_change(sc->ifp,
3849 						     LINK_STATE_DOWN);
3850 			}
3851 #ifdef IFNET_BUF_RING
3852 			num_tx_slices = sc->num_slices;
3853 #endif
3854 			/* grab all TX locks to ensure no tx is in flight */
3855 			for (s = 0; s < num_tx_slices; s++) {
3856 				ss = &sc->ss[s];
3857 				mtx_lock(&ss->tx.mtx);
3858 			}
3859 			mxge_close(sc, 1);
3860 		}
3861 		/* restore PCI configuration space */
3862 		dinfo = device_get_ivars(sc->dev);
3863 		pci_cfg_restore(sc->dev, dinfo);
3864 
3865 		/* and redo any changes we made to our config space */
3866 		mxge_setup_cfg_space(sc);
3867 
3868 		/* reload f/w */
3869 		err = mxge_load_firmware(sc, 0);
3870 		if (err) {
3871 			device_printf(sc->dev,
3872 				      "Unable to re-load f/w\n");
3873 		}
3874 		if (running) {
3875 			if (!err)
3876 				err = mxge_open(sc);
3877 			/* release all TX locks */
3878 			for (s = 0; s < num_tx_slices; s++) {
3879 				ss = &sc->ss[s];
3880 #ifdef IFNET_BUF_RING
3881 				mxge_start_locked(ss);
3882 #endif
3883 				mtx_unlock(&ss->tx.mtx);
3884 			}
3885 		}
3886 		sc->watchdog_resets++;
3887 	} else {
3888 		device_printf(sc->dev,
3889 			      "NIC did not reboot, not resetting\n");
3890 		err = 0;
3891 	}
3892 	if (err) {
3893 		device_printf(sc->dev, "watchdog reset failed\n");
3894 	} else {
3895 		if (sc->dying == 2)
3896 			sc->dying = 0;
3897 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3898 	}
3899 }
3900 
3901 static void
3902 mxge_watchdog_task(void *arg, int pending)
3903 {
3904 	mxge_softc_t *sc = arg;
3905 
3906 	mtx_lock(&sc->driver_mtx);
3907 	mxge_watchdog_reset(sc);
3908 	mtx_unlock(&sc->driver_mtx);
3909 }
3910 
3911 static void
3912 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3913 {
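	/* the tx argument is overwritten; the ring is re-derived from
	   the slice index */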
3914 	tx = &sc->ss[slice].tx;
3915 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3916 	device_printf(sc->dev,
3917 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3918 		      tx->req, tx->done, tx->queue_active);
3919 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3920 			      tx->activate, tx->deactivate);
3921 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3922 		      tx->pkt_done,
3923 		      be32toh(sc->ss->fw_stats->send_done_count));
3924 }
3925 
3926 static int
3927 mxge_watchdog(mxge_softc_t *sc)
3928 {
3929 	mxge_tx_ring_t *tx;
3930 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3931 	int i, err = 0;
3932 
3933 	/* see if we have outstanding transmits, which
3934 	   have been pending for more than mxge_ticks */
3935 	for (i = 0;
3936 #ifdef IFNET_BUF_RING
3937 	     (i < sc->num_slices) && (err == 0);
3938 #else
3939 	     (i < 1) && (err == 0);
3940 #endif
3941 	     i++) {
3942 		tx = &sc->ss[i].tx;
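		/*
		 * A ring is considered stuck if transmits are
		 * outstanding now, were already outstanding at the
		 * last tick, and no completions have arrived in
		 * between.
		 */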
3943 		if (tx->req != tx->done &&
3944 		    tx->watchdog_req != tx->watchdog_done &&
3945 		    tx->done == tx->watchdog_done) {
3946 			/* check for pause blocking before resetting */
3947 			if (tx->watchdog_rx_pause == rx_pause) {
3948 				mxge_warn_stuck(sc, tx, i);
3949 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3950 				return (ENXIO);
3951 			}
3952 			else
3953 				device_printf(sc->dev, "Flow control blocking "
3954 					      "xmits, check link partner\n");
3955 		}
3956 
3957 		tx->watchdog_req = tx->req;
3958 		tx->watchdog_done = tx->done;
3959 		tx->watchdog_rx_pause = rx_pause;
3960 	}
3961 
3962 	if (sc->need_media_probe)
3963 		mxge_media_probe(sc);
3964 	return (err);
3965 }
3966 
3967 static uint64_t
3968 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
3969 {
3970 	struct mxge_softc *sc;
3971 	uint64_t rv;
3972 
3973 	sc = if_getsoftc(ifp);
3974 	rv = 0;
3975 
3976 	switch (cnt) {
3977 	case IFCOUNTER_IPACKETS:
3978 		for (int s = 0; s < sc->num_slices; s++)
3979 			rv += sc->ss[s].ipackets;
3980 		return (rv);
3981 	case IFCOUNTER_OPACKETS:
3982 		for (int s = 0; s < sc->num_slices; s++)
3983 			rv += sc->ss[s].opackets;
3984 		return (rv);
3985 	case IFCOUNTER_OERRORS:
3986 		for (int s = 0; s < sc->num_slices; s++)
3987 			rv += sc->ss[s].oerrors;
3988 		return (rv);
3989 #ifdef IFNET_BUF_RING
3990 	case IFCOUNTER_OBYTES:
3991 		for (int s = 0; s < sc->num_slices; s++)
3992 			rv += sc->ss[s].obytes;
3993 		return (rv);
3994 	case IFCOUNTER_OMCASTS:
3995 		for (int s = 0; s < sc->num_slices; s++)
3996 			rv += sc->ss[s].omcasts;
3997 		return (rv);
3998 	case IFCOUNTER_OQDROPS:
3999 		for (int s = 0; s < sc->num_slices; s++)
4000 			rv += sc->ss[s].tx.br->br_drops;
4001 		return (rv);
4002 #endif
4003 	default:
4004 		return (if_get_counter_default(ifp, cnt));
4005 	}
4006 }
4007 
4008 static void
4009 mxge_tick(void *arg)
4010 {
4011 	mxge_softc_t *sc = arg;
4012 	u_long pkts = 0;
4013 	int err = 0;
4014 	int running, ticks;
4015 	uint16_t cmd;
4016 
4017 	ticks = mxge_ticks;
4018 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4019 	if (running) {
4020 		if (!sc->watchdog_countdown) {
4021 			err = mxge_watchdog(sc);
4022 			sc->watchdog_countdown = 4;
4023 		}
4024 		sc->watchdog_countdown--;
4025 	}
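	/*
	 * pkts is never updated above, so this idle check (and the 4x
	 * slower polling below) currently always applies.
	 */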
4026 	if (pkts == 0) {
4027 		/* ensure NIC did not suffer h/w fault while idle */
4028 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4029 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4030 			sc->dying = 2;
4031 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4032 			err = ENXIO;
4033 		}
4034 		/* look less often if NIC is idle */
4035 		ticks *= 4;
4036 	}
4037 
4038 	if (err == 0)
4039 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4040 
4041 }
4042 
4043 static int
4044 mxge_media_change(struct ifnet *ifp)
4045 {
4046 	return EINVAL;
4047 }
4048 
4049 static int
4050 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4051 {
4052 	struct ifnet *ifp = sc->ifp;
4053 	int real_mtu, old_mtu;
4054 	int err = 0;
4055 
4056 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4057 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4058 		return EINVAL;
4059 	mtx_lock(&sc->driver_mtx);
4060 	old_mtu = ifp->if_mtu;
4061 	ifp->if_mtu = mtu;
4062 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
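		/*
		 * Changing the MTU requires a full close/open cycle;
		 * if reopening with the new MTU fails, fall back to
		 * the old MTU.
		 */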
4063 		mxge_close(sc, 0);
4064 		err = mxge_open(sc);
4065 		if (err != 0) {
4066 			ifp->if_mtu = old_mtu;
4067 			mxge_close(sc, 0);
4068 			(void) mxge_open(sc);
4069 		}
4070 	}
4071 	mtx_unlock(&sc->driver_mtx);
4072 	return err;
4073 }
4074 
4075 static void
4076 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4077 {
4078 	mxge_softc_t *sc = ifp->if_softc;
4079 
4080 	if (sc == NULL)
4081 		return;
4082 	ifmr->ifm_status = IFM_AVALID;
4083 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4084 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4085 	ifmr->ifm_active |= sc->current_media;
4086 }
4087 
4088 static int
4089 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4090 {
4091 	mxge_cmd_t cmd;
4092 	uint32_t i2c_args;
4093 	int i, ms, err;
4094 
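	/*
	 * 0xA0 is the standard SFP/XFP module EEPROM address and 0xA2
	 * the SFF-8472 diagnostics page; anything else is rejected.
	 */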
4095 	if (i2c->dev_addr != 0xA0 &&
4096 	    i2c->dev_addr != 0xA2)
4097 		return (EINVAL);
4098 	if (i2c->len > sizeof(i2c->data))
4099 		return (EINVAL);
4100 
4101 	for (i = 0; i < i2c->len; i++) {
4102 		i2c_args = i2c->dev_addr << 0x8;
4103 		i2c_args |= i2c->offset + i;
4104 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4105 		cmd.data1 = i2c_args;
4106 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4107 
4108 		if (err != MXGEFW_CMD_OK)
4109 			return (EIO);
4110 		/* now we wait for the data to be cached */
4111 		cmd.data0 = i2c_args & 0xff;
4112 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4113 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4114 			cmd.data0 = i2c_args & 0xff;
4115 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4116 			if (err == EBUSY)
4117 				DELAY(1000);
4118 		}
4119 		if (err != MXGEFW_CMD_OK)
4120 			return (EIO);
4121 		i2c->data[i] = cmd.data0;
4122 	}
4123 	return (0);
4124 }
4125 
4126 static int
4127 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4128 {
4129 	mxge_softc_t *sc = ifp->if_softc;
4130 	struct ifreq *ifr = (struct ifreq *)data;
4131 	struct ifi2creq i2c;
4132 	int err, mask;
4133 
4134 	err = 0;
4135 	switch (command) {
4136 	case SIOCSIFMTU:
4137 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4138 		break;
4139 
4140 	case SIOCSIFFLAGS:
4141 		mtx_lock(&sc->driver_mtx);
4142 		if (sc->dying) {
4143 			mtx_unlock(&sc->driver_mtx);
4144 			return EINVAL;
4145 		}
4146 		if (ifp->if_flags & IFF_UP) {
4147 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4148 				err = mxge_open(sc);
4149 			} else {
4150 				/* take care of promisc and allmulti
4151 				   flag changes */
4152 				mxge_change_promisc(sc,
4153 						    ifp->if_flags & IFF_PROMISC);
4154 				mxge_set_multicast_list(sc);
4155 			}
4156 		} else {
4157 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4158 				mxge_close(sc, 0);
4159 			}
4160 		}
4161 		mtx_unlock(&sc->driver_mtx);
4162 		break;
4163 
4164 	case SIOCADDMULTI:
4165 	case SIOCDELMULTI:
4166 		mtx_lock(&sc->driver_mtx);
4167 		if (sc->dying) {
4168 			mtx_unlock(&sc->driver_mtx);
4169 			return (EINVAL);
4170 		}
4171 		mxge_set_multicast_list(sc);
4172 		mtx_unlock(&sc->driver_mtx);
4173 		break;
4174 
4175 	case SIOCSIFCAP:
4176 		mtx_lock(&sc->driver_mtx);
4177 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4178 		if (mask & IFCAP_TXCSUM) {
4179 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4180 				mask &= ~IFCAP_TSO4;
4181 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4182 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4183 			} else {
4184 				ifp->if_capenable |= IFCAP_TXCSUM;
4185 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4186 			}
4187 		}
4188 		if (mask & IFCAP_RXCSUM) {
4189 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4190 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4191 			} else {
4192 				ifp->if_capenable |= IFCAP_RXCSUM;
4193 			}
4194 		}
4195 		if (mask & IFCAP_TSO4) {
4196 			if (IFCAP_TSO4 & ifp->if_capenable) {
4197 				ifp->if_capenable &= ~IFCAP_TSO4;
4198 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4199 				ifp->if_capenable |= IFCAP_TSO4;
4200 				ifp->if_hwassist |= CSUM_TSO;
4201 			} else {
4202 				printf("mxge requires tx checksum offload"
4203 				       " be enabled to use TSO\n");
4204 				err = EINVAL;
4205 			}
4206 		}
4207 #if IFCAP_TSO6
4208 		if (mask & IFCAP_TXCSUM_IPV6) {
4209 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4210 				mask &= ~IFCAP_TSO6;
4211 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4212 						       | IFCAP_TSO6);
4213 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4214 						      | CSUM_UDP);
4215 			} else {
4216 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4217 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4218 						     | CSUM_UDP_IPV6);
4219 			}
4220 		}
4221 		if (mask & IFCAP_RXCSUM_IPV6) {
4222 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4223 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4224 			} else {
4225 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4226 			}
4227 		}
4228 		if (mask & IFCAP_TSO6) {
4229 			if (IFCAP_TSO6 & ifp->if_capenable) {
4230 				ifp->if_capenable &= ~IFCAP_TSO6;
4231 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4232 				ifp->if_capenable |= IFCAP_TSO6;
4233 				ifp->if_hwassist |= CSUM_TSO;
4234 			} else {
4235 				printf("mxge requires tx checksum offload"
4236 				       " be enabled to use TSO\n");
4237 				err = EINVAL;
4238 			}
4239 		}
4240 #endif /*IFCAP_TSO6 */
4241 
4242 		if (mask & IFCAP_LRO)
4243 			ifp->if_capenable ^= IFCAP_LRO;
4244 		if (mask & IFCAP_VLAN_HWTAGGING)
4245 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4246 		if (mask & IFCAP_VLAN_HWTSO)
4247 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4248 
4249 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4250 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4251 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4252 
4253 		mtx_unlock(&sc->driver_mtx);
4254 		VLAN_CAPABILITIES(ifp);
4255 
4256 		break;
4257 
4258 	case SIOCGIFMEDIA:
4259 		mtx_lock(&sc->driver_mtx);
4260 		if (sc->dying) {
4261 			mtx_unlock(&sc->driver_mtx);
4262 			return (EINVAL);
4263 		}
4264 		mxge_media_probe(sc);
4265 		mtx_unlock(&sc->driver_mtx);
4266 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4267 				    &sc->media, command);
4268 		break;
4269 
4270 	case SIOCGI2C:
4271 		if (sc->connector != MXGE_XFP &&
4272 		    sc->connector != MXGE_SFP) {
4273 			err = ENXIO;
4274 			break;
4275 		}
4276 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4277 		if (err != 0)
4278 			break;
4279 		mtx_lock(&sc->driver_mtx);
4280 		if (sc->dying) {
4281 			mtx_unlock(&sc->driver_mtx);
4282 			return (EINVAL);
4283 		}
4284 		err = mxge_fetch_i2c(sc, &i2c);
4285 		mtx_unlock(&sc->driver_mtx);
4286 		if (err == 0)
4287 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4288 			    sizeof(i2c));
4289 		break;
4290 	default:
4291 		err = ether_ioctl(ifp, command, data);
4292 		break;
4293 	}
4294 	return err;
4295 }
4296 
4297 static void
4298 mxge_fetch_tunables(mxge_softc_t *sc)
4299 {
4300 
4301 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4302 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4303 			  &mxge_flow_control);
4304 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4305 			  &mxge_intr_coal_delay);
4306 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4307 			  &mxge_nvidia_ecrc_enable);
4308 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4309 			  &mxge_force_firmware);
4310 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4311 			  &mxge_deassert_wait);
4312 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4313 			  &mxge_verbose);
4314 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4315 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4316 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
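	/* also accept the alternate spelling of the hash-type tunable
	   (presumably kept for backwards compatibility) */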
4317 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4318 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4319 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4320 
4321 	if (bootverbose)
4322 		mxge_verbose = 1;
4323 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4324 		mxge_intr_coal_delay = 30;
4325 	if (mxge_ticks == 0)
4326 		mxge_ticks = hz / 2;
4327 	sc->pause = mxge_flow_control;
4328 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4329 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4330 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4331 	}
4332 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4333 	    mxge_initial_mtu < ETHER_MIN_LEN)
4334 		mxge_initial_mtu = ETHERMTU_JUMBO;
4335 
4336 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4337 		mxge_throttle = MXGE_MAX_THROTTLE;
4338 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4339 		mxge_throttle = MXGE_MIN_THROTTLE;
4340 	sc->throttle = mxge_throttle;
4341 }
4342 
4343 static void
4344 mxge_free_slices(mxge_softc_t *sc)
4345 {
4346 	struct mxge_slice_state *ss;
4347 	int i;
4348 
4349 	if (sc->ss == NULL)
4350 		return;
4351 
4352 	for (i = 0; i < sc->num_slices; i++) {
4353 		ss = &sc->ss[i];
4354 		if (ss->fw_stats != NULL) {
4355 			mxge_dma_free(&ss->fw_stats_dma);
4356 			ss->fw_stats = NULL;
4357 #ifdef IFNET_BUF_RING
4358 			if (ss->tx.br != NULL) {
4359 				drbr_free(ss->tx.br, M_DEVBUF);
4360 				ss->tx.br = NULL;
4361 			}
4362 #endif
4363 			mtx_destroy(&ss->tx.mtx);
4364 		}
4365 		if (ss->rx_done.entry != NULL) {
4366 			mxge_dma_free(&ss->rx_done.dma);
4367 			ss->rx_done.entry = NULL;
4368 		}
4369 	}
4370 	free(sc->ss, M_DEVBUF);
4371 	sc->ss = NULL;
4372 }
4373 
4374 static int
4375 mxge_alloc_slices(mxge_softc_t *sc)
4376 {
4377 	mxge_cmd_t cmd;
4378 	struct mxge_slice_state *ss;
4379 	size_t bytes;
4380 	int err, i, max_intr_slots;
4381 
4382 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4383 	if (err != 0) {
4384 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4385 		return err;
4386 	}
4387 	sc->rx_ring_size = cmd.data0;
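	/*
	 * Size the interrupt queue to hold a completion slot for every
	 * receive buffer; the factor of two presumably covers the
	 * small and big rx rings.
	 */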
4388 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4389 
4390 	bytes = sizeof (*sc->ss) * sc->num_slices;
4391 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4392 	if (sc->ss == NULL)
4393 		return (ENOMEM);
4394 	for (i = 0; i < sc->num_slices; i++) {
4395 		ss = &sc->ss[i];
4396 
4397 		ss->sc = sc;
4398 
4399 		/* allocate per-slice rx interrupt queues */
4400 
4401 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4402 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4403 		if (err != 0)
4404 			goto abort;
4405 		ss->rx_done.entry = ss->rx_done.dma.addr;
4406 		bzero(ss->rx_done.entry, bytes);
4407 
4408 		/*
4409 		 * allocate the per-slice firmware stats; stats
4410 		 * (including tx) are used only on the first
4411 		 * slice for now
4412 		 */
4413 #ifndef IFNET_BUF_RING
4414 		if (i > 0)
4415 			continue;
4416 #endif
4417 
4418 		bytes = sizeof (*ss->fw_stats);
4419 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4420 				     sizeof (*ss->fw_stats), 64);
4421 		if (err != 0)
4422 			goto abort;
4423 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4424 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4425 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4426 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4427 #ifdef IFNET_BUF_RING
4428 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4429 					   &ss->tx.mtx);
4430 #endif
4431 	}
4432 
4433 	return (0);
4434 
4435 abort:
4436 	mxge_free_slices(sc);
4437 	return (ENOMEM);
4438 }
4439 
4440 static void
4441 mxge_slice_probe(mxge_softc_t *sc)
4442 {
4443 	mxge_cmd_t cmd;
4444 	char *old_fw;
4445 	int msix_cnt, status, max_intr_slots;
4446 
4447 	sc->num_slices = 1;
4448 	/*
4449 	 *  don't enable multiple slices if the max_slices tunable
4450 	 *  disables them, or if this is not an SMP system
4451 	 */
4452 
4453 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4454 		return;
4455 
4456 	/* see how many MSI-X interrupts are available */
4457 	msix_cnt = pci_msix_count(sc->dev);
4458 	if (msix_cnt < 2)
4459 		return;
4460 
4461 	/* now load the slice-aware firmware and see what it supports */
4462 	old_fw = sc->fw_name;
4463 	if (old_fw == mxge_fw_aligned)
4464 		sc->fw_name = mxge_fw_rss_aligned;
4465 	else
4466 		sc->fw_name = mxge_fw_rss_unaligned;
4467 	status = mxge_load_firmware(sc, 0);
4468 	if (status != 0) {
4469 		device_printf(sc->dev, "Falling back to a single slice\n");
4470 		return;
4471 	}
4472 
4473 	/* try to send a reset command to the card to see if it
4474 	   is alive */
4475 	memset(&cmd, 0, sizeof (cmd));
4476 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4477 	if (status != 0) {
4478 		device_printf(sc->dev, "failed reset\n");
4479 		goto abort_with_fw;
4480 	}
4481 
4482 	/* get rx ring size */
4483 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4484 	if (status != 0) {
4485 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4486 		goto abort_with_fw;
4487 	}
4488 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4489 
4490 	/* tell it the size of the interrupt queues */
4491 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4492 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4493 	if (status != 0) {
4494 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4495 		goto abort_with_fw;
4496 	}
4497 
4498 	/* ask the firmware how many slices it supports */
4499 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4500 	if (status != 0) {
4501 		device_printf(sc->dev,
4502 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4503 		goto abort_with_fw;
4504 	}
4505 	sc->num_slices = cmd.data0;
4506 	if (sc->num_slices > msix_cnt)
4507 		sc->num_slices = msix_cnt;
4508 
4509 	if (mxge_max_slices == -1) {
4510 		/* cap to number of CPUs in system */
4511 		if (sc->num_slices > mp_ncpus)
4512 			sc->num_slices = mp_ncpus;
4513 	} else {
4514 		if (sc->num_slices > mxge_max_slices)
4515 			sc->num_slices = mxge_max_slices;
4516 	}
4517 	/* make sure it is a power of two */
4518 	while (sc->num_slices & (sc->num_slices - 1))
4519 		sc->num_slices--;
4520 
4521 	if (mxge_verbose)
4522 		device_printf(sc->dev, "using %d slices\n",
4523 			      sc->num_slices);
4524 
4525 	return;
4526 
4527 abort_with_fw:
4528 	sc->fw_name = old_fw;
4529 	(void) mxge_load_firmware(sc, 0);
4530 }
4531 
4532 static int
4533 mxge_add_msix_irqs(mxge_softc_t *sc)
4534 {
4535 	size_t bytes;
4536 	int count, err, i, rid;
4537 
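	/* the MSI-X table lives behind BAR(2) on this NIC; map it
	   before enabling MSI-X */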
4538 	rid = PCIR_BAR(2);
4539 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4540 						    &rid, RF_ACTIVE);
4541 
4542 	if (sc->msix_table_res == NULL) {
4543 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4544 		return ENXIO;
4545 	}
4546 
4547 	count = sc->num_slices;
4548 	err = pci_alloc_msix(sc->dev, &count);
4549 	if (err != 0) {
4550 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4551 			      "err = %d\n", sc->num_slices, err);
4552 		goto abort_with_msix_table;
4553 	}
4554 	if (count < sc->num_slices) {
4555 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4556 			      sc->num_slices, count);
4557 		device_printf(sc->dev,
4558 			      "Try setting hw.mxge.max_slices to %d\n",
4559 			      count);
4560 		err = ENOSPC;
4561 		goto abort_with_msix;
4562 	}
4563 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4564 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4565 	if (sc->msix_irq_res == NULL) {
4566 		err = ENOMEM;
4567 		goto abort_with_msix;
4568 	}
4569 
4570 	for (i = 0; i < sc->num_slices; i++) {
4571 		rid = i + 1;
4572 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4573 							  SYS_RES_IRQ,
4574 							  &rid, RF_ACTIVE);
4575 		if (sc->msix_irq_res[i] == NULL) {
4576 			device_printf(sc->dev, "couldn't allocate IRQ res"
4577 				      " for message %d\n", i);
4578 			err = ENXIO;
4579 			goto abort_with_res;
4580 		}
4581 	}
4582 
4583 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4584 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4585 
4586 	for (i = 0; i < sc->num_slices; i++) {
4587 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4588 				     INTR_TYPE_NET | INTR_MPSAFE,
4589 #if __FreeBSD_version > 700030
4590 				     NULL,
4591 #endif
4592 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4593 		if (err != 0) {
4594 			device_printf(sc->dev, "couldn't setup intr for "
4595 				      "message %d\n", i);
4596 			goto abort_with_intr;
4597 		}
4598 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4599 				  sc->msix_ih[i], "s%d", i);
4600 	}
4601 
4602 	if (mxge_verbose) {
4603 		device_printf(sc->dev, "using %d msix IRQs:",
4604 			      sc->num_slices);
4605 		for (i = 0; i < sc->num_slices; i++)
4606 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4607 		printf("\n");
4608 	}
4609 	return (0);
4610 
4611 abort_with_intr:
4612 	for (i = 0; i < sc->num_slices; i++) {
4613 		if (sc->msix_ih[i] != NULL) {
4614 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4615 					  sc->msix_ih[i]);
4616 			sc->msix_ih[i] = NULL;
4617 		}
4618 	}
4619 	free(sc->msix_ih, M_DEVBUF);
4620 
4621 abort_with_res:
4622 	for (i = 0; i < sc->num_slices; i++) {
4623 		rid = i + 1;
4624 		if (sc->msix_irq_res[i] != NULL)
4625 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4626 					     sc->msix_irq_res[i]);
4627 		sc->msix_irq_res[i] = NULL;
4628 	}
4629 	free(sc->msix_irq_res, M_DEVBUF);
4630 
4631 abort_with_msix:
4632 	pci_release_msi(sc->dev);
4633 
4634 abort_with_msix_table:
4635 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4636 			     sc->msix_table_res);
4637 
4638 	return err;
4639 }
4640 
4641 static int
4642 mxge_add_single_irq(mxge_softc_t *sc)
4643 {
4644 	int count, err, rid;
4645 
4646 	count = pci_msi_count(sc->dev);
4647 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4648 		rid = 1;
4649 	} else {
4650 		rid = 0;
4651 		sc->legacy_irq = 1;
4652 	}
4653 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4654 					     RF_SHAREABLE | RF_ACTIVE);
4655 	if (sc->irq_res == NULL) {
4656 		device_printf(sc->dev, "could not alloc interrupt\n");
4657 		return ENXIO;
4658 	}
4659 	if (mxge_verbose)
4660 		device_printf(sc->dev, "using %s irq %jd\n",
4661 			      sc->legacy_irq ? "INTx" : "MSI",
4662 			      rman_get_start(sc->irq_res));
4663 	err = bus_setup_intr(sc->dev, sc->irq_res,
4664 			     INTR_TYPE_NET | INTR_MPSAFE,
4665 #if __FreeBSD_version > 700030
4666 			     NULL,
4667 #endif
4668 			     mxge_intr, &sc->ss[0], &sc->ih);
4669 	if (err != 0) {
4670 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4671 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4672 		if (!sc->legacy_irq)
4673 			pci_release_msi(sc->dev);
4674 	}
4675 	return err;
4676 }
4677 
4678 static void
4679 mxge_rem_msix_irqs(mxge_softc_t *sc)
4680 {
4681 	int i, rid;
4682 
4683 	for (i = 0; i < sc->num_slices; i++) {
4684 		if (sc->msix_ih[i] != NULL) {
4685 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4686 					  sc->msix_ih[i]);
4687 			sc->msix_ih[i] = NULL;
4688 		}
4689 	}
4690 	free(sc->msix_ih, M_DEVBUF);
4691 
4692 	for (i = 0; i < sc->num_slices; i++) {
4693 		rid = i + 1;
4694 		if (sc->msix_irq_res[i] != NULL)
4695 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4696 					     sc->msix_irq_res[i]);
4697 		sc->msix_irq_res[i] = NULL;
4698 	}
4699 	free(sc->msix_irq_res, M_DEVBUF);
4700 
4701 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4702 			     sc->msix_table_res);
4703 
4704 	pci_release_msi(sc->dev);
4705 	return;
4706 }
4707 
4708 static void
4709 mxge_rem_single_irq(mxge_softc_t *sc)
4710 {
4711 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4712 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4713 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4714 	if (!sc->legacy_irq)
4715 		pci_release_msi(sc->dev);
4716 }
4717 
4718 static void
4719 mxge_rem_irq(mxge_softc_t *sc)
4720 {
4721 	if (sc->num_slices > 1)
4722 		mxge_rem_msix_irqs(sc);
4723 	else
4724 		mxge_rem_single_irq(sc);
4725 }
4726 
4727 static int
4728 mxge_add_irq(mxge_softc_t *sc)
4729 {
4730 	int err;
4731 
4732 	if (sc->num_slices > 1)
4733 		err = mxge_add_msix_irqs(sc);
4734 	else
4735 		err = mxge_add_single_irq(sc);
4736 
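	/* dead code: the "0 &&" guard keeps this MSI-X re-add path
	   permanently disabled */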
4737 	if (0 && err == 0 && sc->num_slices > 1) {
4738 		mxge_rem_msix_irqs(sc);
4739 		err = mxge_add_msix_irqs(sc);
4740 	}
4741 	return err;
4742 }
4743 
4744 static int
4745 mxge_attach(device_t dev)
4746 {
4747 	mxge_cmd_t cmd;
4748 	mxge_softc_t *sc = device_get_softc(dev);
4749 	struct ifnet *ifp;
4750 	int err, rid;
4751 
4752 	sc->dev = dev;
4753 	mxge_fetch_tunables(sc);
4754 
4755 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4756 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4757 				  taskqueue_thread_enqueue, &sc->tq);
4758 	if (sc->tq == NULL) {
4759 		err = ENOMEM;
4760 		goto abort_with_nothing;
4761 	}
4762 
4763 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4764 				 1,			/* alignment */
4765 				 0,			/* boundary */
4766 				 BUS_SPACE_MAXADDR,	/* low */
4767 				 BUS_SPACE_MAXADDR,	/* high */
4768 				 NULL, NULL,		/* filter */
4769 				 65536 + 256,		/* maxsize */
4770 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4771 				 65536,			/* maxsegsize */
4772 				 0,			/* flags */
4773 				 NULL, NULL,		/* lock */
4774 				 &sc->parent_dmat);	/* tag */
4775 
4776 	if (err != 0) {
4777 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4778 			      err);
4779 		goto abort_with_tq;
4780 	}
4781 
4782 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4783 	if (ifp == NULL) {
4784 		device_printf(dev, "can not if_alloc()\n");
4785 		err = ENOSPC;
4786 		goto abort_with_parent_dmat;
4787 	}
4788 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4789 
4790 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4791 		 device_get_nameunit(dev));
4792 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4793 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4794 		 "%s:drv", device_get_nameunit(dev));
4795 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4796 		 MTX_NETWORK_LOCK, MTX_DEF);
4797 
4798 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4799 
4800 	mxge_setup_cfg_space(sc);
4801 
4802 	/* Map the board into the kernel */
4803 	rid = PCIR_BARS;
4804 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4805 					     RF_ACTIVE);
4806 	if (sc->mem_res == NULL) {
4807 		device_printf(dev, "could not map memory\n");
4808 		err = ENXIO;
4809 		goto abort_with_lock;
4810 	}
4811 	sc->sram = rman_get_virtual(sc->mem_res);
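	/*
	 * 2MB of SRAM, minus regions at the top which are (apparently)
	 * reserved for firmware use, minus a 0x100 byte pad.
	 */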
4812 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4813 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4814 		device_printf(dev, "impossible memory region size %jd\n",
4815 			      rman_get_size(sc->mem_res));
4816 		err = ENXIO;
4817 		goto abort_with_mem_res;
4818 	}
4819 
4820 	/* make a NUL-terminated copy of the EEPROM strings section of
4821 	   the Lanai SRAM */
4822 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4823 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4824 				rman_get_bushandle(sc->mem_res),
4825 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4826 				sc->eeprom_strings,
4827 				MXGE_EEPROM_STRINGS_SIZE - 2);
4828 	err = mxge_parse_strings(sc);
4829 	if (err != 0)
4830 		goto abort_with_mem_res;
4831 
4832 	/* Enable write combining for efficient use of the PCIe bus */
4833 	mxge_enable_wc(sc);
4834 
4835 	/* Allocate the out-of-band DMA memory */
4836 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4837 			     sizeof (mxge_cmd_t), 64);
4838 	if (err != 0)
4839 		goto abort_with_mem_res;
4840 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4841 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4842 	if (err != 0)
4843 		goto abort_with_cmd_dma;
4844 
4845 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4846 	if (err != 0)
4847 		goto abort_with_zeropad_dma;
4848 
4849 	/* select & load the firmware */
4850 	err = mxge_select_firmware(sc);
4851 	if (err != 0)
4852 		goto abort_with_dmabench;
4853 	sc->intr_coal_delay = mxge_intr_coal_delay;
4854 
4855 	mxge_slice_probe(sc);
4856 	err = mxge_alloc_slices(sc);
4857 	if (err != 0)
4858 		goto abort_with_dmabench;
4859 
4860 	err = mxge_reset(sc, 0);
4861 	if (err != 0)
4862 		goto abort_with_slices;
4863 
4864 	err = mxge_alloc_rings(sc);
4865 	if (err != 0) {
4866 		device_printf(sc->dev, "failed to allocate rings\n");
4867 		goto abort_with_slices;
4868 	}
4869 
4870 	err = mxge_add_irq(sc);
4871 	if (err != 0) {
4872 		device_printf(sc->dev, "failed to add irq\n");
4873 		goto abort_with_rings;
4874 	}
4875 
4876 	ifp->if_baudrate = IF_Gbps(10);
4877 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4878 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4879 		IFCAP_RXCSUM_IPV6;
4880 #if defined(INET) || defined(INET6)
4881 	ifp->if_capabilities |= IFCAP_LRO;
4882 #endif
4883 
4884 #ifdef MXGE_NEW_VLAN_API
4885 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4886 
4887 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4888 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4889 	    sc->fw_ver_tiny >= 32)
4890 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4891 #endif
4892 	sc->max_mtu = mxge_max_mtu(sc);
4893 	if (sc->max_mtu >= 9000)
4894 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4895 	else
4896 		device_printf(dev, "MTU limited to %d.  Install "
4897 			      "latest firmware for 9000 byte jumbo support\n",
4898 			      sc->max_mtu - ETHER_HDR_LEN);
4899 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4900 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4901 	/* check to see if f/w supports TSO for IPv6 */
4902 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
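		/* CSUM_TCP_IPV6 is a compile-time constant; this only
		   checks that it is defined non-zero */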
4903 		if (CSUM_TCP_IPV6)
4904 			ifp->if_capabilities |= IFCAP_TSO6;
4905 		sc->max_tso6_hlen = min(cmd.data0,
4906 					sizeof (sc->ss[0].scratch));
4907 	}
4908 	ifp->if_capenable = ifp->if_capabilities;
4909 	if (sc->lro_cnt == 0)
4910 		ifp->if_capenable &= ~IFCAP_LRO;
4911 	ifp->if_init = mxge_init;
4912 	ifp->if_softc = sc;
4913 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4914 	ifp->if_ioctl = mxge_ioctl;
4915 	ifp->if_start = mxge_start;
4916 	ifp->if_get_counter = mxge_get_counter;
4917 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4918 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4919 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4920 	/* Initialise the ifmedia structure */
4921 	ifmedia_init(&sc->media, 0, mxge_media_change,
4922 		     mxge_media_status);
4923 	mxge_media_init(sc);
4924 	mxge_media_probe(sc);
4925 	sc->dying = 0;
4926 	ether_ifattach(ifp, sc->mac_addr);
4927 	/* ether_ifattach sets mtu to ETHERMTU */
4928 	if (mxge_initial_mtu != ETHERMTU)
4929 		mxge_change_mtu(sc, mxge_initial_mtu);
4930 
4931 	mxge_add_sysctls(sc);
4932 #ifdef IFNET_BUF_RING
4933 	ifp->if_transmit = mxge_transmit;
4934 	ifp->if_qflush = mxge_qflush;
4935 #endif
4936 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4937 				device_get_nameunit(sc->dev));
4938 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4939 	return 0;
4940 
4941 abort_with_rings:
4942 	mxge_free_rings(sc);
4943 abort_with_slices:
4944 	mxge_free_slices(sc);
4945 abort_with_dmabench:
4946 	mxge_dma_free(&sc->dmabench_dma);
4947 abort_with_zeropad_dma:
4948 	mxge_dma_free(&sc->zeropad_dma);
4949 abort_with_cmd_dma:
4950 	mxge_dma_free(&sc->cmd_dma);
4951 abort_with_mem_res:
4952 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4953 abort_with_lock:
4954 	pci_disable_busmaster(dev);
4955 	mtx_destroy(&sc->cmd_mtx);
4956 	mtx_destroy(&sc->driver_mtx);
4957 	if_free(ifp);
4958 abort_with_parent_dmat:
4959 	bus_dma_tag_destroy(sc->parent_dmat);
4960 abort_with_tq:
4961 	if (sc->tq != NULL) {
4962 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4963 		taskqueue_free(sc->tq);
4964 		sc->tq = NULL;
4965 	}
4966 abort_with_nothing:
4967 	return err;
4968 }
4969 
4970 static int
4971 mxge_detach(device_t dev)
4972 {
4973 	mxge_softc_t *sc = device_get_softc(dev);
4974 
4975 	if (mxge_vlans_active(sc)) {
4976 		device_printf(sc->dev,
4977 			      "Detach vlans before removing module\n");
4978 		return EBUSY;
4979 	}
4980 	mtx_lock(&sc->driver_mtx);
4981 	sc->dying = 1;
4982 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4983 		mxge_close(sc, 0);
4984 	mtx_unlock(&sc->driver_mtx);
4985 	ether_ifdetach(sc->ifp);
4986 	if (sc->tq != NULL) {
4987 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4988 		taskqueue_free(sc->tq);
4989 		sc->tq = NULL;
4990 	}
4991 	callout_drain(&sc->co_hdl);
4992 	ifmedia_removeall(&sc->media);
4993 	mxge_dummy_rdma(sc, 0);
4994 	mxge_rem_sysctls(sc);
4995 	mxge_rem_irq(sc);
4996 	mxge_free_rings(sc);
4997 	mxge_free_slices(sc);
4998 	mxge_dma_free(&sc->dmabench_dma);
4999 	mxge_dma_free(&sc->zeropad_dma);
5000 	mxge_dma_free(&sc->cmd_dma);
5001 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5002 	pci_disable_busmaster(dev);
5003 	mtx_destroy(&sc->cmd_mtx);
5004 	mtx_destroy(&sc->driver_mtx);
5005 	if_free(sc->ifp);
5006 	bus_dma_tag_destroy(sc->parent_dmat);
5007 	return 0;
5008 }
5009 
5010 static int
5011 mxge_shutdown(device_t dev)
5012 {
5013 	return 0;
5014 }
5015 
5016 /*
5017   This file uses Myri10GE driver indentation.
5018 
5019   Local Variables:
5020   c-file-style:"linux"
5021   tab-width:8
5022   End:
5023 */
5024