1 /******************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 
4 Copyright (c) 2006-2013, Myricom Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/sockio.h>
40 #include <sys/mbuf.h>
41 #include <sys/malloc.h>
42 #include <sys/kdb.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/module.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/sx.h>
49 #include <sys/taskqueue.h>
50 #include <contrib/zlib/zlib.h>
51 #include <dev/zlib/zcalloc.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 #include <net/if_arp.h>
56 #include <net/ethernet.h>
57 #include <net/if_dl.h>
58 #include <net/if_media.h>
59 
60 #include <net/bpf.h>
61 
62 #include <net/if_types.h>
63 #include <net/if_vlan_var.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip6.h>
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_lro.h>
71 #include <netinet6/ip6_var.h>
72 
73 #include <machine/bus.h>
74 #include <machine/in_cksum.h>
75 #include <machine/resource.h>
76 #include <sys/bus.h>
77 #include <sys/rman.h>
78 #include <sys/smp.h>
79 
80 #include <dev/pci/pcireg.h>
81 #include <dev/pci/pcivar.h>
82 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
83 
84 #include <vm/vm.h>		/* for pmap_mapdev() */
85 #include <vm/pmap.h>
86 
87 #if defined(__i386) || defined(__amd64)
88 #include <machine/specialreg.h>
89 #endif
90 
91 #include <dev/mxge/mxge_mcp.h>
92 #include <dev/mxge/mcp_gen_header.h>
93 /*#define MXGE_FAKE_IFP*/
94 #include <dev/mxge/if_mxge_var.h>
95 #ifdef IFNET_BUF_RING
96 #include <sys/buf_ring.h>
97 #endif
98 
99 #include "opt_inet.h"
100 #include "opt_inet6.h"
101 
102 /* tunable params */
103 static int mxge_nvidia_ecrc_enable = 1;
104 static int mxge_force_firmware = 0;
105 static int mxge_intr_coal_delay = 30;
106 static int mxge_deassert_wait = 1;
107 static int mxge_flow_control = 1;
108 static int mxge_verbose = 0;
109 static int mxge_ticks;
110 static int mxge_max_slices = 1;
111 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
112 static int mxge_always_promisc = 0;
113 static int mxge_initial_mtu = ETHERMTU_JUMBO;
114 static int mxge_throttle = 0;
115 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
116 static char *mxge_fw_aligned = "mxge_eth_z8e";
117 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
118 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
119 
120 static int mxge_probe(device_t dev);
121 static int mxge_attach(device_t dev);
122 static int mxge_detach(device_t dev);
123 static int mxge_shutdown(device_t dev);
124 static void mxge_intr(void *arg);
125 
126 static device_method_t mxge_methods[] =
127 {
128   /* Device interface */
129   DEVMETHOD(device_probe, mxge_probe),
130   DEVMETHOD(device_attach, mxge_attach),
131   DEVMETHOD(device_detach, mxge_detach),
132   DEVMETHOD(device_shutdown, mxge_shutdown),
133 
134   DEVMETHOD_END
135 };
136 
137 static driver_t mxge_driver =
138 {
139   "mxge",
140   mxge_methods,
141   sizeof(mxge_softc_t),
142 };
143 
144 static devclass_t mxge_devclass;
145 
146 /* Declare ourselves to be a child of the PCI bus.*/
147 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
148 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
149 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
150 
151 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
152 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
153 static int mxge_close(mxge_softc_t *sc, int down);
154 static int mxge_open(mxge_softc_t *sc);
155 static void mxge_tick(void *arg);
156 
157 static int
158 mxge_probe(device_t dev)
159 {
160 	int rev;
161 
162 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
163 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
164 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
165 		rev = pci_get_revid(dev);
166 		switch (rev) {
167 		case MXGE_PCI_REV_Z8E:
168 			device_set_desc(dev, "Myri10G-PCIE-8A");
169 			break;
170 		case MXGE_PCI_REV_Z8ES:
171 			device_set_desc(dev, "Myri10G-PCIE-8B");
172 			break;
173 		default:
174 			device_set_desc(dev, "Myri10G-PCIE-8??");
175 			device_printf(dev, "Unrecognized rev %d NIC\n",
176 				      rev);
177 			break;
178 		}
179 		return 0;
180 	}
181 	return ENXIO;
182 }
183 
184 static void
185 mxge_enable_wc(mxge_softc_t *sc)
186 {
187 #if defined(__i386) || defined(__amd64)
188 	vm_offset_t len;
189 	int err;
190 
191 	sc->wc = 1;
192 	len = rman_get_size(sc->mem_res);
193 	err = pmap_change_attr((vm_offset_t) sc->sram,
194 			       len, PAT_WRITE_COMBINING);
195 	if (err != 0) {
196 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
197 			      err);
198 		sc->wc = 0;
199 	}
200 #endif
201 }
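
/*
 * Editorial note: marking the SRAM write-combining lets the 64-byte
 * command blocks that mxge_pio_copy() writes below leave the CPU as a
 * few combined bursts instead of many small uncacheable stores; the
 * wmb() at each copy site flushes the combining buffers so the NIC
 * sees complete commands.
 */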
202 
203 /* callback to get our DMA address */
204 static void
205 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
206 			 int error)
207 {
208 	if (error == 0) {
209 		*(bus_addr_t *) arg = segs->ds_addr;
210 	}
211 }
212 
213 static int
214 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
215 		   bus_size_t alignment)
216 {
217 	int err;
218 	device_t dev = sc->dev;
219 	bus_size_t boundary, maxsegsize;
220 
221 	if (bytes > 4096 && alignment == 4096) {
222 		boundary = 0;
223 		maxsegsize = bytes;
224 	} else {
225 		boundary = 4096;
226 		maxsegsize = 4096;
227 	}
228 
229 	/* allocate DMAable memory tags */
230 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
231 				 alignment,		/* alignment */
232 				 boundary,		/* boundary */
233 				 BUS_SPACE_MAXADDR,	/* low */
234 				 BUS_SPACE_MAXADDR,	/* high */
235 				 NULL, NULL,		/* filter */
236 				 bytes,			/* maxsize */
237 				 1,			/* num segs */
238 				 maxsegsize,		/* maxsegsize */
239 				 BUS_DMA_COHERENT,	/* flags */
240 				 NULL, NULL,		/* lock */
241 				 &dma->dmat);		/* tag */
242 	if (err != 0) {
243 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
244 		return err;
245 	}
246 
247 	/* allocate DMAable memory & map */
248 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
249 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
250 				| BUS_DMA_ZERO),  &dma->map);
251 	if (err != 0) {
252 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
253 		goto abort_with_dmat;
254 	}
255 
256 	/* load the memory */
257 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
258 			      mxge_dmamap_callback,
259 			      (void *)&dma->bus_addr, 0);
260 	if (err != 0) {
261 		device_printf(dev, "couldn't load map (err = %d)\n", err);
262 		goto abort_with_mem;
263 	}
264 	return 0;
265 
266 abort_with_mem:
267 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
268 abort_with_dmat:
269 	(void)bus_dma_tag_destroy(dma->dmat);
270 	return err;
271 }
272 
273 static void
274 mxge_dma_free(mxge_dma_t *dma)
275 {
276 	bus_dmamap_unload(dma->dmat, dma->map);
277 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
278 	(void)bus_dma_tag_destroy(dma->dmat);
279 }
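
/*
 * Editorial sketch (not from the original source): how the two helpers
 * above are typically paired.  The allocation path creates a tag,
 * allocates coherent memory, and loads it to learn the bus address;
 * the teardown path undoes all three.  The example_* name is
 * hypothetical.
 */
#if 0
static int
example_alloc_cmd_block(mxge_softc_t *sc)
{
	int err;

	/* one page, page-aligned, guaranteed single segment */
	err = mxge_dma_alloc(sc, &sc->cmd_dma, 4096, 4096);
	if (err != 0)
		return (err);
	/* sc->cmd_dma.addr is the KVA; sc->cmd_dma.bus_addr is what the
	   NIC gets (split via MXGE_LOWPART/HIGHPART_TO_U32) */
	sc->cmd = sc->cmd_dma.addr;
	return (0);
}
#endif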
280 
281 /*
282  * The eeprom strings on the lanaiX have the format
283  * SN=x\0
284  * MAC=x:x:x:x:x:x\0
285  * PC=text\0
286  */
287 
288 static int
289 mxge_parse_strings(mxge_softc_t *sc)
290 {
291 	char *ptr;
292 	int i, found_mac, found_sn2;
293 	char *endptr;
294 
295 	ptr = sc->eeprom_strings;
296 	found_mac = 0;
297 	found_sn2 = 0;
298 	while (*ptr != '\0') {
299 		if (strncmp(ptr, "MAC=", 4) == 0) {
300 			ptr += 4;
301 			for (i = 0;;) {
302 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
303 				if (endptr - ptr != 2)
304 					goto abort;
305 				ptr = endptr;
306 				if (++i == 6)
307 					break;
308 				if (*ptr++ != ':')
309 					goto abort;
310 			}
311 			found_mac = 1;
312 		} else if (strncmp(ptr, "PC=", 3) == 0) {
313 			ptr += 3;
314 			strlcpy(sc->product_code_string, ptr,
315 			    sizeof(sc->product_code_string));
316 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
317 			ptr += 3;
318 			strlcpy(sc->serial_number_string, ptr,
319 			    sizeof(sc->serial_number_string));
320 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
321 			/* SN2 takes precedence over SN */
322 			ptr += 4;
323 			found_sn2 = 1;
324 			strlcpy(sc->serial_number_string, ptr,
325 			    sizeof(sc->serial_number_string));
326 		}
327 		while (*ptr++ != '\0') {}
328 	}
329 
330 	if (found_mac)
331 		return 0;
332 
333  abort:
334 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
335 
336 	return ENXIO;
337 }
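
/*
 * Editorial example (values are hypothetical): a serial-EEPROM string
 * block as consumed by mxge_parse_strings() above.  Records are
 * NUL-terminated and the block ends with an empty string; SN2, when
 * present, overrides SN.
 */
#if 0
static const char example_eeprom_strings[] =
	"MAC=00:60:dd:47:ab:cd\0"
	"SN=123456\0"
	"PC=ExampleProductCode\0";
	/* the string literal's implicit final NUL supplies the empty
	   record that stops the parser */
#endif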
338 
339 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
340 static void
341 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
342 {
343 	uint32_t val;
344 	unsigned long base, off;
345 	char *va, *cfgptr;
346 	device_t pdev, mcp55;
347 	uint16_t vendor_id, device_id, word;
348 	uintptr_t bus, slot, func, ivend, idev;
349 	uint32_t *ptr32;
350 
351 	if (!mxge_nvidia_ecrc_enable)
352 		return;
353 
354 	pdev = device_get_parent(device_get_parent(sc->dev));
355 	if (pdev == NULL) {
356 		device_printf(sc->dev, "could not find parent?\n");
357 		return;
358 	}
359 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
360 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
361 
362 	if (vendor_id != 0x10de)
363 		return;
364 
365 	base = 0;
366 
367 	if (device_id == 0x005d) {
368 		/* ck804, base address is magic */
369 		base = 0xe0000000UL;
370 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
371 		/* mcp55, base address stored in chipset */
372 		mcp55 = pci_find_bsf(0, 0, 0);
373 		if (mcp55 &&
374 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
375 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
376 			word = pci_read_config(mcp55, 0x90, 2);
377 			base = ((unsigned long)word & 0x7ffeU) << 25;
378 		}
379 	}
380 	if (!base)
381 		return;
382 
383 	/* XXXX
384 	   The test below is commented out because it is believed that a
385 	   config read/write beyond 0xff will access the config space of
386 	   the next larger function.  Uncomment this, and remove the hacky
387 	   pmap_mapdev() way of accessing config space, when FreeBSD grows
388 	   support for extended PCIe config space access.
389 	*/
390 #if 0
391 	/* See if we can, by some miracle, access the extended
392 	   config space */
393 	val = pci_read_config(pdev, 0x178, 4);
394 	if (val != 0xffffffff) {
395 		val |= 0x40;
396 		pci_write_config(pdev, 0x178, val, 4);
397 		return;
398 	}
399 #endif
400 	/* Rather than using normal pci config space writes, we must
401 	 * map the Nvidia config space ourselves.  This is because on
402 	 * Opteron/Nvidia class machines the 0xe0000000 mapping is
403 	 * handled by the Nvidia chipset; that means the internal PCI
404 	 * device (the on-chip northbridge) or the amd-8131 bridge,
405 	 * and anything behind them, is not visible via this method.
406 	 */
407 
408 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
409 		      PCI_IVAR_BUS, &bus);
410 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
411 		      PCI_IVAR_SLOT, &slot);
412 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
413 		      PCI_IVAR_FUNCTION, &func);
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_VENDOR, &ivend);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_DEVICE, &idev);
418 
419 	off =  base
420 		+ 0x00100000UL * (unsigned long)bus
421 		+ 0x00001000UL * (unsigned long)(func
422 						 + 8 * slot);
423 
424 	/* map it into the kernel */
425 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
426 
427 	if (va == NULL) {
428 		device_printf(sc->dev, "pmap_mapdev failed\n");
429 		return;
430 	}
431 	/* get a pointer to the config space mapped into the kernel */
432 	cfgptr = va + (off & PAGE_MASK);
433 
434 	/* make sure that we can really access it */
435 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
436 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
437 	if (! (vendor_id == ivend && device_id == idev)) {
438 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
439 			      vendor_id, device_id);
440 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
441 		return;
442 	}
443 
444 	ptr32 = (uint32_t*)(cfgptr + 0x178);
445 	val = *ptr32;
446 
447 	if (val == 0xffffffff) {
448 		device_printf(sc->dev, "extended mapping failed\n");
449 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
450 		return;
451 	}
452 	*ptr32 = val | 0x40;
453 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
454 	if (mxge_verbose)
455 		device_printf(sc->dev,
456 			      "Enabled ECRC on upstream Nvidia bridge "
457 			      "at %d:%d:%d\n",
458 			      (int)bus, (int)slot, (int)func);
459 	return;
460 }
461 #else
462 static void
463 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
464 {
465 	device_printf(sc->dev,
466 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
467 	return;
468 }
469 #endif
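
/*
 * Editorial worked example for the address computation above: the
 * layout is the standard extended-config (ECAM) one, 1MB per bus and
 * 4KB per (slot, function).  With base = 0xe0000000, bus = 5, slot = 9,
 * func = 0:
 *
 *   off = 0xe0000000 + 5 * 0x00100000 + (0 + 8 * 9) * 0x00001000
 *       = 0xe0548000
 *
 * The page containing that offset is mapped, and dword 0x178 within
 * the function's config space is the register the code ORs 0x40 into.
 */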
470 
471 static int
472 mxge_dma_test(mxge_softc_t *sc, int test_type)
473 {
474 	mxge_cmd_t cmd;
475 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
476 	int status;
477 	uint32_t len;
478 	char *test = " ";
479 
480 	/* Run a small DMA test.
481 	 * The magic multipliers to the length tell the firmware
482 	 * to do DMA read, write, or read+write tests.  The
483 	 * results are returned in cmd.data0.  The upper 16
484 	 * bits of the return are the number of transfers completed.
485 	 * The lower 16 bits are the time in 0.5us ticks that the
486 	 * transfers took to complete.
487 	 */
488 
489 	len = sc->tx_boundary;
490 
491 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
492 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
493 	cmd.data2 = len * 0x10000;
494 	status = mxge_send_cmd(sc, test_type, &cmd);
495 	if (status != 0) {
496 		test = "read";
497 		goto abort;
498 	}
499 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
500 		(cmd.data0 & 0xffff);
501 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
502 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
503 	cmd.data2 = len * 0x1;
504 	status = mxge_send_cmd(sc, test_type, &cmd);
505 	if (status != 0) {
506 		test = "write";
507 		goto abort;
508 	}
509 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
510 		(cmd.data0 & 0xffff);
511 
512 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
513 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
514 	cmd.data2 = len * 0x10001;
515 	status = mxge_send_cmd(sc, test_type, &cmd);
516 	if (status != 0) {
517 		test = "read/write";
518 		goto abort;
519 	}
520 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
521 		(cmd.data0 & 0xffff);
522 
523 abort:
524 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
525 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
526 			      test, status);
527 
528 	return status;
529 }
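
/*
 * Editorial worked example (numbers are hypothetical) of the result
 * decoding above: suppose cmd.data0 = (100 << 16) | 546, i.e. 100
 * transfers of len = 4096 bytes completed in 546 half-microsecond
 * ticks (273us).  Then
 *
 *   (cmd.data0 >> 16) * len * 2 / (cmd.data0 & 0xffff)
 *     = 100 * 4096 * 2 / 546 ~= 1500
 *
 * which is bytes per microsecond, i.e. roughly 1500 MB/s.
 */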
530 
531 /*
532  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
533  * when the PCI-E Completion packets are aligned on an 8-byte
534  * boundary.  Some PCI-E chip sets always align Completion packets; on
535  * the ones that do not, the alignment can be enforced by enabling
536  * ECRC generation (if supported).
537  *
538  * When PCI-E Completion packets are not aligned, it is actually more
539  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
540  *
541  * If the driver can neither enable ECRC nor verify that it has
542  * already been enabled, then it must use a firmware image which works
543  * around unaligned completion packets (ethp_z8e.dat), and it should
544  * also ensure that it never gives the device a Read-DMA which is
545  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
546  * enabled, then the driver should use the aligned (eth_z8e.dat)
547  * firmware image, and set tx_boundary to 4KB.
548  */
549 
550 static int
551 mxge_firmware_probe(mxge_softc_t *sc)
552 {
553 	device_t dev = sc->dev;
554 	int reg, status;
555 	uint16_t pectl;
556 
557 	sc->tx_boundary = 4096;
558 	/*
559 	 * Verify the max read request size was set to 4KB
560 	 * before trying the test with 4KB.
561 	 */
562 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
563 		pectl = pci_read_config(dev, reg + 0x8, 2);
564 		if ((pectl & (5 << 12)) != (5 << 12)) {
565 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
566 				      pectl);
567 			sc->tx_boundary = 2048;
568 		}
569 	}
570 
571 	/*
572 	 * load the optimized firmware (which assumes aligned PCIe
573 	 * completions) in order to see if it works on this host.
574 	 */
575 	sc->fw_name = mxge_fw_aligned;
576 	status = mxge_load_firmware(sc, 1);
577 	if (status != 0) {
578 		return status;
579 	}
580 
581 	/*
582 	 * Enable ECRC if possible
583 	 */
584 	mxge_enable_nvidia_ecrc(sc);
585 
586 	/*
587 	 * Run a DMA test which watches for unaligned completions and
588 	 * aborts on the first one seen.  Not required on Z8ES or newer.
589 	 */
590 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
591 		return 0;
592 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
593 	if (status == 0)
594 		return 0; /* keep the aligned firmware */
595 
596 	if (status != E2BIG)
597 		device_printf(dev, "DMA test failed: %d\n", status);
598 	if (status == ENOSYS)
599 		device_printf(dev, "Falling back to ethp! "
600 			      "Please install up to date fw\n");
601 	return status;
602 }
603 
604 static int
605 mxge_select_firmware(mxge_softc_t *sc)
606 {
607 	int aligned = 0;
608 	int force_firmware = mxge_force_firmware;
609 
610 	if (sc->throttle)
611 		force_firmware = sc->throttle;
612 
613 	if (force_firmware != 0) {
614 		if (force_firmware == 1)
615 			aligned = 1;
616 		else
617 			aligned = 0;
618 		if (mxge_verbose)
619 			device_printf(sc->dev,
620 				      "Assuming %s completions (forced)\n",
621 				      aligned ? "aligned" : "unaligned");
622 		goto abort;
623 	}
624 
625 	/* if the PCIe link width is 4 or less, we can use the aligned
626 	   firmware and skip any checks */
627 	if (sc->link_width != 0 && sc->link_width <= 4) {
628 		device_printf(sc->dev,
629 			      "PCIe x%d Link, expect reduced performance\n",
630 			      sc->link_width);
631 		aligned = 1;
632 		goto abort;
633 	}
634 
635 	if (0 == mxge_firmware_probe(sc))
636 		return 0;
637 
638 abort:
639 	if (aligned) {
640 		sc->fw_name = mxge_fw_aligned;
641 		sc->tx_boundary = 4096;
642 	} else {
643 		sc->fw_name = mxge_fw_unaligned;
644 		sc->tx_boundary = 2048;
645 	}
646 	return (mxge_load_firmware(sc, 0));
647 }
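
/*
 * Editorial summary of the selection policy implemented above:
 *
 *   force_firmware/throttle set       -> forced: 1 selects the aligned
 *                                        firmware, any other value the
 *                                        unaligned one
 *   PCIe link width <= 4              -> aligned firmware, 4KB tx_boundary
 *   unaligned-completion probe passes -> keep the aligned firmware (4KB
 *                                        tx_boundary, 2KB if MaxReadReq
 *                                        was not 4KB)
 *   otherwise                         -> ethp (unaligned) firmware, 2KB
 *                                        tx_boundary
 */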
648 
649 static int
650 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
651 {
652 
653 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
654 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
655 			      be32toh(hdr->mcp_type));
656 		return EIO;
657 	}
658 
659 	/* save firmware version for sysctl */
660 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
661 	if (mxge_verbose)
662 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
663 
664 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
665 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
666 
667 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
668 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
669 		device_printf(sc->dev, "Found firmware version %s\n",
670 			      sc->fw_version);
671 		device_printf(sc->dev, "Driver needs %d.%d\n",
672 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
673 		return EINVAL;
674 	}
675 	return 0;
676 
677 }
678 
679 static int
680 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
681 {
682 	z_stream zs;
683 	char *inflate_buffer;
684 	const struct firmware *fw;
685 	const mcp_gen_header_t *hdr;
686 	unsigned hdr_offset;
687 	int status;
688 	unsigned int i;
689 	size_t fw_len;
690 
691 	fw = firmware_get(sc->fw_name);
692 	if (fw == NULL) {
693 		device_printf(sc->dev, "Could not find firmware image %s\n",
694 			      sc->fw_name);
695 		return ENOENT;
696 	}
697 
698 	/* setup zlib and decompress f/w */
699 	bzero(&zs, sizeof (zs));
700 	zs.zalloc = zcalloc_nowait;
701 	zs.zfree = zcfree;
702 	status = inflateInit(&zs);
703 	if (status != Z_OK) {
704 		status = EIO;
705 		goto abort_with_fw;
706 	}
707 
708 	/* the uncompressed size is stored as the firmware version,
709 	   which would otherwise go unused */
710 	fw_len = (size_t) fw->version;
711 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
712 	if (inflate_buffer == NULL) {
713 		status = ENOMEM;	/* don't return a stale Z_OK on failure */
714 		goto abort_with_zs;
715 	}
714 	zs.avail_in = fw->datasize;
715 	zs.next_in = __DECONST(char *, fw->data);
716 	zs.avail_out = fw_len;
717 	zs.next_out = inflate_buffer;
718 	status = inflate(&zs, Z_FINISH);
719 	if (status != Z_STREAM_END) {
720 		device_printf(sc->dev, "zlib %d\n", status);
721 		status = EIO;
722 		goto abort_with_buffer;
723 	}
724 
725 	/* check id */
726 	hdr_offset = htobe32(*(const uint32_t *)
727 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
728 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
729 		device_printf(sc->dev, "Bad firmware file\n");
730 		status = EIO;
731 		goto abort_with_buffer;
732 	}
733 	hdr = (const void*)(inflate_buffer + hdr_offset);
734 
735 	status = mxge_validate_firmware(sc, hdr);
736 	if (status != 0)
737 		goto abort_with_buffer;
738 
739 	/* Copy the inflated firmware to NIC SRAM. */
740 	for (i = 0; i < fw_len; i += 256) {
741 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
742 			      inflate_buffer + i,
743 			      min(256U, (unsigned)(fw_len - i)));
744 		wmb();
745 		(void)*sc->sram;
746 		wmb();
747 	}
748 
749 	*limit = fw_len;
750 	status = 0;
751 abort_with_buffer:
752 	free(inflate_buffer, M_TEMP);
753 abort_with_zs:
754 	inflateEnd(&zs);
755 abort_with_fw:
756 	firmware_put(fw, FIRMWARE_UNLOAD);
757 	return status;
758 }
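
/*
 * Editorial sketch (assumption about the companion firmware modules):
 * the images are registered with firmware(9) with the *uncompressed*
 * length passed as the version argument, which is what the helper above
 * recovers via fw->version before inflating.  Symbol names here are
 * illustrative.
 */
#if 0
	fw = firmware_register("mxge_eth_z8e", eth_z8e /* zlib data */,
	    eth_z8e_length, eth_z8e_uncompressed_length, NULL);
#endif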
759 
760 /*
761  * Enable or disable periodic RDMAs from the host to make certain
762  * chipsets resend dropped PCIe messages
763  */
764 
765 static void
766 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
767 {
768 	char buf_bytes[72];
769 	volatile uint32_t *confirm;
770 	volatile char *submit;
771 	uint32_t *buf, dma_low, dma_high;
772 	int i;
773 
774 	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
775 
776 	/* clear confirmation addr */
777 	confirm = (volatile uint32_t *)sc->cmd;
778 	*confirm = 0;
779 	wmb();
780 
781 	/* send an rdma command to the PCIe engine, and wait for the
782 	   response in the confirmation address.  The firmware should
783 	   write a -1 there to indicate it is alive and well
784 	*/
785 
786 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
787 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
788 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
789 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
790 	buf[2] = htobe32(0xffffffff);		/* confirm data */
791 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
792 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
793 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
794 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
795 	buf[5] = htobe32(enable);			/* enable? */
796 
797 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
798 
799 	mxge_pio_copy(submit, buf, 64);
800 	wmb();
801 	DELAY(1000);
802 	wmb();
803 	i = 0;
804 	while (*confirm != 0xffffffff && i < 20) {
805 		DELAY(1000);
806 		i++;
807 	}
808 	if (*confirm != 0xffffffff) {
809 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
810 			      (enable ? "enable" : "disable"), confirm,
811 			      *confirm);
812 	}
813 	return;
814 }
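
/*
 * Editorial note: layout of the 64-byte boot command assembled above
 * (big-endian 32-bit words):
 *
 *   buf[0..1]  confirmation address, MSW then LSW
 *   buf[2]     confirmation data the firmware echoes back (0xffffffff)
 *   buf[3..4]  dummy RDMA target address, MSW then LSW
 *   buf[5]     enable flag (nonzero starts the periodic RDMAs, 0 stops)
 */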
815 
816 static int
817 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
818 {
819 	mcp_cmd_t *buf;
820 	char buf_bytes[sizeof(*buf) + 8];
821 	volatile mcp_cmd_response_t *response = sc->cmd;
822 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
823 	uint32_t dma_low, dma_high;
824 	int err, sleep_total = 0;
825 
826 	/* ensure buf is aligned to 8 bytes */
827 	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
828 
829 	buf->data0 = htobe32(data->data0);
830 	buf->data1 = htobe32(data->data1);
831 	buf->data2 = htobe32(data->data2);
832 	buf->cmd = htobe32(cmd);
833 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
834 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
835 
836 	buf->response_addr.low = htobe32(dma_low);
837 	buf->response_addr.high = htobe32(dma_high);
838 	mtx_lock(&sc->cmd_mtx);
839 	response->result = 0xffffffff;
840 	wmb();
841 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
842 
843 	/* wait up to 20ms */
844 	err = EAGAIN;
845 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
846 		bus_dmamap_sync(sc->cmd_dma.dmat,
847 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
848 		wmb();
849 		switch (be32toh(response->result)) {
850 		case 0:
851 			data->data0 = be32toh(response->data);
852 			err = 0;
853 			break;
854 		case 0xffffffff:
855 			DELAY(1000);
856 			break;
857 		case MXGEFW_CMD_UNKNOWN:
858 			err = ENOSYS;
859 			break;
860 		case MXGEFW_CMD_ERROR_UNALIGNED:
861 			err = E2BIG;
862 			break;
863 		case MXGEFW_CMD_ERROR_BUSY:
864 			err = EBUSY;
865 			break;
866 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
867 			err = ENXIO;
868 			break;
869 		default:
870 			device_printf(sc->dev,
871 				      "mxge: command %d "
872 				      "failed, result = %d\n",
873 				      cmd, be32toh(response->result));
874 			err = ENXIO;
875 			break;
876 		}
877 		if (err != EAGAIN)
878 			break;
879 	}
880 	if (err == EAGAIN)
881 		device_printf(sc->dev, "mxge: command %d timed out, "
882 			      "result = %d\n",
883 			      cmd, be32toh(response->result));
884 	mtx_unlock(&sc->cmd_mtx);
885 	return err;
886 }
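
/*
 * Editorial sketch: a typical caller of the command interface above.
 * Inputs travel in data0..data2 (host byte order; the routine swaps),
 * and a 32-bit result comes back in data0.
 */
#if 0
	mxge_cmd_t cmd;
	int err;

	memset(&cmd, 0, sizeof(cmd));
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
	if (err == 0)
		device_printf(sc->dev, "firmware supports %u slices\n",
		    cmd.data0);
#endif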
887 
888 static int
889 mxge_adopt_running_firmware(mxge_softc_t *sc)
890 {
891 	struct mcp_gen_header *hdr;
892 	const size_t bytes = sizeof (struct mcp_gen_header);
893 	size_t hdr_offset;
894 	int status;
895 
896 	/* find running firmware header */
897 	hdr_offset = htobe32(*(volatile uint32_t *)
898 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
899 
900 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
901 		device_printf(sc->dev,
902 			      "Running firmware has bad header offset (%d)\n",
903 			      (int)hdr_offset);
904 		return EIO;
905 	}
906 
907 	/* copy header of running firmware from SRAM to host memory to
908 	 * validate firmware */
909 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
910 	if (hdr == NULL) {
911 		device_printf(sc->dev, "could not malloc firmware hdr\n");
912 		return ENOMEM;
913 	}
914 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
915 				rman_get_bushandle(sc->mem_res),
916 				hdr_offset, (char *)hdr, bytes);
917 	status = mxge_validate_firmware(sc, hdr);
918 	free(hdr, M_DEVBUF);
919 
920 	/*
921 	 * check to see if adopted firmware has bug where adopting
922 	 * it will cause broadcasts to be filtered unless the NIC
923 	 * is kept in ALLMULTI mode
924 	 */
925 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
926 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
927 		sc->adopted_rx_filter_bug = 1;
928 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
929 			      "working around rx filter bug\n",
930 			      sc->fw_ver_major, sc->fw_ver_minor,
931 			      sc->fw_ver_tiny);
932 	}
933 
934 	return status;
935 }
936 
937 static int
938 mxge_load_firmware(mxge_softc_t *sc, int adopt)
939 {
940 	volatile uint32_t *confirm;
941 	volatile char *submit;
942 	char buf_bytes[72];
943 	uint32_t *buf, size, dma_low, dma_high;
944 	int status, i;
945 
946 	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
947 
948 	size = sc->sram_size;
949 	status = mxge_load_firmware_helper(sc, &size);
950 	if (status) {
951 		if (!adopt)
952 			return status;
953 		/* Try to use the currently running firmware, if
954 		   it is new enough */
955 		status = mxge_adopt_running_firmware(sc);
956 		if (status) {
957 			device_printf(sc->dev,
958 				      "failed to adopt running firmware\n");
959 			return status;
960 		}
961 		device_printf(sc->dev,
962 			      "Successfully adopted running firmware\n");
963 		if (sc->tx_boundary == 4096) {
964 			device_printf(sc->dev,
965 				"Using firmware currently running on NIC"
966 				 ".  For optimal\n");
967 			device_printf(sc->dev,
968 				 "performance consider loading optimized "
969 				 "firmware\n");
970 		}
971 		sc->fw_name = mxge_fw_unaligned;
972 		sc->tx_boundary = 2048;
973 		return 0;
974 	}
975 	/* clear confirmation addr */
976 	confirm = (volatile uint32_t *)sc->cmd;
977 	*confirm = 0;
978 	wmb();
979 	/* send a reload command to the bootstrap MCP, and wait for the
980 	   response in the confirmation address.  The firmware should
981 	   write a -1 there to indicate it is alive and well
982 	*/
983 
984 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
985 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
986 
987 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
988 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
989 	buf[2] = htobe32(0xffffffff);	/* confirm data */
990 
991 	/* FIX: All newer firmware should un-protect the bottom of
992 	   the sram before handoff.  However, the very first interfaces
993 	   do not, so the handoff copy must skip the first 8 bytes.
994 	*/
995 					/* where the code starts*/
996 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
997 	buf[4] = htobe32(size - 8); 	/* length of code */
998 	buf[5] = htobe32(8);		/* where to copy to */
999 	buf[6] = htobe32(0);		/* where to jump to */
1000 
1001 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1002 	mxge_pio_copy(submit, buf, 64);
1003 	wmb();
1004 	DELAY(1000);
1005 	wmb();
1006 	i = 0;
1007 	while (*confirm != 0xffffffff && i < 20) {
1008 		DELAY(1000*10);
1009 		i++;
1010 		bus_dmamap_sync(sc->cmd_dma.dmat,
1011 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1012 	}
1013 	if (*confirm != 0xffffffff) {
1014 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
1015 			confirm, *confirm);
1016 
1017 		return ENXIO;
1018 	}
1019 	return 0;
1020 }
1021 
1022 static int
1023 mxge_update_mac_address(mxge_softc_t *sc)
1024 {
1025 	mxge_cmd_t cmd;
1026 	uint8_t *addr = sc->mac_addr;
1027 	int status;
1028 
1029 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1030 		     | (addr[2] << 8) | addr[3]);
1031 
1032 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1033 
1034 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1035 	return status;
1036 }
1037 
1038 static int
1039 mxge_change_pause(mxge_softc_t *sc, int pause)
1040 {
1041 	mxge_cmd_t cmd;
1042 	int status;
1043 
1044 	if (pause)
1045 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1046 				       &cmd);
1047 	else
1048 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1049 				       &cmd);
1050 
1051 	if (status) {
1052 		device_printf(sc->dev, "Failed to set flow control mode\n");
1053 		return ENXIO;
1054 	}
1055 	sc->pause = pause;
1056 	return 0;
1057 }
1058 
1059 static void
1060 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1061 {
1062 	mxge_cmd_t cmd;
1063 	int status;
1064 
1065 	if (mxge_always_promisc)
1066 		promisc = 1;
1067 
1068 	if (promisc)
1069 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1070 				       &cmd);
1071 	else
1072 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1073 				       &cmd);
1074 
1075 	if (status) {
1076 		device_printf(sc->dev, "Failed to set promisc mode\n");
1077 	}
1078 }
1079 
1080 struct mxge_add_maddr_ctx {
1081 	mxge_softc_t *sc;
1082 	int error;
1083 };
1084 
1085 static u_int
1086 mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
1087 {
1088 	struct mxge_add_maddr_ctx *ctx = arg;
1089 	mxge_cmd_t cmd;
1090 
1091 	if (ctx->error != 0)
1092 		return (0);
1093 	bcopy(LLADDR(sdl), &cmd.data0, 4);
1094 	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
1095 	cmd.data0 = htonl(cmd.data0);
1096 	cmd.data1 = htonl(cmd.data1);
1097 
1098 	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1099 
1100 	return (1);
1101 }
1102 
1103 static void
1104 mxge_set_multicast_list(mxge_softc_t *sc)
1105 {
1106 	struct mxge_add_maddr_ctx ctx;
1107 	struct ifnet *ifp = sc->ifp;
1108 	mxge_cmd_t cmd;
1109 	int err;
1110 
1111 	/* This firmware is known to not support multicast */
1112 	if (!sc->fw_multicast_support)
1113 		return;
1114 
1115 	/* Disable multicast filtering while we play with the lists*/
1116 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1117 	if (err != 0) {
1118 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1119 		       " error status: %d\n", err);
1120 		return;
1121 	}
1122 
1123 	if (sc->adopted_rx_filter_bug)
1124 		return;
1125 
1126 	if (ifp->if_flags & IFF_ALLMULTI)
1127 		/* request to disable multicast filtering, so quit here */
1128 		return;
1129 
1130 	/* Flush all the filters */
1131 
1132 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1133 	if (err != 0) {
1134 		device_printf(sc->dev,
1135 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1136 			      ", error status: %d\n", err);
1137 		return;
1138 	}
1139 
1140 	/* Walk the multicast list, and add each address */
1141 	ctx.sc = sc;
1142 	ctx.error = 0;
1143 	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
1144 	if (ctx.error != 0) {
1145 		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1146 		    "error status:" "%d\t", ctx.error);
1147 		/* abort, leaving multicast filtering off */
1148 		return;
1149 	}
1150 
1151 	/* Enable multicast filtering */
1152 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1153 	if (err != 0) {
1154 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1155 		       ", error status: %d\n", err);
1156 	}
1157 }
1158 
1159 static int
1160 mxge_max_mtu(mxge_softc_t *sc)
1161 {
1162 	mxge_cmd_t cmd;
1163 	int status;
1164 
1165 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1166 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1167 
1168 	/* try to set nbufs to see if we can
1169 	   use virtually contiguous jumbos */
1170 	cmd.data0 = 0;
1171 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1172 			       &cmd);
1173 	if (status == 0)
1174 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1175 
1176 	/* otherwise, we're limited to MJUMPAGESIZE */
1177 	return MJUMPAGESIZE - MXGEFW_PAD;
1178 }
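
/*
 * Editorial worked example: on a 4KB-page system MJUMPAGESIZE is 4096
 * and MXGEFW_PAD is 2, so without firmware support for virtually
 * contiguous (chained) receive buffers the usable MTU is capped at
 * 4096 - 2 = 4094 bytes; when MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS is
 * accepted, MXGEFW_MAX_MTU minus the pad is available instead, enough
 * for the 9000-byte default set by mxge_initial_mtu.
 */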
1179 
1180 static int
1181 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1182 {
1183 	struct mxge_slice_state *ss;
1184 	mxge_rx_done_t *rx_done;
1185 	volatile uint32_t *irq_claim;
1186 	mxge_cmd_t cmd;
1187 	int slice, status;
1188 
1189 	/* try to send a reset command to the card to see if it
1190 	   is alive */
1191 	memset(&cmd, 0, sizeof (cmd));
1192 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1193 	if (status != 0) {
1194 		device_printf(sc->dev, "failed reset\n");
1195 		return ENXIO;
1196 	}
1197 
1198 	mxge_dummy_rdma(sc, 1);
1199 
1200 	/* set the intrq size */
1201 	cmd.data0 = sc->rx_ring_size;
1202 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1203 
1204 	/*
1205 	 * Even though we already know how many slices are supported
1206 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1207 	 * has magic side effects, and must be called after a reset.
1208 	 * It must be called prior to calling any RSS related cmds,
1209 	 * including assigning an interrupt queue for anything but
1210 	 * slice 0.  It must also be called *after*
1211 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1212 	 * the firmware to compute offsets.
1213 	 */
1214 
1215 	if (sc->num_slices > 1) {
1216 		/* ask the maximum number of slices it supports */
1217 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1218 					   &cmd);
1219 		if (status != 0) {
1220 			device_printf(sc->dev,
1221 				      "failed to get number of slices\n");
1222 			return status;
1223 		}
1224 		/*
1225 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1226 		 * to setting up the interrupt queue DMA
1227 		 */
1228 		cmd.data0 = sc->num_slices;
1229 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1230 #ifdef IFNET_BUF_RING
1231 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1232 #endif
1233 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1234 					   &cmd);
1235 		if (status != 0) {
1236 			device_printf(sc->dev,
1237 				      "failed to set number of slices\n");
1238 			return status;
1239 		}
1240 	}
1241 
1242 	if (interrupts_setup) {
1243 		/* Now exchange information about interrupts  */
1244 		for (slice = 0; slice < sc->num_slices; slice++) {
1245 			rx_done = &sc->ss[slice].rx_done;
1246 			memset(rx_done->entry, 0, sc->rx_ring_size);
1247 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1248 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1249 			cmd.data2 = slice;
1250 			status |= mxge_send_cmd(sc,
1251 						MXGEFW_CMD_SET_INTRQ_DMA,
1252 						&cmd);
1253 		}
1254 	}
1255 
1256 	status |= mxge_send_cmd(sc,
1257 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1258 
1259 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1260 
1261 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1262 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1263 
1264 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1265 				&cmd);
1266 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1267 	if (status != 0) {
1268 		device_printf(sc->dev, "failed to set interrupt parameters\n");
1269 		return status;
1270 	}
1271 
1272 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1273 
1274 	/* run a DMA benchmark */
1275 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1276 
1277 	for (slice = 0; slice < sc->num_slices; slice++) {
1278 		ss = &sc->ss[slice];
1279 
1280 		ss->irq_claim = irq_claim + (2 * slice);
1281 		/* reset mcp/driver shared state back to 0 */
1282 		ss->rx_done.idx = 0;
1283 		ss->rx_done.cnt = 0;
1284 		ss->tx.req = 0;
1285 		ss->tx.done = 0;
1286 		ss->tx.pkt_done = 0;
1287 		ss->tx.queue_active = 0;
1288 		ss->tx.activate = 0;
1289 		ss->tx.deactivate = 0;
1290 		ss->tx.wake = 0;
1291 		ss->tx.defrag = 0;
1292 		ss->tx.stall = 0;
1293 		ss->rx_big.cnt = 0;
1294 		ss->rx_small.cnt = 0;
1295 		ss->lc.lro_bad_csum = 0;
1296 		ss->lc.lro_queued = 0;
1297 		ss->lc.lro_flushed = 0;
1298 		if (ss->fw_stats != NULL) {
1299 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1300 		}
1301 	}
1302 	sc->rdma_tags_available = 15;
1303 	status = mxge_update_mac_address(sc);
1304 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1305 	mxge_change_pause(sc, sc->pause);
1306 	mxge_set_multicast_list(sc);
1307 	if (sc->throttle) {
1308 		cmd.data0 = sc->throttle;
1309 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1310 				  &cmd)) {
1311 			device_printf(sc->dev,
1312 				      "can't enable throttle\n");
1313 		}
1314 	}
1315 	return status;
1316 }
1317 
1318 static int
1319 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1320 {
1321 	mxge_cmd_t cmd;
1322 	mxge_softc_t *sc;
1323 	int err;
1324 	unsigned int throttle;
1325 
1326 	sc = arg1;
1327 	throttle = sc->throttle;
1328 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1329 	if (err != 0) {
1330 		return err;
1331 	}
1332 
1333 	if (throttle == sc->throttle)
1334 		return 0;
1335 
1336 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1337 		return EINVAL;
1338 
1339 	mtx_lock(&sc->driver_mtx);
1340 	cmd.data0 = throttle;
1341 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1342 	if (err == 0)
1343 		sc->throttle = throttle;
1344 	mtx_unlock(&sc->driver_mtx);
1345 	return err;
1346 }
1347 
1348 static int
1349 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1350 {
1351 	mxge_softc_t *sc;
1352 	unsigned int intr_coal_delay;
1353 	int err;
1354 
1355 	sc = arg1;
1356 	intr_coal_delay = sc->intr_coal_delay;
1357 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1358 	if (err != 0) {
1359 		return err;
1360 	}
1361 	if (intr_coal_delay == sc->intr_coal_delay)
1362 		return 0;
1363 
1364 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1365 		return EINVAL;
1366 
1367 	mtx_lock(&sc->driver_mtx);
1368 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1369 	sc->intr_coal_delay = intr_coal_delay;
1370 
1371 	mtx_unlock(&sc->driver_mtx);
1372 	return err;
1373 }
1374 
1375 static int
1376 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1377 {
1378 	mxge_softc_t *sc;
1379 	unsigned int enabled;
1380 	int err;
1381 
1382 	sc = arg1;
1383 	enabled = sc->pause;
1384 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1385 	if (err != 0) {
1386 		return err;
1387 	}
1388 	if (enabled == sc->pause)
1389 		return 0;
1390 
1391 	mtx_lock(&sc->driver_mtx);
1392 	err = mxge_change_pause(sc, enabled);
1393 	mtx_unlock(&sc->driver_mtx);
1394 	return err;
1395 }
1396 
1397 static int
1398 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1399 {
1400 	int err;
1401 
1402 	if (arg1 == NULL)
1403 		return EFAULT;
1404 	arg2 = be32toh(*(int *)arg1);
1405 	arg1 = NULL;
1406 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1407 
1408 	return err;
1409 }
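
/*
 * Editorial worked example for the handler above: the firmware keeps
 * its counters big-endian.  A counter value of 256 is stored as bytes
 * 00 00 01 00; read raw on a little-endian host that is 0x00010000
 * (65536), and be32toh() restores the intended 256 before it is handed
 * to sysctl_handle_int().
 */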
1410 
1411 static void
1412 mxge_rem_sysctls(mxge_softc_t *sc)
1413 {
1414 	struct mxge_slice_state *ss;
1415 	int slice;
1416 
1417 	if (sc->slice_sysctl_tree == NULL)
1418 		return;
1419 
1420 	for (slice = 0; slice < sc->num_slices; slice++) {
1421 		ss = &sc->ss[slice];
1422 		if (ss == NULL || ss->sysctl_tree == NULL)
1423 			continue;
1424 		sysctl_ctx_free(&ss->sysctl_ctx);
1425 		ss->sysctl_tree = NULL;
1426 	}
1427 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1428 	sc->slice_sysctl_tree = NULL;
1429 }
1430 
1431 static void
1432 mxge_add_sysctls(mxge_softc_t *sc)
1433 {
1434 	struct sysctl_ctx_list *ctx;
1435 	struct sysctl_oid_list *children;
1436 	mcp_irq_data_t *fw;
1437 	struct mxge_slice_state *ss;
1438 	int slice;
1439 	char slice_num[8];
1440 
1441 	ctx = device_get_sysctl_ctx(sc->dev);
1442 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1443 	fw = sc->ss[0].fw_stats;
1444 
1445 	/* random information */
1446 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1447 		       "firmware_version",
1448 		       CTLFLAG_RD, sc->fw_version,
1449 		       0, "firmware version");
1450 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1451 		       "serial_number",
1452 		       CTLFLAG_RD, sc->serial_number_string,
1453 		       0, "serial number");
1454 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1455 		       "product_code",
1456 		       CTLFLAG_RD, sc->product_code_string,
1457 		       0, "product_code");
1458 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1459 		       "pcie_link_width",
1460 		       CTLFLAG_RD, &sc->link_width,
1461 		       0, "tx_boundary");
1462 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1463 		       "tx_boundary",
1464 		       CTLFLAG_RD, &sc->tx_boundary,
1465 		       0, "tx_boundary");
1466 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1467 		       "write_combine",
1468 		       CTLFLAG_RD, &sc->wc,
1469 		       0, "write combining PIO?");
1470 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1471 		       "read_dma_MBs",
1472 		       CTLFLAG_RD, &sc->read_dma,
1473 		       0, "DMA Read speed in MB/s");
1474 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1475 		       "write_dma_MBs",
1476 		       CTLFLAG_RD, &sc->write_dma,
1477 		       0, "DMA Write speed in MB/s");
1478 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1479 		       "read_write_dma_MBs",
1480 		       CTLFLAG_RD, &sc->read_write_dma,
1481 		       0, "DMA concurrent Read/Write speed in MB/s");
1482 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483 		       "watchdog_resets",
1484 		       CTLFLAG_RD, &sc->watchdog_resets,
1485 		       0, "Number of times NIC was reset");
1486 
1487 	/* performance related tunables */
1488 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1489 	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1490 	    sc, 0, mxge_change_intr_coal, "I",
1491 	    "interrupt coalescing delay in usecs");
1492 
1493 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1494 	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1495 	    mxge_change_throttle, "I", "transmit throttling");
1496 
1497 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1498 	    "flow_control_enabled",
1499 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1500 	    mxge_change_flow_control, "I",
1501 	    "interrupt coalescing delay in usecs");
1502 
1503 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1504 		       "deassert_wait",
1505 		       CTLFLAG_RW, &mxge_deassert_wait,
1506 		       0, "Wait for IRQ line to go low in ihandler");
1507 
1508 	/* stats block from firmware is in network byte order.
1509 	   Need to swap it */
1510 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1511 	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1512 	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
1513 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1514 	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1515 	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
1516 	    "rdma_tags_available");
1517 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1518 	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1519 	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
1520 	    "dropped_bad_crc32");
1521 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1522 	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1523 	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
1524 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525 	    "dropped_link_error_or_filtered",
1526 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1527 	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
1528 	    "dropped_link_error_or_filtered");
1529 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1530 	    "dropped_link_overflow",
1531 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1532 	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
1533 	    "dropped_link_overflow");
1534 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1535 	    "dropped_multicast_filtered",
1536 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1537 	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
1538 	    "dropped_multicast_filtered");
1539 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1540 	    "dropped_no_big_buffer",
1541 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1542 	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
1543 	    "dropped_no_big_buffer");
1544 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1545 	    "dropped_no_small_buffer",
1546 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1547 	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
1548 	    "dropped_no_small_buffer");
1549 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1550 	    "dropped_overrun",
1551 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1552 	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
1553 	    "dropped_overrun");
1554 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555 	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1556 	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
1557 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1558 	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1559 	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");
1560 
1561 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1562 	    "dropped_unicast_filtered",
1563 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1564 	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
1565 	    "dropped_unicast_filtered");
1566 
1567 	/* verbose printing? */
1568 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1569 		       "verbose",
1570 		       CTLFLAG_RW, &mxge_verbose,
1571 		       0, "verbose printing");
1572 
1573 	/* add counters exported for debugging from all slices */
1574 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1575 	sc->slice_sysctl_tree =
1576 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1577 		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1578 
1579 	for (slice = 0; slice < sc->num_slices; slice++) {
1580 		ss = &sc->ss[slice];
1581 		sysctl_ctx_init(&ss->sysctl_ctx);
1582 		ctx = &ss->sysctl_ctx;
1583 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1584 		sprintf(slice_num, "%d", slice);
1585 		ss->sysctl_tree =
1586 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1587 			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1588 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1589 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1590 			       "rx_small_cnt",
1591 			       CTLFLAG_RD, &ss->rx_small.cnt,
1592 			       0, "rx_small_cnt");
1593 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1594 			       "rx_big_cnt",
1595 			       CTLFLAG_RD, &ss->rx_big.cnt,
1596 			       0, "rx_small_cnt");
1597 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1598 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1599 			       0, "number of lro merge queues flushed");
1600 
1601 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1602 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1603 			       0, "number of bad csums preventing LRO");
1604 
1605 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1606 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1607 			       0, "number of frames appended to lro merge"
1608 			       "queues");
1609 
1610 #ifndef IFNET_BUF_RING
1611 		/* only transmit from slice 0 for now */
1612 		if (slice > 0)
1613 			continue;
1614 #endif
1615 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1616 			       "tx_req",
1617 			       CTLFLAG_RD, &ss->tx.req,
1618 			       0, "tx_req");
1619 
1620 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1621 			       "tx_done",
1622 			       CTLFLAG_RD, &ss->tx.done,
1623 			       0, "tx_done");
1624 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1625 			       "tx_pkt_done",
1626 			       CTLFLAG_RD, &ss->tx.pkt_done,
1627 			       0, "tx_done");
1628 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1629 			       "tx_stall",
1630 			       CTLFLAG_RD, &ss->tx.stall,
1631 			       0, "tx_stall");
1632 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 			       "tx_wake",
1634 			       CTLFLAG_RD, &ss->tx.wake,
1635 			       0, "tx_wake");
1636 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 			       "tx_defrag",
1638 			       CTLFLAG_RD, &ss->tx.defrag,
1639 			       0, "tx_defrag");
1640 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1641 			       "tx_queue_active",
1642 			       CTLFLAG_RD, &ss->tx.queue_active,
1643 			       0, "tx_queue_active");
1644 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1645 			       "tx_activate",
1646 			       CTLFLAG_RD, &ss->tx.activate,
1647 			       0, "tx_activate");
1648 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1649 			       "tx_deactivate",
1650 			       CTLFLAG_RD, &ss->tx.deactivate,
1651 			       0, "tx_deactivate");
1652 	}
1653 }
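
/*
 * Editorial note: the knobs registered above appear under the device's
 * sysctl node, e.g. (paths assume unit 0):
 *
 *   sysctl dev.mxge.0.intr_coal_delay=30
 *   sysctl dev.mxge.0.flow_control_enabled=1
 *   sysctl dev.mxge.0.slice.0.rx_small_cnt
 */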
1654 
1655 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1656    backwards one at a time and handle ring wraps */
1657 
1658 static inline void
1659 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1660 			    mcp_kreq_ether_send_t *src, int cnt)
1661 {
1662 	int idx, starting_slot;
1663 	starting_slot = tx->req;
1664 	while (cnt > 1) {
1665 		cnt--;
1666 		idx = (starting_slot + cnt) & tx->mask;
1667 		mxge_pio_copy(&tx->lanai[idx],
1668 			      &src[cnt], sizeof(*src));
1669 		wmb();
1670 	}
1671 }
1672 
1673 /*
1674  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1675  * at most 32 bytes at a time, so as to avoid involving the software
1676  * pio handler in the nic.   We re-write the first segment's flags
1677  * to mark them valid only after writing the entire chain
1678  */
1679 
1680 static inline void
1681 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1682 		  int cnt)
1683 {
1684 	int idx, i;
1685 	uint32_t *src_ints;
1686 	volatile uint32_t *dst_ints;
1687 	mcp_kreq_ether_send_t *srcp;
1688 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1689 	uint8_t last_flags;
1690 
1691 	idx = tx->req & tx->mask;
1692 
1693 	last_flags = src->flags;
1694 	src->flags = 0;
1695 	wmb();
1696 	dst = dstp = &tx->lanai[idx];
1697 	srcp = src;
1698 
1699 	if ((idx + cnt) < tx->mask) {
1700 		for (i = 0; i < (cnt - 1); i += 2) {
1701 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1702 			wmb(); /* force write every 32 bytes */
1703 			srcp += 2;
1704 			dstp += 2;
1705 		}
1706 	} else {
1707 		/* submit all but the first request, and ensure
1708 		   that it is submitted below */
1709 		mxge_submit_req_backwards(tx, src, cnt);
1710 		i = 0;
1711 	}
1712 	if (i < cnt) {
1713 		/* submit the first request */
1714 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1715 		wmb(); /* barrier before setting valid flag */
1716 	}
1717 
1718 	/* re-write the last 32-bits with the valid flags */
1719 	src->flags = last_flags;
1720 	src_ints = (uint32_t *)src;
1721 	src_ints+=3;
1722 	dst_ints = (volatile uint32_t *)dst;
1723 	dst_ints+=3;
1724 	*dst_ints =  *src_ints;
1725 	tx->req += cnt;
1726 	wmb();
1727 }
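
/*
 * Editorial worked example of the wrap case above: with a 16-slot ring
 * (mask = 15), req = 14 and cnt = 4 target slots 14, 15, 0 and 1.
 * mxge_submit_req_backwards() writes slots 1, 0 and 15 in that order,
 * mxge_submit_req() then writes slot 14 (whose valid flags were cleared
 * up front), and only the final 32-bit store re-arms the flags, so the
 * NIC never sees a partially written chain.
 */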
1728 
1729 static int
1730 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1731     struct mxge_pkt_info *pi)
1732 {
1733 	struct ether_vlan_header *eh;
1734 	uint16_t etype;
1735 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1736 #if IFCAP_TSO6 && defined(INET6)
1737 	int nxt;
1738 #endif
1739 
1740 	eh = mtod(m, struct ether_vlan_header *);
1741 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1742 		etype = ntohs(eh->evl_proto);
1743 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1744 	} else {
1745 		etype = ntohs(eh->evl_encap_proto);
1746 		pi->ip_off = ETHER_HDR_LEN;
1747 	}
1748 
1749 	switch (etype) {
1750 	case ETHERTYPE_IP:
1751 		/*
1752 		 * ensure ip header is in first mbuf, copy it to a
1753 		 * scratch buffer if not
1754 		 */
1755 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1756 		pi->ip6 = NULL;
1757 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1758 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1759 			    ss->scratch);
1760 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1761 		}
1762 		pi->ip_hlen = pi->ip->ip_hl << 2;
1763 		if (!tso)
1764 			return 0;
1765 
1766 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1767 		    sizeof(struct tcphdr))) {
1768 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1769 			    sizeof(struct tcphdr), ss->scratch);
1770 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1771 		}
1772 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1773 		break;
1774 #if IFCAP_TSO6 && defined(INET6)
1775 	case ETHERTYPE_IPV6:
1776 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1777 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1778 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1779 			    ss->scratch);
1780 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1781 		}
1782 		nxt = 0;
1783 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1784 		pi->ip_hlen -= pi->ip_off;
1785 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1786 			return EINVAL;
1787 
1788 		if (!tso)
1789 			return 0;
1790 
1791 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1792 			return EINVAL;
1793 
1794 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1795 		    sizeof(struct tcphdr))) {
1796 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1797 			    sizeof(struct tcphdr), ss->scratch);
1798 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1799 		}
1800 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1801 		break;
1802 #endif
1803 	default:
1804 		return EINVAL;
1805 	}
1806 	return 0;
1807 }
1808 
1809 #if IFCAP_TSO4
1810 
1811 static void
1812 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1813 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1814 {
1815 	mxge_tx_ring_t *tx;
1816 	mcp_kreq_ether_send_t *req;
1817 	bus_dma_segment_t *seg;
1818 	uint32_t low, high_swapped;
1819 	int len, seglen, cum_len, cum_len_next;
1820 	int next_is_first, chop, cnt, rdma_count, small;
1821 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1822 	uint8_t flags, flags_next;
1823 	static int once;
1824 
1825 	mss = m->m_pkthdr.tso_segsz;
1826 
1827 	/* negative cum_len signifies to the
1828 	 * send loop that we are still in the
1829 	 * header portion of the TSO packet.
1830 	 */
1831 
1832 	cksum_offset = pi->ip_off + pi->ip_hlen;
1833 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
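	/*
	 * Example: a plain IPv4/TCP frame with no options has
	 * cksum_offset = 14 + 20 = 34 and a 20-byte TCP header, so
	 * cum_len starts at -54 and reaches zero exactly where the
	 * TSO payload begins.
	 */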
1834 
1835 	/* TSO implies checksum offload on this hardware */
1836 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1837 		/*
1838 		 * If packet has full TCP csum, replace it with pseudo hdr
1839 		 * sum that the NIC expects, otherwise the NIC will emit
1840 		 * packets with bad TCP checksums.
1841 		 */
1842 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1843 		if (pi->ip6) {
1844 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1845 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1846 			sum = in6_cksum_pseudo(pi->ip6,
1847 			    m->m_pkthdr.len - cksum_offset,
1848 			    IPPROTO_TCP, 0);
1849 #endif
1850 		} else {
1851 #ifdef INET
1852 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1853 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1854 			    pi->ip->ip_dst.s_addr,
1855 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1856 				    cksum_offset)));
1857 #endif
1858 		}
1859 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1860 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1861 	}
1862 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1863 
1864 	/* for TSO, pseudo_hdr_offset holds mss.
1865 	 * The firmware figures out where to put
1866 	 * the checksum by parsing the header. */
1867 	pseudo_hdr_offset = htobe16(mss);
1868 
1869 	if (pi->ip6) {
1870 		/*
1871 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1872 		 * to store the TCP header len
1873 		 */
1874 		cksum_offset = (pi->tcp->th_off << 2);
1875 	}
1876 
1877 	tx = &ss->tx;
1878 	req = tx->req_list;
1879 	seg = tx->seg_list;
1880 	cnt = 0;
1881 	rdma_count = 0;
1882 	/* "rdma_count" is the number of RDMAs belonging to the
1883 	 * current packet BEFORE the current send request. For
1884 	 * non-TSO packets, this is equal to "count".
1885 	 * For TSO packets, rdma_count needs to be reset
1886 	 * to 0 after a segment cut.
1887 	 *
1888 	 * The rdma_count field of the send request is
1889 	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
1891 	 * in the middle, this is the number of RDMAs starting
1892 	 * after the last cut in the request. All previous
1893 	 * segments before the last cut implicitly have 1 RDMA.
1894 	 *
1895 	 * Since the number of RDMAs is not known beforehand,
1896 	 * it must be filled-in retroactively - after each
1897 	 * segmentation cut or at the end of the entire packet.
1898 	 */
1899 
1900 	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces */
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1904 		len = seg->ds_len;
1905 
1906 		while (len) {
1907 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1908 			seglen = len;
1909 			cum_len_next = cum_len + seglen;
1910 			(req-rdma_count)->rdma_count = rdma_count + 1;
1911 			if (__predict_true(cum_len >= 0)) {
1912 				/* payload */
1913 				chop = (cum_len_next > mss);
1914 				cum_len_next = cum_len_next % mss;
1915 				next_is_first = (cum_len_next == 0);
1916 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1917 				flags_next |= next_is_first *
1918 					MXGEFW_FLAGS_FIRST;
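				/*
				 * Branchless bookkeeping: chop and
				 * next_is_first are 0 or 1, so
				 * -(chop | next_is_first) is 0 or ~0 and
				 * forces rdma_count to -1 at a segment
				 * boundary; the += and the rdma_count++
				 * below then restore the proper starting
				 * count for the next segment.
				 */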
1919 				rdma_count |= -(chop | next_is_first);
1920 				rdma_count += chop & !next_is_first;
1921 			} else if (cum_len_next >= 0) {
1922 				/* header ends */
1923 				rdma_count = -1;
1924 				cum_len_next = 0;
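				/*
				 * Truncate this request at the header/
				 * payload boundary so that the payload
				 * starts in a fresh descriptor flagged
				 * MXGEFW_FLAGS_FIRST.
				 */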
1925 				seglen = -cum_len;
1926 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1927 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1928 					MXGEFW_FLAGS_FIRST |
1929 					(small * MXGEFW_FLAGS_SMALL);
1930 			    }
1931 
1932 			req->addr_high = high_swapped;
1933 			req->addr_low = htobe32(low);
1934 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1935 			req->pad = 0;
1936 			req->rdma_count = 1;
1937 			req->length = htobe16(seglen);
1938 			req->cksum_offset = cksum_offset;
1939 			req->flags = flags | ((cum_len & 1) *
1940 					      MXGEFW_FLAGS_ALIGN_ODD);
1941 			low += seglen;
1942 			len -= seglen;
1943 			cum_len = cum_len_next;
1944 			flags = flags_next;
1945 			req++;
1946 			cnt++;
1947 			rdma_count++;
1948 			if (cksum_offset != 0 && !pi->ip6) {
1949 				if (__predict_false(cksum_offset > seglen))
1950 					cksum_offset -= seglen;
1951 				else
1952 					cksum_offset = 0;
1953 			}
1954 			if (__predict_false(cnt > tx->max_desc))
1955 				goto drop;
1956 		}
1957 		busdma_seg_cnt--;
1958 		seg++;
1959 	}
1960 	(req-rdma_count)->rdma_count = rdma_count;
1961 
1962 	do {
1963 		req--;
1964 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1965 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1966 
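	/* flag the final descriptor so that mxge_tx_done() can credit
	   a completed packet when it reaches it */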
1967 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1968 	mxge_submit_req(tx, tx->req_list, cnt);
1969 #ifdef IFNET_BUF_RING
1970 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1971 		/* tell the NIC to start polling this slice */
1972 		*tx->send_go = 1;
1973 		tx->queue_active = 1;
1974 		tx->activate++;
1975 		wmb();
1976 	}
1977 #endif
1978 	return;
1979 
1980 drop:
1981 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1982 	m_freem(m);
1983 	ss->oerrors++;
1984 	if (!once) {
1985 		printf("tx->max_desc exceeded via TSO!\n");
1986 		printf("mss = %d, %ld, %d!\n", mss,
1987 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1988 		once = 1;
1989 	}
1990 	return;
1991 
1992 }
1993 
1994 #endif /* IFCAP_TSO4 */
1995 
1996 #ifdef MXGE_NEW_VLAN_API
1997 /*
1998  * We reproduce the software vlan tag insertion from
1999  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2000  * vlan tag insertion. We need to advertise this in order to have the
2001  * vlan interface respect our csum offload flags.
2002  */
2003 static struct mbuf *
2004 mxge_vlan_tag_insert(struct mbuf *m)
2005 {
2006 	struct ether_vlan_header *evl;
2007 
2008 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2009 	if (__predict_false(m == NULL))
2010 		return NULL;
2011 	if (m->m_len < sizeof(*evl)) {
2012 		m = m_pullup(m, sizeof(*evl));
2013 		if (__predict_false(m == NULL))
2014 			return NULL;
2015 	}
2016 	/*
2017 	 * Transform the Ethernet header into an Ethernet header
2018 	 * with 802.1Q encapsulation.
2019 	 */
2020 	evl = mtod(m, struct ether_vlan_header *);
2021 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2022 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2023 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2024 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2025 	m->m_flags &= ~M_VLANTAG;
2026 	return m;
2027 }
2028 #endif /* MXGE_NEW_VLAN_API */
2029 
2030 static void
2031 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2032 {
2033 	struct mxge_pkt_info pi = {0,0,0,0};
2034 	mxge_softc_t *sc;
2035 	mcp_kreq_ether_send_t *req;
2036 	bus_dma_segment_t *seg;
2037 	struct mbuf *m_tmp;
2038 	mxge_tx_ring_t *tx;
2039 	int cnt, cum_len, err, i, idx, odd_flag;
2040 	uint16_t pseudo_hdr_offset;
2041 	uint8_t flags, cksum_offset;
2042 
2043 	sc = ss->sc;
2044 	tx = &ss->tx;
2045 
2046 #ifdef MXGE_NEW_VLAN_API
2047 	if (m->m_flags & M_VLANTAG) {
2048 		m = mxge_vlan_tag_insert(m);
2049 		if (__predict_false(m == NULL))
2050 			goto drop_without_m;
2051 	}
2052 #endif
2053 	if (m->m_pkthdr.csum_flags &
2054 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2055 		if (mxge_parse_tx(ss, m, &pi))
2056 			goto drop;
2057 	}
2058 
2059 	/* (try to) map the frame for DMA */
2060 	idx = tx->req & tx->mask;
2061 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2062 				      m, tx->seg_list, &cnt,
2063 				      BUS_DMA_NOWAIT);
2064 	if (__predict_false(err == EFBIG)) {
2065 		/* Too many segments in the chain.  Try
2066 		   to defrag */
2067 		m_tmp = m_defrag(m, M_NOWAIT);
2068 		if (m_tmp == NULL) {
2069 			goto drop;
2070 		}
2071 		ss->tx.defrag++;
2072 		m = m_tmp;
2073 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2074 					      tx->info[idx].map,
2075 					      m, tx->seg_list, &cnt,
2076 					      BUS_DMA_NOWAIT);
2077 	}
2078 	if (__predict_false(err != 0)) {
2079 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2080 			      " packet len = %d\n", err, m->m_pkthdr.len);
2081 		goto drop;
2082 	}
2083 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2084 			BUS_DMASYNC_PREWRITE);
2085 	tx->info[idx].m = m;
2086 
2087 #if IFCAP_TSO4
	/* TSO is different enough that we handle it in another routine */
2089 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2090 		mxge_encap_tso(ss, m, cnt, &pi);
2091 		return;
2092 	}
2093 #endif
2094 
2095 	req = tx->req_list;
2096 	cksum_offset = 0;
2097 	pseudo_hdr_offset = 0;
2098 	flags = MXGEFW_FLAGS_NO_TSO;
2099 
2100 	/* checksum offloading? */
2101 	if (m->m_pkthdr.csum_flags &
2102 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		/* the IP header was already located (and copied to a
		   scratch buffer if needed) by mxge_parse_tx() */
2105 		cksum_offset = pi.ip_off + pi.ip_hlen;
		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2107 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2108 		req->cksum_offset = cksum_offset;
2109 		flags |= MXGEFW_FLAGS_CKSUM;
2110 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2111 	} else {
2112 		odd_flag = 0;
2113 	}
2114 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2115 		flags |= MXGEFW_FLAGS_SMALL;
2116 
2117 	/* convert segments into a request list */
2118 	cum_len = 0;
2119 	seg = tx->seg_list;
2120 	req->flags = MXGEFW_FLAGS_FIRST;
2121 	for (i = 0; i < cnt; i++) {
2122 		req->addr_low =
2123 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2124 		req->addr_high =
2125 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2126 		req->length = htobe16(seg->ds_len);
2127 		req->cksum_offset = cksum_offset;
2128 		if (cksum_offset > seg->ds_len)
2129 			cksum_offset -= seg->ds_len;
2130 		else
2131 			cksum_offset = 0;
2132 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2133 		req->pad = 0; /* complete solid 16-byte block */
2134 		req->rdma_count = 1;
2135 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2136 		cum_len += seg->ds_len;
2137 		seg++;
2138 		req++;
2139 		req->flags = 0;
2140 	}
2141 	req--;
	/* pad runts to 60 bytes (ETHER_MIN_LEN less the 4-byte FCS,
	   which the NIC appends) */
2143 	if (cum_len < 60) {
2144 		req++;
2145 		req->addr_low =
2146 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2147 		req->addr_high =
2148 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2149 		req->length = htobe16(60 - cum_len);
2150 		req->cksum_offset = 0;
2151 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2152 		req->pad = 0; /* complete solid 16-byte block */
2153 		req->rdma_count = 1;
2154 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2155 		cnt++;
2156 	}
2157 
2158 	tx->req_list[0].rdma_count = cnt;
2159 #if 0
2160 	/* print what the firmware will see */
2161 	for (i = 0; i < cnt; i++) {
2162 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2163 		    "cso:%d, flags:0x%x, rdma:%d\n",
2164 		    i, (int)ntohl(tx->req_list[i].addr_high),
2165 		    (int)ntohl(tx->req_list[i].addr_low),
2166 		    (int)ntohs(tx->req_list[i].length),
2167 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2168 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2169 		    tx->req_list[i].rdma_count);
2170 	}
2171 	printf("--------------\n");
2172 #endif
2173 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2174 	mxge_submit_req(tx, tx->req_list, cnt);
2175 #ifdef IFNET_BUF_RING
2176 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2177 		/* tell the NIC to start polling this slice */
2178 		*tx->send_go = 1;
2179 		tx->queue_active = 1;
2180 		tx->activate++;
2181 		wmb();
2182 	}
2183 #endif
2184 	return;
2185 
2186 drop:
2187 	m_freem(m);
2188 drop_without_m:
2189 	ss->oerrors++;
2190 	return;
2191 }
2192 
2193 #ifdef IFNET_BUF_RING
2194 static void
2195 mxge_qflush(struct ifnet *ifp)
2196 {
2197 	mxge_softc_t *sc = ifp->if_softc;
2198 	mxge_tx_ring_t *tx;
2199 	struct mbuf *m;
2200 	int slice;
2201 
2202 	for (slice = 0; slice < sc->num_slices; slice++) {
2203 		tx = &sc->ss[slice].tx;
2204 		mtx_lock(&tx->mtx);
2205 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2206 			m_freem(m);
2207 		mtx_unlock(&tx->mtx);
2208 	}
2209 	if_qflush(ifp);
2210 }
2211 
2212 static inline void
2213 mxge_start_locked(struct mxge_slice_state *ss)
2214 {
2215 	mxge_softc_t *sc;
2216 	struct mbuf *m;
2217 	struct ifnet *ifp;
2218 	mxge_tx_ring_t *tx;
2219 
2220 	sc = ss->sc;
2221 	ifp = sc->ifp;
2222 	tx = &ss->tx;
2223 
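	/*
	 * (tx->req - tx->done) is the count of descriptors still in
	 * flight; stop dequeuing once free slots drop to max_desc or
	 * fewer, so a maximally fragmented frame always fits.
	 */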
2224 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2225 		m = drbr_dequeue(ifp, tx->br);
2226 		if (m == NULL) {
2227 			return;
2228 		}
2229 		/* let BPF see it */
2230 		BPF_MTAP(ifp, m);
2231 
2232 		/* give it to the nic */
2233 		mxge_encap(ss, m);
2234 	}
2235 	/* ran out of transmit slots */
2236 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2237 	    && (!drbr_empty(ifp, tx->br))) {
2238 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2239 		tx->stall++;
2240 	}
2241 }
2242 
2243 static int
2244 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2245 {
2246 	mxge_softc_t *sc;
2247 	struct ifnet *ifp;
2248 	mxge_tx_ring_t *tx;
2249 	int err;
2250 
2251 	sc = ss->sc;
2252 	ifp = sc->ifp;
2253 	tx = &ss->tx;
2254 
2255 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2256 	    IFF_DRV_RUNNING) {
2257 		err = drbr_enqueue(ifp, tx->br, m);
2258 		return (err);
2259 	}
2260 
2261 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2262 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2263 		/* let BPF see it */
2264 		BPF_MTAP(ifp, m);
2265 		/* give it to the nic */
2266 		mxge_encap(ss, m);
2267 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2268 		return (err);
2269 	}
2270 	if (!drbr_empty(ifp, tx->br))
2271 		mxge_start_locked(ss);
2272 	return (0);
2273 }
2274 
2275 static int
2276 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2277 {
2278 	mxge_softc_t *sc = ifp->if_softc;
2279 	struct mxge_slice_state *ss;
2280 	mxge_tx_ring_t *tx;
2281 	int err = 0;
2282 	int slice;
2283 
2284 	slice = m->m_pkthdr.flowid;
2285 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2286 
2287 	ss = &sc->ss[slice];
2288 	tx = &ss->tx;
2289 
2290 	if (mtx_trylock(&tx->mtx)) {
2291 		err = mxge_transmit_locked(ss, m);
2292 		mtx_unlock(&tx->mtx);
2293 	} else {
2294 		err = drbr_enqueue(ifp, tx->br, m);
2295 	}
2296 
2297 	return (err);
2298 }
2299 
2300 #else
2301 
2302 static inline void
2303 mxge_start_locked(struct mxge_slice_state *ss)
2304 {
2305 	mxge_softc_t *sc;
2306 	struct mbuf *m;
2307 	struct ifnet *ifp;
2308 	mxge_tx_ring_t *tx;
2309 
2310 	sc = ss->sc;
2311 	ifp = sc->ifp;
2312 	tx = &ss->tx;
2313 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2314 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2315 		if (m == NULL) {
2316 			return;
2317 		}
2318 		/* let BPF see it */
2319 		BPF_MTAP(ifp, m);
2320 
2321 		/* give it to the nic */
2322 		mxge_encap(ss, m);
2323 	}
2324 	/* ran out of transmit slots */
2325 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2326 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2327 		tx->stall++;
2328 	}
2329 }
2330 #endif
2331 static void
2332 mxge_start(struct ifnet *ifp)
2333 {
2334 	mxge_softc_t *sc = ifp->if_softc;
2335 	struct mxge_slice_state *ss;
2336 
2337 	/* only use the first slice for now */
2338 	ss = &sc->ss[0];
2339 	mtx_lock(&ss->tx.mtx);
2340 	mxge_start_locked(ss);
2341 	mtx_unlock(&ss->tx.mtx);
2342 }
2343 
2344 /*
2345  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2346  * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's low
2348  * DMA address to mark it valid only after we write the entire chunk
2349  * in a burst
2350  */
2351 static inline void
2352 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2353 		mcp_kreq_ether_recv_t *src)
2354 {
2355 	uint32_t low;
2356 
2357 	low = src->addr_low;
2358 	src->addr_low = 0xffffffff;
2359 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2360 	wmb();
2361 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2362 	wmb();
2363 	src->addr_low = low;
2364 	dst->addr_low = low;
2365 	wmb();
2366 }
2367 
2368 static int
2369 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2370 {
2371 	bus_dma_segment_t seg;
2372 	struct mbuf *m;
2373 	mxge_rx_ring_t *rx = &ss->rx_small;
2374 	int cnt, err;
2375 
2376 	m = m_gethdr(M_NOWAIT, MT_DATA);
2377 	if (m == NULL) {
2378 		rx->alloc_fail++;
2379 		err = ENOBUFS;
2380 		goto done;
2381 	}
2382 	m->m_len = MHLEN;
2383 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2384 				      &seg, &cnt, BUS_DMA_NOWAIT);
2385 	if (err != 0) {
2386 		m_free(m);
2387 		goto done;
2388 	}
2389 	rx->info[idx].m = m;
2390 	rx->shadow[idx].addr_low =
2391 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2392 	rx->shadow[idx].addr_high =
2393 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2394 
2395 done:
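	/* receive descriptors are handed to the NIC in bursts of 8;
	   every 8th buffer posts the previous 8 via mxge_submit_8rx() */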
2396 	if ((idx & 7) == 7)
2397 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2398 	return err;
2399 }
2400 
2401 static int
2402 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2403 {
2404 	bus_dma_segment_t seg[3];
2405 	struct mbuf *m;
2406 	mxge_rx_ring_t *rx = &ss->rx_big;
2407 	int cnt, err, i;
2408 
2409 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2410 	if (m == NULL) {
2411 		rx->alloc_fail++;
2412 		err = ENOBUFS;
2413 		goto done;
2414 	}
2415 	m->m_len = rx->mlen;
2416 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2417 				      seg, &cnt, BUS_DMA_NOWAIT);
2418 	if (err != 0) {
2419 		m_free(m);
2420 		goto done;
2421 	}
2422 	rx->info[idx].m = m;
2423 	rx->shadow[idx].addr_low =
2424 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2425 	rx->shadow[idx].addr_high =
2426 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2427 
2428 #if MXGE_VIRT_JUMBOS
2429 	for (i = 1; i < cnt; i++) {
2430 		rx->shadow[idx + i].addr_low =
2431 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2432 		rx->shadow[idx + i].addr_high =
2433 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2434        }
2435 #endif
2436 
2437 done:
	for (i = 0; i < rx->nbufs; i++) {
2439 		if ((idx & 7) == 7) {
2440 			mxge_submit_8rx(&rx->lanai[idx - 7],
2441 					&rx->shadow[idx - 7]);
2442 		}
2443 		idx++;
2444 	}
2445 	return err;
2446 }
2447 
2448 #ifdef INET6
2449 
2450 static uint16_t
2451 mxge_csum_generic(uint16_t *raw, int len)
2452 {
2453 	uint32_t csum;
2454 
2455 	csum = 0;
2456 	while (len > 0) {
2457 		csum += *raw;
2458 		raw++;
2459 		len -= 2;
2460 	}
2461 	csum = (csum >> 16) + (csum & 0xffff);
2462 	csum = (csum >> 16) + (csum & 0xffff);
2463 	return (uint16_t)csum;
2464 }
2465 
2466 static inline uint16_t
2467 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2468 {
2469 	uint32_t partial;
2470 	int nxt, cksum_offset;
2471 	struct ip6_hdr *ip6 = p;
2472 	uint16_t c;
2473 
2474 	nxt = ip6->ip6_nxt;
2475 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2476 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2477 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2478 					   IPPROTO_IPV6, &nxt);
2479 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2480 			return (1);
2481 	}
2482 
2483 	/*
2484 	 * IPv6 headers do not contain a checksum, and hence
2485 	 * do not checksum to zero, so they don't "fall out"
2486 	 * of the partial checksum calculation like IPv4
2487 	 * headers do.  We need to fix the partial checksum by
2488 	 * subtracting the checksum of the IPv6 header.
2489 	 */
2490 
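	/*
	 * One's-complement subtraction: adding ~partial subtracts
	 * partial; the (csum < ~partial) test re-adds the end-around
	 * carry when the 32-bit addition wraps.
	 */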
2491 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2492 				    ETHER_HDR_LEN);
2493 	csum += ~partial;
2494 	csum +=	 (csum < ~partial);
2495 	csum = (csum >> 16) + (csum & 0xFFFF);
2496 	csum = (csum >> 16) + (csum & 0xFFFF);
2497 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2498 			     csum);
2499 	c ^= 0xffff;
2500 	return (c);
2501 }
2502 #endif /* INET6 */
2503 /*
2504  *  Myri10GE hardware checksums are not valid if the sender
2505  *  padded the frame with non-zero padding.  This is because
2506  *  the firmware just does a simple 16-bit 1s complement
2507  *  checksum across the entire frame, excluding the first 14
 *  bytes.  It is best to simply check the checksum and
2509  *  tell the stack about it only if the checksum is good
2510  */
2511 
2512 static inline uint16_t
2513 mxge_rx_csum(struct mbuf *m, int csum)
2514 {
2515 	struct ether_header *eh;
2516 #ifdef INET
2517 	struct ip *ip;
2518 #endif
2519 #if defined(INET) || defined(INET6)
2520 	int cap = m->m_pkthdr.rcvif->if_capenable;
2521 #endif
2522 	uint16_t c, etype;
2523 
2524 	eh = mtod(m, struct ether_header *);
2525 	etype = ntohs(eh->ether_type);
2526 	switch (etype) {
2527 #ifdef INET
2528 	case ETHERTYPE_IP:
2529 		if ((cap & IFCAP_RXCSUM) == 0)
2530 			return (1);
2531 		ip = (struct ip *)(eh + 1);
2532 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2533 			return (1);
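		/*
		 * A valid IPv4 header sums to 0xffff and so drops out
		 * of the NIC's one's-complement sum over the frame;
		 * adding the pseudo-header should then give 0xffff for
		 * a good packet, hence the final XOR against 0xffff.
		 */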
2534 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2535 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2536 				    (ip->ip_hl << 2) + ip->ip_p));
2537 		c ^= 0xffff;
2538 		break;
2539 #endif
2540 #ifdef INET6
2541 	case ETHERTYPE_IPV6:
2542 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2543 			return (1);
2544 		c = mxge_rx_csum6((eh + 1), m, csum);
2545 		break;
2546 #endif
2547 	default:
2548 		c = 1;
2549 	}
2550 	return (c);
2551 }
2552 
2553 static void
2554 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2555 {
2556 	struct ether_vlan_header *evl;
2557 	uint32_t partial;
2558 
2559 	evl = mtod(m, struct ether_vlan_header *);
2560 
2561 	/*
2562 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2563 	 * after what the firmware thought was the end of the ethernet
2564 	 * header.
2565 	 */
2566 
2567 	/* put checksum into host byte order */
2568 	*csum = ntohs(*csum);
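	/* one's-complement subtract the 4 encapsulation bytes,
	   re-adding the end-around carry when the sum wraps */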
2569 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2570 	(*csum) += ~partial;
2571 	(*csum) +=  ((*csum) < ~partial);
2572 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2573 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2574 
2575 	/* restore checksum to network byte order;
2576 	   later consumers expect this */
2577 	*csum = htons(*csum);
2578 
2579 	/* save the tag */
2580 #ifdef MXGE_NEW_VLAN_API
2581 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2582 #else
2583 	{
2584 		struct m_tag *mtag;
2585 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2586 				   M_NOWAIT);
2587 		if (mtag == NULL)
2588 			return;
2589 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2590 		m_tag_prepend(m, mtag);
2591 	}
2592 
2593 #endif
2594 	m->m_flags |= M_VLANTAG;
2595 
2596 	/*
2597 	 * Remove the 802.1q header by copying the Ethernet
2598 	 * addresses over it and adjusting the beginning of
2599 	 * the data in the mbuf.  The encapsulated Ethernet
2600 	 * type field is already in place.
2601 	 */
2602 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2603 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2604 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2605 }
2606 
2607 static inline void
2608 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2609 		 uint32_t csum, int lro)
2610 {
2611 	mxge_softc_t *sc;
2612 	struct ifnet *ifp;
2613 	struct mbuf *m;
2614 	struct ether_header *eh;
2615 	mxge_rx_ring_t *rx;
2616 	bus_dmamap_t old_map;
2617 	int idx;
2618 
2619 	sc = ss->sc;
2620 	ifp = sc->ifp;
2621 	rx = &ss->rx_big;
2622 	idx = rx->cnt & rx->mask;
2623 	rx->cnt += rx->nbufs;
2624 	/* save a pointer to the received mbuf */
2625 	m = rx->info[idx].m;
2626 	/* try to replace the received mbuf */
2627 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2628 		/* drop the frame -- the old mbuf is re-cycled */
2629 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2630 		return;
2631 	}
2632 
2633 	/* unmap the received buffer */
2634 	old_map = rx->info[idx].map;
2635 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2636 	bus_dmamap_unload(rx->dmat, old_map);
2637 
2638 	/* swap the bus_dmamap_t's */
2639 	rx->info[idx].map = rx->extra_map;
2640 	rx->extra_map = old_map;
2641 
2642 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2643 	 * aligned */
2644 	m->m_data += MXGEFW_PAD;
2645 
2646 	m->m_pkthdr.rcvif = ifp;
2647 	m->m_len = m->m_pkthdr.len = len;
2648 	ss->ipackets++;
2649 	eh = mtod(m, struct ether_header *);
2650 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2651 		mxge_vlan_tag_remove(m, &csum);
2652 	}
2653 	/* flowid only valid if RSS hashing is enabled */
2654 	if (sc->num_slices > 1) {
2655 		m->m_pkthdr.flowid = (ss - sc->ss);
2656 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2657 	}
2658 	/* if the checksum is valid, mark it in the mbuf header */
2659 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2660 	    (0 == mxge_rx_csum(m, csum))) {
		/* Tell the stack that the checksum is good */
2662 		m->m_pkthdr.csum_data = 0xffff;
2663 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2664 			CSUM_DATA_VALID;
2665 
2666 #if defined(INET) || defined (INET6)
2667 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2668 			return;
2669 #endif
2670 	}
2671 	/* pass the frame up the stack */
2672 	(*ifp->if_input)(ifp, m);
2673 }
2674 
2675 static inline void
2676 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2677 		   uint32_t csum, int lro)
2678 {
2679 	mxge_softc_t *sc;
2680 	struct ifnet *ifp;
2681 	struct ether_header *eh;
2682 	struct mbuf *m;
2683 	mxge_rx_ring_t *rx;
2684 	bus_dmamap_t old_map;
2685 	int idx;
2686 
2687 	sc = ss->sc;
2688 	ifp = sc->ifp;
2689 	rx = &ss->rx_small;
2690 	idx = rx->cnt & rx->mask;
2691 	rx->cnt++;
2692 	/* save a pointer to the received mbuf */
2693 	m = rx->info[idx].m;
2694 	/* try to replace the received mbuf */
2695 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2696 		/* drop the frame -- the old mbuf is re-cycled */
2697 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2698 		return;
2699 	}
2700 
2701 	/* unmap the received buffer */
2702 	old_map = rx->info[idx].map;
2703 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2704 	bus_dmamap_unload(rx->dmat, old_map);
2705 
2706 	/* swap the bus_dmamap_t's */
2707 	rx->info[idx].map = rx->extra_map;
2708 	rx->extra_map = old_map;
2709 
2710 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2711 	 * aligned */
2712 	m->m_data += MXGEFW_PAD;
2713 
2714 	m->m_pkthdr.rcvif = ifp;
2715 	m->m_len = m->m_pkthdr.len = len;
2716 	ss->ipackets++;
2717 	eh = mtod(m, struct ether_header *);
2718 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2719 		mxge_vlan_tag_remove(m, &csum);
2720 	}
2721 	/* flowid only valid if RSS hashing is enabled */
2722 	if (sc->num_slices > 1) {
2723 		m->m_pkthdr.flowid = (ss - sc->ss);
2724 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2725 	}
2726 	/* if the checksum is valid, mark it in the mbuf header */
2727 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2728 	    (0 == mxge_rx_csum(m, csum))) {
		/* Tell the stack that the checksum is good */
2730 		m->m_pkthdr.csum_data = 0xffff;
2731 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2732 			CSUM_DATA_VALID;
2733 
2734 #if defined(INET) || defined (INET6)
2735 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2736 			return;
2737 #endif
2738 	}
2739 	/* pass the frame up the stack */
2740 	(*ifp->if_input)(ifp, m);
2741 }
2742 
2743 static inline void
2744 mxge_clean_rx_done(struct mxge_slice_state *ss)
2745 {
2746 	mxge_rx_done_t *rx_done = &ss->rx_done;
2747 	int limit = 0;
2748 	uint16_t length;
2749 	uint16_t checksum;
2750 	int lro;
2751 
2752 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2753 	while (rx_done->entry[rx_done->idx].length != 0) {
2754 		length = ntohs(rx_done->entry[rx_done->idx].length);
2755 		rx_done->entry[rx_done->idx].length = 0;
2756 		checksum = rx_done->entry[rx_done->idx].checksum;
2757 		if (length <= (MHLEN - MXGEFW_PAD))
2758 			mxge_rx_done_small(ss, length, checksum, lro);
2759 		else
2760 			mxge_rx_done_big(ss, length, checksum, lro);
2761 		rx_done->cnt++;
2762 		rx_done->idx = rx_done->cnt & rx_done->mask;
2763 
2764 		/* limit potential for livelock */
2765 		if (__predict_false(++limit > rx_done->mask / 2))
2766 			break;
2767 	}
2768 #if defined(INET)  || defined (INET6)
2769 	tcp_lro_flush_all(&ss->lc);
2770 #endif
2771 }
2772 
2773 static inline void
2774 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2775 {
2776 	struct ifnet *ifp __unused;
2777 	mxge_tx_ring_t *tx;
2778 	struct mbuf *m;
2779 	bus_dmamap_t map;
2780 	int idx;
2781 	int *flags;
2782 
2783 	tx = &ss->tx;
2784 	ifp = ss->sc->ifp;
2785 	while (tx->pkt_done != mcp_idx) {
2786 		idx = tx->done & tx->mask;
2787 		tx->done++;
2788 		m = tx->info[idx].m;
		/* the mbuf and DMA map are attached only to the first
		   descriptor of each packet */
2791 		if (m != NULL) {
2792 			ss->obytes += m->m_pkthdr.len;
2793 			if (m->m_flags & M_MCAST)
2794 				ss->omcasts++;
2795 			ss->opackets++;
2796 			tx->info[idx].m = NULL;
2797 			map = tx->info[idx].map;
2798 			bus_dmamap_unload(tx->dmat, map);
2799 			m_freem(m);
2800 		}
2801 		if (tx->info[idx].flag) {
2802 			tx->info[idx].flag = 0;
2803 			tx->pkt_done++;
2804 		}
2805 	}
2806 
	/* If we have space, clear IFF_OACTIVE to tell the stack that
	   it's OK to send packets */
2809 #ifdef IFNET_BUF_RING
2810 	flags = &ss->if_drv_flags;
2811 #else
2812 	flags = &ifp->if_drv_flags;
2813 #endif
2814 	mtx_lock(&ss->tx.mtx);
2815 	if ((*flags) & IFF_DRV_OACTIVE &&
2816 	    tx->req - tx->done < (tx->mask + 1)/4) {
2817 		*(flags) &= ~IFF_DRV_OACTIVE;
2818 		ss->tx.wake++;
2819 		mxge_start_locked(ss);
2820 	}
2821 #ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
		/* let the NIC stop polling this queue, since there
		 * are no more transmits pending */
		*tx->send_stop = 1;
		tx->queue_active = 0;
		tx->deactivate++;
		wmb();
	}
2832 #endif
2833 	mtx_unlock(&ss->tx.mtx);
2834 
2835 }
2836 
2837 static struct mxge_media_type mxge_xfp_media_types[] =
2838 {
2839 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2840 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2841 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2842 	{0,		(1 << 5),	"10GBASE-ER"},
2843 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2844 	{0,		(1 << 3),	"10GBASE-SW"},
2845 	{0,		(1 << 2),	"10GBASE-LW"},
2846 	{0,		(1 << 1),	"10GBASE-EW"},
2847 	{0,		(1 << 0),	"Reserved"}
2848 };
2849 static struct mxge_media_type mxge_sfp_media_types[] =
2850 {
2851 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2852 	{0,		(1 << 7),	"Reserved"},
2853 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2854 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2855 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2856 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2857 };
2858 
2859 static void
2860 mxge_media_set(mxge_softc_t *sc, int media_type)
2861 {
2862 
2863 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2864 		    0, NULL);
2865 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2866 	sc->current_media = media_type;
2867 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2868 }
2869 
2870 static void
2871 mxge_media_init(mxge_softc_t *sc)
2872 {
2873 	char *ptr;
2874 	int i;
2875 
2876 	ifmedia_removeall(&sc->media);
2877 	mxge_media_set(sc, IFM_AUTO);
2878 
2879 	/*
	 * parse the product code to determine the interface type
2881 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2882 	 * after the 3rd dash in the driver's cached copy of the
2883 	 * EEPROM's product code string.
2884 	 */
2885 	ptr = sc->product_code_string;
2886 	if (ptr == NULL) {
2887 		device_printf(sc->dev, "Missing product code\n");
2888 		return;
2889 	}
2890 
2891 	for (i = 0; i < 3; i++, ptr++) {
2892 		ptr = strchr(ptr, '-');
2893 		if (ptr == NULL) {
2894 			device_printf(sc->dev,
2895 				      "only %d dashes in PC?!?\n", i);
2896 			return;
2897 		}
2898 	}
	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2900 		/* -C is CX4 */
2901 		sc->connector = MXGE_CX4;
2902 		mxge_media_set(sc, IFM_10G_CX4);
2903 	} else if (*ptr == 'Q') {
2904 		/* -Q is Quad Ribbon Fiber */
2905 		sc->connector = MXGE_QRF;
2906 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2907 		/* FreeBSD has no media type for Quad ribbon fiber */
2908 	} else if (*ptr == 'R') {
2909 		/* -R is XFP */
2910 		sc->connector = MXGE_XFP;
	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2912 		/* -S or -2S is SFP+ */
2913 		sc->connector = MXGE_SFP;
2914 	} else {
2915 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2916 	}
2917 }
2918 
2919 /*
2920  * Determine the media type for a NIC.  Some XFPs will identify
2921  * themselves only when their link is up, so this is initiated via a
2922  * link up interrupt.  However, this can potentially take up to
2923  * several milliseconds, so it is run via the watchdog routine, rather
2924  * than in the interrupt handler itself.
2925  */
2926 static void
2927 mxge_media_probe(mxge_softc_t *sc)
2928 {
2929 	mxge_cmd_t cmd;
2930 	char *cage_type;
2931 
2932 	struct mxge_media_type *mxge_media_types = NULL;
2933 	int i, err, ms, mxge_media_type_entries;
2934 	uint32_t byte;
2935 
2936 	sc->need_media_probe = 0;
2937 
2938 	if (sc->connector == MXGE_XFP) {
2939 		/* -R is XFP */
2940 		mxge_media_types = mxge_xfp_media_types;
2941 		mxge_media_type_entries =
2942 			nitems(mxge_xfp_media_types);
2943 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2944 		cage_type = "XFP";
2945 	} else 	if (sc->connector == MXGE_SFP) {
2946 		/* -S or -2S is SFP+ */
2947 		mxge_media_types = mxge_sfp_media_types;
2948 		mxge_media_type_entries =
2949 			nitems(mxge_sfp_media_types);
2950 		cage_type = "SFP+";
2951 		byte = 3;
2952 	} else {
2953 		/* nothing to do; media type cannot change */
2954 		return;
2955 	}
2956 
2957 	/*
	 * At this point we know the NIC has a module cage (XFP or
	 * SFP+), so now we try to determine what is in the cage by
	 * using the firmware's I2C commands to read the module's
	 * 10GbE compliance register.  We read just one byte, which
	 * may take over a millisecond.
2963 	 */
2964 
2965 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2966 	cmd.data1 = byte;
2967 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2968 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2969 		device_printf(sc->dev, "failed to read XFP\n");
2970 	}
2971 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2972 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2973 	}
2974 	if (err != MXGEFW_CMD_OK) {
2975 		return;
2976 	}
2977 
2978 	/* now we wait for the data to be cached */
2979 	cmd.data0 = byte;
2980 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2981 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2982 		DELAY(1000);
2983 		cmd.data0 = byte;
2984 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2985 	}
2986 	if (err != MXGEFW_CMD_OK) {
2987 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2988 			      cage_type, err, ms);
2989 		return;
2990 	}
2991 
2992 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2993 		if (mxge_verbose)
2994 			device_printf(sc->dev, "%s:%s\n", cage_type,
2995 				      mxge_media_types[0].name);
2996 		if (sc->current_media != mxge_media_types[0].flag) {
2997 			mxge_media_init(sc);
2998 			mxge_media_set(sc, mxge_media_types[0].flag);
2999 		}
3000 		return;
3001 	}
3002 	for (i = 1; i < mxge_media_type_entries; i++) {
3003 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3004 			if (mxge_verbose)
3005 				device_printf(sc->dev, "%s:%s\n",
3006 					      cage_type,
3007 					      mxge_media_types[i].name);
3008 
3009 			if (sc->current_media != mxge_media_types[i].flag) {
3010 				mxge_media_init(sc);
3011 				mxge_media_set(sc, mxge_media_types[i].flag);
3012 			}
3013 			return;
3014 		}
3015 	}
3016 	if (mxge_verbose)
3017 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3018 			      cage_type, cmd.data0);
3019 
3020 	return;
3021 }
3022 
3023 static void
3024 mxge_intr(void *arg)
3025 {
3026 	struct mxge_slice_state *ss = arg;
3027 	mxge_softc_t *sc = ss->sc;
3028 	mcp_irq_data_t *stats = ss->fw_stats;
3029 	mxge_tx_ring_t *tx = &ss->tx;
3030 	mxge_rx_done_t *rx_done = &ss->rx_done;
3031 	uint32_t send_done_count;
3032 	uint8_t valid;
3033 
3034 #ifndef IFNET_BUF_RING
3035 	/* an interrupt on a non-zero slice is implicitly valid
3036 	   since MSI-X irqs are not shared */
3037 	if (ss != sc->ss) {
3038 		mxge_clean_rx_done(ss);
3039 		*ss->irq_claim = be32toh(3);
3040 		return;
3041 	}
3042 #endif
3043 
3044 	/* make sure the DMA has finished */
3045 	if (!stats->valid) {
3046 		return;
3047 	}
3048 	valid = stats->valid;
3049 
3050 	if (sc->legacy_irq) {
3051 		/* lower legacy IRQ  */
3052 		*sc->irq_deassert = 0;
3053 		if (!mxge_deassert_wait)
			/* don't wait for confirmation that the irq is low */
3055 			stats->valid = 0;
3056 	} else {
3057 		stats->valid = 0;
3058 	}
3059 
3060 	/* loop while waiting for legacy irq deassertion */
3061 	do {
3062 		/* check for transmit completes and receives */
3063 		send_done_count = be32toh(stats->send_done_count);
3064 		while ((send_done_count != tx->pkt_done) ||
3065 		       (rx_done->entry[rx_done->idx].length != 0)) {
3066 			if (send_done_count != tx->pkt_done)
3067 				mxge_tx_done(ss, (int)send_done_count);
3068 			mxge_clean_rx_done(ss);
3069 			send_done_count = be32toh(stats->send_done_count);
3070 		}
3071 		if (sc->legacy_irq && mxge_deassert_wait)
3072 			wmb();
3073 	} while (*((volatile uint8_t *) &stats->valid));
3074 
3075 	/* fw link & error stats meaningful only on the first slice */
3076 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3077 		if (sc->link_state != stats->link_up) {
3078 			sc->link_state = stats->link_up;
3079 			if (sc->link_state) {
3080 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3081 				if (mxge_verbose)
3082 					device_printf(sc->dev, "link up\n");
3083 			} else {
3084 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3085 				if (mxge_verbose)
3086 					device_printf(sc->dev, "link down\n");
3087 			}
3088 			sc->need_media_probe = 1;
3089 		}
3090 		if (sc->rdma_tags_available !=
3091 		    be32toh(stats->rdma_tags_available)) {
3092 			sc->rdma_tags_available =
3093 				be32toh(stats->rdma_tags_available);
3094 			device_printf(sc->dev, "RDMA timed out! %d tags "
3095 				      "left\n", sc->rdma_tags_available);
3096 		}
3097 
3098 		if (stats->link_down) {
3099 			sc->down_cnt += stats->link_down;
3100 			sc->link_state = 0;
3101 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3102 		}
3103 	}
3104 
3105 	/* check to see if we have rx token to pass back */
3106 	if (valid & 0x1)
3107 	    *ss->irq_claim = be32toh(3);
3108 	*(ss->irq_claim + 1) = be32toh(3);
3109 }
3110 
3111 static void
3112 mxge_init(void *arg)
3113 {
3114 	mxge_softc_t *sc = arg;
3115 	struct ifnet *ifp = sc->ifp;
3116 
3117 	mtx_lock(&sc->driver_mtx);
3118 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3119 		(void) mxge_open(sc);
3120 	mtx_unlock(&sc->driver_mtx);
3121 }
3122 
3123 static void
3124 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3125 {
3126 	int i;
3127 
3128 #if defined(INET) || defined(INET6)
3129 	tcp_lro_free(&ss->lc);
3130 #endif
3131 	for (i = 0; i <= ss->rx_big.mask; i++) {
3132 		if (ss->rx_big.info[i].m == NULL)
3133 			continue;
3134 		bus_dmamap_unload(ss->rx_big.dmat,
3135 				  ss->rx_big.info[i].map);
3136 		m_freem(ss->rx_big.info[i].m);
3137 		ss->rx_big.info[i].m = NULL;
3138 	}
3139 
3140 	for (i = 0; i <= ss->rx_small.mask; i++) {
3141 		if (ss->rx_small.info[i].m == NULL)
3142 			continue;
3143 		bus_dmamap_unload(ss->rx_small.dmat,
3144 				  ss->rx_small.info[i].map);
3145 		m_freem(ss->rx_small.info[i].m);
3146 		ss->rx_small.info[i].m = NULL;
3147 	}
3148 
3149 	/* transmit ring used only on the first slice */
3150 	if (ss->tx.info == NULL)
3151 		return;
3152 
3153 	for (i = 0; i <= ss->tx.mask; i++) {
3154 		ss->tx.info[i].flag = 0;
3155 		if (ss->tx.info[i].m == NULL)
3156 			continue;
3157 		bus_dmamap_unload(ss->tx.dmat,
3158 				  ss->tx.info[i].map);
3159 		m_freem(ss->tx.info[i].m);
3160 		ss->tx.info[i].m = NULL;
3161 	}
3162 }
3163 
3164 static void
3165 mxge_free_mbufs(mxge_softc_t *sc)
3166 {
3167 	int slice;
3168 
3169 	for (slice = 0; slice < sc->num_slices; slice++)
3170 		mxge_free_slice_mbufs(&sc->ss[slice]);
3171 }
3172 
3173 static void
3174 mxge_free_slice_rings(struct mxge_slice_state *ss)
3175 {
3176 	int i;
3177 
3178 	if (ss->rx_done.entry != NULL)
3179 		mxge_dma_free(&ss->rx_done.dma);
3180 	ss->rx_done.entry = NULL;
3181 
3182 	if (ss->tx.req_bytes != NULL)
3183 		free(ss->tx.req_bytes, M_DEVBUF);
3184 	ss->tx.req_bytes = NULL;
3185 
3186 	if (ss->tx.seg_list != NULL)
3187 		free(ss->tx.seg_list, M_DEVBUF);
3188 	ss->tx.seg_list = NULL;
3189 
3190 	if (ss->rx_small.shadow != NULL)
3191 		free(ss->rx_small.shadow, M_DEVBUF);
3192 	ss->rx_small.shadow = NULL;
3193 
3194 	if (ss->rx_big.shadow != NULL)
3195 		free(ss->rx_big.shadow, M_DEVBUF);
3196 	ss->rx_big.shadow = NULL;
3197 
3198 	if (ss->tx.info != NULL) {
3199 		if (ss->tx.dmat != NULL) {
3200 			for (i = 0; i <= ss->tx.mask; i++) {
3201 				bus_dmamap_destroy(ss->tx.dmat,
3202 						   ss->tx.info[i].map);
3203 			}
3204 			bus_dma_tag_destroy(ss->tx.dmat);
3205 		}
3206 		free(ss->tx.info, M_DEVBUF);
3207 	}
3208 	ss->tx.info = NULL;
3209 
3210 	if (ss->rx_small.info != NULL) {
3211 		if (ss->rx_small.dmat != NULL) {
3212 			for (i = 0; i <= ss->rx_small.mask; i++) {
3213 				bus_dmamap_destroy(ss->rx_small.dmat,
3214 						   ss->rx_small.info[i].map);
3215 			}
3216 			bus_dmamap_destroy(ss->rx_small.dmat,
3217 					   ss->rx_small.extra_map);
3218 			bus_dma_tag_destroy(ss->rx_small.dmat);
3219 		}
3220 		free(ss->rx_small.info, M_DEVBUF);
3221 	}
3222 	ss->rx_small.info = NULL;
3223 
3224 	if (ss->rx_big.info != NULL) {
3225 		if (ss->rx_big.dmat != NULL) {
3226 			for (i = 0; i <= ss->rx_big.mask; i++) {
3227 				bus_dmamap_destroy(ss->rx_big.dmat,
3228 						   ss->rx_big.info[i].map);
3229 			}
3230 			bus_dmamap_destroy(ss->rx_big.dmat,
3231 					   ss->rx_big.extra_map);
3232 			bus_dma_tag_destroy(ss->rx_big.dmat);
3233 		}
3234 		free(ss->rx_big.info, M_DEVBUF);
3235 	}
3236 	ss->rx_big.info = NULL;
3237 }
3238 
3239 static void
3240 mxge_free_rings(mxge_softc_t *sc)
3241 {
3242 	int slice;
3243 
3244 	for (slice = 0; slice < sc->num_slices; slice++)
3245 		mxge_free_slice_rings(&sc->ss[slice]);
3246 }
3247 
3248 static int
3249 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3250 		       int tx_ring_entries)
3251 {
3252 	mxge_softc_t *sc = ss->sc;
3253 	size_t bytes;
3254 	int err, i;
3255 
3256 	/* allocate per-slice receive resources */
3257 
3258 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
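	/* the rx_done (completion) ring is shared by the small and
	   big rx rings, hence twice the entries */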
3259 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3260 
3261 	/* allocate the rx shadow rings */
3262 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3263 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3264 
3265 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3266 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3267 
3268 	/* allocate the rx host info rings */
3269 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3270 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3271 
3272 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3273 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3274 
3275 	/* allocate the rx busdma resources */
3276 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3277 				 1,			/* alignment */
3278 				 4096,			/* boundary */
3279 				 BUS_SPACE_MAXADDR,	/* low */
3280 				 BUS_SPACE_MAXADDR,	/* high */
3281 				 NULL, NULL,		/* filter */
3282 				 MHLEN,			/* maxsize */
3283 				 1,			/* num segs */
3284 				 MHLEN,			/* maxsegsize */
3285 				 BUS_DMA_ALLOCNOW,	/* flags */
3286 				 NULL, NULL,		/* lock */
3287 				 &ss->rx_small.dmat);	/* tag */
3288 	if (err != 0) {
3289 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3290 			      err);
3291 		return err;
3292 	}
3293 
3294 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3295 				 1,			/* alignment */
3296 #if MXGE_VIRT_JUMBOS
3297 				 4096,			/* boundary */
3298 #else
3299 				 0,			/* boundary */
3300 #endif
3301 				 BUS_SPACE_MAXADDR,	/* low */
3302 				 BUS_SPACE_MAXADDR,	/* high */
3303 				 NULL, NULL,		/* filter */
3304 				 3*4096,		/* maxsize */
3305 #if MXGE_VIRT_JUMBOS
3306 				 3,			/* num segs */
3307 				 4096,			/* maxsegsize*/
3308 #else
3309 				 1,			/* num segs */
3310 				 MJUM9BYTES,		/* maxsegsize*/
3311 #endif
3312 				 BUS_DMA_ALLOCNOW,	/* flags */
3313 				 NULL, NULL,		/* lock */
3314 				 &ss->rx_big.dmat);	/* tag */
3315 	if (err != 0) {
3316 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3317 			      err);
3318 		return err;
3319 	}
3320 	for (i = 0; i <= ss->rx_small.mask; i++) {
3321 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3322 					&ss->rx_small.info[i].map);
3323 		if (err != 0) {
3324 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3325 				      err);
3326 			return err;
3327 		}
3328 	}
3329 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3330 				&ss->rx_small.extra_map);
3331 	if (err != 0) {
3332 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3333 			      err);
3334 		return err;
3335 	}
3336 
3337 	for (i = 0; i <= ss->rx_big.mask; i++) {
3338 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3339 					&ss->rx_big.info[i].map);
3340 		if (err != 0) {
3341 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3342 				      err);
3343 			return err;
3344 		}
3345 	}
3346 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3347 				&ss->rx_big.extra_map);
3348 	if (err != 0) {
3349 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3350 			      err);
3351 		return err;
3352 	}
3353 
3354 	/* now allocate TX resources */
3355 
3356 #ifndef IFNET_BUF_RING
3357 	/* only use a single TX ring for now */
3358 	if (ss != ss->sc->ss)
3359 		return 0;
3360 #endif
3361 
3362 	ss->tx.mask = tx_ring_entries - 1;
3363 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3364 
3365 	/* allocate the tx request copy block */
3366 	bytes = 8 +
3367 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3368 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	/* ensure req_list entries are aligned to 8 bytes; the extra
	   8 bytes allocated above leave room to round up */
3370 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3371 		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
3372 
3373 	/* allocate the tx busdma segment list */
3374 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3375 	ss->tx.seg_list = (bus_dma_segment_t *)
3376 		malloc(bytes, M_DEVBUF, M_WAITOK);
3377 
3378 	/* allocate the tx host info ring */
3379 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3380 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3381 
3382 	/* allocate the tx busdma resources */
3383 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3384 				 1,			/* alignment */
3385 				 sc->tx_boundary,	/* boundary */
3386 				 BUS_SPACE_MAXADDR,	/* low */
3387 				 BUS_SPACE_MAXADDR,	/* high */
3388 				 NULL, NULL,		/* filter */
3389 				 65536 + 256,		/* maxsize */
3390 				 ss->tx.max_desc - 2,	/* num segs */
3391 				 sc->tx_boundary,	/* maxsegsz */
3392 				 BUS_DMA_ALLOCNOW,	/* flags */
3393 				 NULL, NULL,		/* lock */
3394 				 &ss->tx.dmat);		/* tag */
3395 
3396 	if (err != 0) {
3397 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3398 			      err);
3399 		return err;
3400 	}
3401 
	/* now use these tags to set up dmamaps for each slot
3403 	   in the ring */
3404 	for (i = 0; i <= ss->tx.mask; i++) {
3405 		err = bus_dmamap_create(ss->tx.dmat, 0,
3406 					&ss->tx.info[i].map);
3407 		if (err != 0) {
3408 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3409 				      err);
3410 			return err;
3411 		}
3412 	}
3413 	return 0;
3414 
3415 }
3416 
3417 static int
3418 mxge_alloc_rings(mxge_softc_t *sc)
3419 {
3420 	mxge_cmd_t cmd;
3421 	int tx_ring_size;
3422 	int tx_ring_entries, rx_ring_entries;
3423 	int err, slice;
3424 
3425 	/* get ring sizes */
3426 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3427 	tx_ring_size = cmd.data0;
3428 	if (err != 0) {
3429 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3430 		goto abort;
3431 	}
3432 
3433 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3434 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3435 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3436 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3437 	IFQ_SET_READY(&sc->ifp->if_snd);
3438 
3439 	for (slice = 0; slice < sc->num_slices; slice++) {
3440 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3441 					     rx_ring_entries,
3442 					     tx_ring_entries);
3443 		if (err != 0)
3444 			goto abort;
3445 	}
3446 	return 0;
3447 
3448 abort:
3449 	mxge_free_rings(sc);
3450 	return err;
3451 
3452 }
3453 
3454 static void
3455 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3456 {
3457 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3458 
3459 	if (bufsize < MCLBYTES) {
3460 		/* easy, everything fits in a single buffer */
3461 		*big_buf_size = MCLBYTES;
3462 		*cl_size = MCLBYTES;
3463 		*nbufs = 1;
3464 		return;
3465 	}
3466 
3467 	if (bufsize < MJUMPAGESIZE) {
3468 		/* still easy, everything still fits in a single buffer */
3469 		*big_buf_size = MJUMPAGESIZE;
3470 		*cl_size = MJUMPAGESIZE;
3471 		*nbufs = 1;
3472 		return;
3473 	}
3474 #if MXGE_VIRT_JUMBOS
3475 	/* now we need to use virtually contiguous buffers */
3476 	*cl_size = MJUM9BYTES;
3477 	*big_buf_size = 4096;
3478 	*nbufs = mtu / 4096 + 1;
3479 	/* needs to be a power of two, so round up */
3480 	if (*nbufs == 3)
3481 		*nbufs = 4;
3482 #else
3483 	*cl_size = MJUM9BYTES;
3484 	*big_buf_size = MJUM9BYTES;
3485 	*nbufs = 1;
3486 #endif
3487 }
3488 
3489 static int
3490 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3491 {
3492 	mxge_softc_t *sc;
3493 	mxge_cmd_t cmd;
3494 	bus_dmamap_t map;
3495 	int err, i, slice;
3496 
3497 	sc = ss->sc;
3498 	slice = ss - sc->ss;
3499 
3500 #if defined(INET) || defined(INET6)
3501 	(void)tcp_lro_init(&ss->lc);
3502 #endif
3503 	ss->lc.ifp = sc->ifp;
3504 
3505 	/* get the lanai pointers to the send and receive rings */
3506 
3507 	err = 0;
3508 #ifndef IFNET_BUF_RING
3509 	/* We currently only send from the first slice */
3510 	if (slice == 0) {
3511 #endif
3512 		cmd.data0 = slice;
3513 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3514 		ss->tx.lanai =
3515 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3516 		ss->tx.send_go = (volatile uint32_t *)
3517 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3518 		ss->tx.send_stop = (volatile uint32_t *)
3519 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3520 #ifndef IFNET_BUF_RING
3521 	}
3522 #endif
3523 	cmd.data0 = slice;
3524 	err |= mxge_send_cmd(sc,
3525 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3526 	ss->rx_small.lanai =
3527 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3528 	cmd.data0 = slice;
3529 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3530 	ss->rx_big.lanai =
3531 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3532 
3533 	if (err != 0) {
3534 		device_printf(sc->dev,
3535 			      "failed to get ring sizes or locations\n");
3536 		return EIO;
3537 	}
3538 
3539 	/* stock receive rings */
3540 	for (i = 0; i <= ss->rx_small.mask; i++) {
3541 		map = ss->rx_small.info[i].map;
3542 		err = mxge_get_buf_small(ss, map, i);
3543 		if (err) {
3544 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3545 				      i, ss->rx_small.mask + 1);
3546 			return ENOMEM;
3547 		}
3548 	}
3549 	for (i = 0; i <= ss->rx_big.mask; i++) {
3550 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3551 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3552 	}
3553 	ss->rx_big.nbufs = nbufs;
3554 	ss->rx_big.cl_size = cl_size;
3555 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3556 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3557 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3558 		map = ss->rx_big.info[i].map;
3559 		err = mxge_get_buf_big(ss, map, i);
3560 		if (err) {
3561 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3562 				      i, ss->rx_big.mask + 1);
3563 			return ENOMEM;
3564 		}
3565 	}
3566 	return 0;
3567 }
3568 
3569 static int
3570 mxge_open(mxge_softc_t *sc)
3571 {
3572 	mxge_cmd_t cmd;
3573 	int err, big_bytes, nbufs, slice, cl_size, i;
3574 	bus_addr_t bus;
3575 	volatile uint8_t *itable;
3576 	struct mxge_slice_state *ss;
3577 
3578 	/* Copy the MAC address in case it was overridden */
3579 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3580 
3581 	err = mxge_reset(sc, 1);
3582 	if (err != 0) {
3583 		device_printf(sc->dev, "failed to reset\n");
3584 		return EIO;
3585 	}
3586 
3587 	if (sc->num_slices > 1) {
3588 		/* setup the indirection table */
3589 		cmd.data0 = sc->num_slices;
3590 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3591 				    &cmd);
3592 
3593 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3594 				     &cmd);
3595 		if (err != 0) {
3596 			device_printf(sc->dev,
3597 				      "failed to setup rss tables\n");
3598 			return err;
3599 		}
3600 
3601 		/* just enable an identity mapping */
3602 		itable = sc->sram + cmd.data0;
3603 		for (i = 0; i < sc->num_slices; i++)
3604 			itable[i] = (uint8_t)i;
3605 
3606 		cmd.data0 = 1;
3607 		cmd.data1 = mxge_rss_hash_type;
3608 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3609 		if (err != 0) {
3610 			device_printf(sc->dev, "failed to enable slices\n");
3611 			return err;
3612 		}
3613 	}
3614 
3615 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3616 
3617 	cmd.data0 = nbufs;
3618 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3619 			    &cmd);
3620 	/* error is only meaningful if we're trying to set
3621 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3622 	if (err && nbufs > 1) {
3623 		device_printf(sc->dev,
3624 			      "Failed to set alway-use-n to %d\n",
3625 			      nbufs);
3626 		return EIO;
3627 	}
3628 	/* Give the firmware the mtu and the big and small buffer
3629 	   sizes.  The firmware wants the big buf size to be a power
3630 	   of two. Luckily, FreeBSD's clusters are powers of two */
3631 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3632 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3633 	cmd.data0 = MHLEN - MXGEFW_PAD;
3634 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3635 			     &cmd);
3636 	cmd.data0 = big_bytes;
3637 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3638 
3639 	if (err != 0) {
3640 		device_printf(sc->dev, "failed to setup params\n");
3641 		goto abort;
3642 	}
3643 
	/* Now give the firmware the pointer to the stats block */
3645 	for (slice = 0;
3646 #ifdef IFNET_BUF_RING
3647 	     slice < sc->num_slices;
3648 #else
3649 	     slice < 1;
3650 #endif
3651 	     slice++) {
3652 		ss = &sc->ss[slice];
3653 		cmd.data0 =
3654 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3655 		cmd.data1 =
3656 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3657 		cmd.data2 = sizeof(struct mcp_irq_data);
3658 		cmd.data2 |= (slice << 16);
3659 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3660 	}
3661 
3662 	if (err != 0) {
3663 		bus = sc->ss->fw_stats_dma.bus_addr;
3664 		bus += offsetof(struct mcp_irq_data, send_done_count);
3665 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3666 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3667 		err = mxge_send_cmd(sc,
3668 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3669 				    &cmd);
3670 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3671 		sc->fw_multicast_support = 0;
3672 	} else {
3673 		sc->fw_multicast_support = 1;
3674 	}
3675 
3676 	if (err != 0) {
3677 		device_printf(sc->dev, "failed to setup params\n");
3678 		goto abort;
3679 	}
3680 
3681 	for (slice = 0; slice < sc->num_slices; slice++) {
3682 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3683 		if (err != 0) {
3684 			device_printf(sc->dev, "couldn't open slice %d\n",
3685 				      slice);
3686 			goto abort;
3687 		}
3688 	}
3689 
3690 	/* Finally, start the firmware running */
3691 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3692 	if (err) {
3693 		device_printf(sc->dev, "Couldn't bring up link\n");
3694 		goto abort;
3695 	}
3696 #ifdef IFNET_BUF_RING
3697 	for (slice = 0; slice < sc->num_slices; slice++) {
3698 		ss = &sc->ss[slice];
3699 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3700 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3701 	}
3702 #endif
3703 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3704 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3705 
3706 	return 0;
3707 
3708 abort:
3709 	mxge_free_mbufs(sc);
3710 
3711 	return err;
3712 }
3713 
3714 static int
3715 mxge_close(mxge_softc_t *sc, int down)
3716 {
3717 	mxge_cmd_t cmd;
3718 	int err, old_down_cnt;
3719 #ifdef IFNET_BUF_RING
3720 	struct mxge_slice_state *ss;
3721 	int slice;
3722 #endif
3723 
3724 #ifdef IFNET_BUF_RING
3725 	for (slice = 0; slice < sc->num_slices; slice++) {
3726 		ss = &sc->ss[slice];
3727 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3728 	}
3729 #endif
3730 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
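	/*
	 * unless the caller says the link is already down, ask the
	 * firmware to bring it down and wait for the down interrupt
	 * to bump down_cnt
	 */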
3731 	if (!down) {
3732 		old_down_cnt = sc->down_cnt;
3733 		wmb();
3734 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3735 		if (err) {
3736 			device_printf(sc->dev,
3737 				      "Couldn't bring down link\n");
3738 		}
3739 		if (old_down_cnt == sc->down_cnt) {
3740 			/* wait for down irq */
3741 			DELAY(10 * sc->intr_coal_delay);
3742 		}
3743 		wmb();
3744 		if (old_down_cnt == sc->down_cnt) {
3745 			device_printf(sc->dev, "never got down irq\n");
3746 		}
3747 	}
3748 	mxge_free_mbufs(sc);
3749 
3750 	return 0;
3751 }
3752 
3753 static void
3754 mxge_setup_cfg_space(mxge_softc_t *sc)
3755 {
3756 	device_t dev = sc->dev;
3757 	int reg;
3758 	uint16_t lnk, pectl;
3759 
3760 	/* find the PCIe link width and set max read request to 4KB */
3761 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3762 		lnk = pci_read_config(dev, reg + 0x12, 2);
3763 		sc->link_width = (lnk >> 4) & 0x3f;
3764 
3765 		if (sc->pectl == 0) {
3766 			pectl = pci_read_config(dev, reg + 0x8, 2);
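			/*
			 * max read request size lives in bits 14:12 of
			 * the PCIe device control register; 5 => 4KB
			 */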
3767 			pectl = (pectl & ~0x7000) | (5 << 12);
3768 			pci_write_config(dev, reg + 0x8, pectl, 2);
3769 			sc->pectl = pectl;
3770 		} else {
3771 			/* restore saved pectl after watchdog reset */
3772 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3773 		}
3774 	}
3775 
3776 	/* Enable DMA and Memory space access */
3777 	pci_enable_busmaster(dev);
3778 }
3779 
3780 static uint32_t
3781 mxge_read_reboot(mxge_softc_t *sc)
3782 {
3783 	device_t dev = sc->dev;
3784 	uint32_t vs;
3785 
3786 	/* find the vendor specific offset */
3787 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3788 		device_printf(sc->dev,
3789 			      "could not find vendor specific offset\n");
3790 		return (uint32_t)-1;
3791 	}
3792 	/* enable read32 mode */
3793 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3794 	/* tell NIC which register to read */
3795 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3796 	return (pci_read_config(dev, vs + 0x14, 4));
3797 }
3798 
3799 static void
3800 mxge_watchdog_reset(mxge_softc_t *sc)
3801 {
3802 	struct pci_devinfo *dinfo;
3803 	struct mxge_slice_state *ss;
3804 	int err, running, s, num_tx_slices = 1;
3805 	uint32_t reboot;
3806 	uint16_t cmd;
3807 
3808 	err = ENXIO;
3809 
3810 	device_printf(sc->dev, "Watchdog reset!\n");
3811 
3812 	/*
3813 	 * check to see if the NIC rebooted.  If it did, then all of
3814 	 * PCI config space has been reset, and things like the
3815 	 * busmaster bit will be zero.  If this is the case, then we
3816 	 * must restore PCI config space before the NIC can be used
3817 	 * again
3818 	 */
3819 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3820 	if (cmd == 0xffff) {
3821 		/*
3822 		 * maybe the watchdog caught the NIC rebooting; wait
3823 		 * up to 100ms for it to finish.  If it does not come
3824 		 * back, then give up
3825 		 */
3826 		DELAY(1000*100);
3827 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3828 		if (cmd == 0xffff) {
3829 			device_printf(sc->dev, "NIC disappeared!\n");
3830 		}
3831 	}
3832 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3833 		/* print the reboot status */
3834 		reboot = mxge_read_reboot(sc);
3835 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3836 			      reboot);
3837 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3838 		if (running) {
3839 			/*
3840 			 * quiesce NIC so that TX routines will not try to
3841 			 * xmit after restoration of BAR
3842 			 */
3843 
3844 			/* Mark the link as down */
3845 			if (sc->link_state) {
3846 				sc->link_state = 0;
3847 				if_link_state_change(sc->ifp,
3848 						     LINK_STATE_DOWN);
3849 			}
3850 #ifdef IFNET_BUF_RING
3851 			num_tx_slices = sc->num_slices;
3852 #endif
			/* grab all TX locks to ensure no tx */
3854 			for (s = 0; s < num_tx_slices; s++) {
3855 				ss = &sc->ss[s];
3856 				mtx_lock(&ss->tx.mtx);
3857 			}
3858 			mxge_close(sc, 1);
3859 		}
3860 		/* restore PCI configuration space */
3861 		dinfo = device_get_ivars(sc->dev);
3862 		pci_cfg_restore(sc->dev, dinfo);
3863 
3864 		/* and redo any changes we made to our config space */
3865 		mxge_setup_cfg_space(sc);
3866 
3867 		/* reload f/w */
3868 		err = mxge_load_firmware(sc, 0);
3869 		if (err) {
3870 			device_printf(sc->dev,
3871 				      "Unable to re-load f/w\n");
3872 		}
3873 		if (running) {
3874 			if (!err)
3875 				err = mxge_open(sc);
3876 			/* release all TX locks */
3877 			for (s = 0; s < num_tx_slices; s++) {
3878 				ss = &sc->ss[s];
3879 #ifdef IFNET_BUF_RING
3880 				mxge_start_locked(ss);
3881 #endif
3882 				mtx_unlock(&ss->tx.mtx);
3883 			}
3884 		}
3885 		sc->watchdog_resets++;
3886 	} else {
3887 		device_printf(sc->dev,
3888 			      "NIC did not reboot, not resetting\n");
3889 		err = 0;
3890 	}
3891 	if (err) {
3892 		device_printf(sc->dev, "watchdog reset failed\n");
3893 	} else {
3894 		if (sc->dying == 2)
3895 			sc->dying = 0;
3896 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3897 	}
3898 }
3899 
3900 static void
3901 mxge_watchdog_task(void *arg, int pending)
3902 {
3903 	mxge_softc_t *sc = arg;
3904 
3905 	mtx_lock(&sc->driver_mtx);
3906 	mxge_watchdog_reset(sc);
3907 	mtx_unlock(&sc->driver_mtx);
3908 }
3909 
3910 static void
3911 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3912 {
3913 	tx = &sc->ss[slice].tx;
3914 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3915 	device_printf(sc->dev,
3916 		      "tx.req=%d tx.done=%d tx.queue_active=%d\n",
3917 		      tx->req, tx->done, tx->queue_active);
3918 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3919 		      tx->activate, tx->deactivate);
3920 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3921 		      tx->pkt_done,
3922 		      be32toh(sc->ss->fw_stats->send_done_count));
3923 }
3924 
3925 static int
3926 mxge_watchdog(mxge_softc_t *sc)
3927 {
3928 	mxge_tx_ring_t *tx;
3929 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3930 	int i, err = 0;
3931 
3932 	/* see if we have outstanding transmits, which
3933 	   have been pending for more than mxge_ticks */
3934 	for (i = 0;
3935 #ifdef IFNET_BUF_RING
3936 	     (i < sc->num_slices) && (err == 0);
3937 #else
3938 	     (i < 1) && (err == 0);
3939 #endif
3940 	     i++) {
3941 		tx = &sc->ss[i].tx;
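		/*
		 * a slice is stuck if transmits are outstanding
		 * (req != done) and no completions have arrived
		 * since the previous watchdog pass
		 */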
3942 		if (tx->req != tx->done &&
3943 		    tx->watchdog_req != tx->watchdog_done &&
3944 		    tx->done == tx->watchdog_done) {
3945 			/* check for pause blocking before resetting */
3946 			if (tx->watchdog_rx_pause == rx_pause) {
3947 				mxge_warn_stuck(sc, tx, i);
3948 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3949 				return (ENXIO);
3950 			}
3951 			else
3952 				device_printf(sc->dev, "Flow control blocking "
3953 					      "xmits, check link partner\n");
3954 		}
3955 
3956 		tx->watchdog_req = tx->req;
3957 		tx->watchdog_done = tx->done;
3958 		tx->watchdog_rx_pause = rx_pause;
3959 	}
3960 
3961 	if (sc->need_media_probe)
3962 		mxge_media_probe(sc);
3963 	return (err);
3964 }
3965 
3966 static uint64_t
3967 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
3968 {
3969 	struct mxge_softc *sc;
3970 	uint64_t rv;
3971 
3972 	sc = if_getsoftc(ifp);
3973 	rv = 0;
3974 
3975 	switch (cnt) {
3976 	case IFCOUNTER_IPACKETS:
3977 		for (int s = 0; s < sc->num_slices; s++)
3978 			rv += sc->ss[s].ipackets;
3979 		return (rv);
3980 	case IFCOUNTER_OPACKETS:
3981 		for (int s = 0; s < sc->num_slices; s++)
3982 			rv += sc->ss[s].opackets;
3983 		return (rv);
3984 	case IFCOUNTER_OERRORS:
3985 		for (int s = 0; s < sc->num_slices; s++)
3986 			rv += sc->ss[s].oerrors;
3987 		return (rv);
3988 #ifdef IFNET_BUF_RING
3989 	case IFCOUNTER_OBYTES:
3990 		for (int s = 0; s < sc->num_slices; s++)
3991 			rv += sc->ss[s].obytes;
3992 		return (rv);
3993 	case IFCOUNTER_OMCASTS:
3994 		for (int s = 0; s < sc->num_slices; s++)
3995 			rv += sc->ss[s].omcasts;
3996 		return (rv);
3997 	case IFCOUNTER_OQDROPS:
3998 		for (int s = 0; s < sc->num_slices; s++)
3999 			rv += sc->ss[s].tx.br->br_drops;
4000 		return (rv);
4001 #endif
4002 	default:
4003 		return (if_get_counter_default(ifp, cnt));
4004 	}
4005 }
4006 
4007 static void
4008 mxge_tick(void *arg)
4009 {
4010 	mxge_softc_t *sc = arg;
4011 	u_long pkts = 0;
4012 	int err = 0;
4013 	int running, ticks;
4014 	uint16_t cmd;
4015 
4016 	ticks = mxge_ticks;
4017 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4018 	if (running) {
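		/* pace the transmit watchdog; it runs only when the
		   countdown reaches zero */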
4019 		if (!sc->watchdog_countdown) {
4020 			err = mxge_watchdog(sc);
4021 			sc->watchdog_countdown = 4;
4022 		}
4023 		sc->watchdog_countdown--;
4024 	}
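	/*
	 * pkts is never updated here, so the idle hardware-fault check
	 * below runs on every tick
	 */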
4025 	if (pkts == 0) {
4026 		/* ensure NIC did not suffer h/w fault while idle */
4027 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4028 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4029 			sc->dying = 2;
4030 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4031 			err = ENXIO;
4032 		}
4033 		/* look less often if NIC is idle */
4034 		ticks *= 4;
4035 	}
4036 
4037 	if (err == 0)
4038 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4039 
4040 }
4041 
4042 static int
4043 mxge_media_change(struct ifnet *ifp)
4044 {
4045 	return EINVAL;
4046 }
4047 
4048 static int
4049 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4050 {
4051 	struct ifnet *ifp = sc->ifp;
4052 	int real_mtu, old_mtu;
4053 	int err = 0;
4054 
4055 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4056 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4057 		return EINVAL;
4058 	mtx_lock(&sc->driver_mtx);
4059 	old_mtu = ifp->if_mtu;
4060 	ifp->if_mtu = mtu;
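	/*
	 * a new MTU requires re-sized receive buffers, so restart a
	 * running interface; if that fails, restore the old MTU and
	 * restart again
	 */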
4061 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4062 		mxge_close(sc, 0);
4063 		err = mxge_open(sc);
4064 		if (err != 0) {
4065 			ifp->if_mtu = old_mtu;
4066 			mxge_close(sc, 0);
4067 			(void) mxge_open(sc);
4068 		}
4069 	}
4070 	mtx_unlock(&sc->driver_mtx);
4071 	return err;
4072 }
4073 
4074 static void
4075 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4076 {
4077 	mxge_softc_t *sc = ifp->if_softc;
4078 
4079 	if (sc == NULL)
4080 		return;
4081 	ifmr->ifm_status = IFM_AVALID;
4082 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4083 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4084 	ifmr->ifm_active |= sc->current_media;
4085 }
4086 
4087 static int
4088 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4089 {
4090 	mxge_cmd_t cmd;
4091 	uint32_t i2c_args;
4092 	int i, ms, err;
4093 
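	/*
	 * only the standard transceiver EEPROM (0xA0) and
	 * diagnostics (0xA2) i2c addresses are supported
	 */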
4094 	if (i2c->dev_addr != 0xA0 &&
4095 	    i2c->dev_addr != 0xA2)
4096 		return (EINVAL);
4097 	if (i2c->len > sizeof(i2c->data))
4098 		return (EINVAL);
4099 
4100 	for (i = 0; i < i2c->len; i++) {
4101 		i2c_args = i2c->dev_addr << 0x8;
4102 		i2c_args |= i2c->offset + i;
4103 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4104 		cmd.data1 = i2c_args;
4105 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4106 
4107 		if (err != MXGEFW_CMD_OK)
4108 			return (EIO);
4109 		/* now we wait for the data to be cached */
4110 		cmd.data0 = i2c_args & 0xff;
4111 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4112 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4113 			cmd.data0 = i2c_args & 0xff;
4114 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4115 			if (err == EBUSY)
4116 				DELAY(1000);
4117 		}
4118 		if (err != MXGEFW_CMD_OK)
4119 			return (EIO);
4120 		i2c->data[i] = cmd.data0;
4121 	}
4122 	return (0);
4123 }
4124 
4125 static int
4126 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4127 {
4128 	mxge_softc_t *sc = ifp->if_softc;
4129 	struct ifreq *ifr = (struct ifreq *)data;
4130 	struct ifi2creq i2c;
4131 	int err, mask;
4132 
4133 	err = 0;
4134 	switch (command) {
4135 	case SIOCSIFMTU:
4136 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4137 		break;
4138 
4139 	case SIOCSIFFLAGS:
4140 		mtx_lock(&sc->driver_mtx);
4141 		if (sc->dying) {
4142 			mtx_unlock(&sc->driver_mtx);
4143 			return EINVAL;
4144 		}
4145 		if (ifp->if_flags & IFF_UP) {
4146 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4147 				err = mxge_open(sc);
4148 			} else {
4149 				/* take care of promisc and allmulti
4150 				   flag changes */
4151 				mxge_change_promisc(sc,
4152 						    ifp->if_flags & IFF_PROMISC);
4153 				mxge_set_multicast_list(sc);
4154 			}
4155 		} else {
4156 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4157 				mxge_close(sc, 0);
4158 			}
4159 		}
4160 		mtx_unlock(&sc->driver_mtx);
4161 		break;
4162 
4163 	case SIOCADDMULTI:
4164 	case SIOCDELMULTI:
4165 		mtx_lock(&sc->driver_mtx);
4166 		if (sc->dying) {
4167 			mtx_unlock(&sc->driver_mtx);
4168 			return (EINVAL);
4169 		}
4170 		mxge_set_multicast_list(sc);
4171 		mtx_unlock(&sc->driver_mtx);
4172 		break;
4173 
4174 	case SIOCSIFCAP:
4175 		mtx_lock(&sc->driver_mtx);
4176 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4177 		if (mask & IFCAP_TXCSUM) {
4178 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4179 				mask &= ~IFCAP_TSO4;
4180 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4181 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4182 			} else {
4183 				ifp->if_capenable |= IFCAP_TXCSUM;
4184 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4185 			}
4186 		}
4187 		if (mask & IFCAP_RXCSUM) {
4188 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4189 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4190 			} else {
4191 				ifp->if_capenable |= IFCAP_RXCSUM;
4192 			}
4193 		}
4194 		if (mask & IFCAP_TSO4) {
4195 			if (IFCAP_TSO4 & ifp->if_capenable) {
4196 				ifp->if_capenable &= ~IFCAP_TSO4;
4197 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4198 				ifp->if_capenable |= IFCAP_TSO4;
4199 				ifp->if_hwassist |= CSUM_TSO;
4200 			} else {
4201 				printf("mxge requires tx checksum offload"
4202 				       " be enabled to use TSO\n");
4203 				err = EINVAL;
4204 			}
4205 		}
4206 #if IFCAP_TSO6
4207 		if (mask & IFCAP_TXCSUM_IPV6) {
4208 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4209 				mask &= ~IFCAP_TSO6;
4210 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4211 						       | IFCAP_TSO6);
4212 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4213 						      | CSUM_UDP);
4214 			} else {
4215 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4216 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4217 						     | CSUM_UDP_IPV6);
4218 			}
4219 		}
4220 		if (mask & IFCAP_RXCSUM_IPV6) {
4221 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4222 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4223 			} else {
4224 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4225 			}
4226 		}
4227 		if (mask & IFCAP_TSO6) {
4228 			if (IFCAP_TSO6 & ifp->if_capenable) {
4229 				ifp->if_capenable &= ~IFCAP_TSO6;
4230 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4231 				ifp->if_capenable |= IFCAP_TSO6;
4232 				ifp->if_hwassist |= CSUM_TSO;
4233 			} else {
4234 				printf("mxge requires tx checksum offload"
4235 				       " be enabled to use TSO\n");
4236 				err = EINVAL;
4237 			}
4238 		}
4239 #endif /*IFCAP_TSO6 */
4240 
4241 		if (mask & IFCAP_LRO)
4242 			ifp->if_capenable ^= IFCAP_LRO;
4243 		if (mask & IFCAP_VLAN_HWTAGGING)
4244 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4245 		if (mask & IFCAP_VLAN_HWTSO)
4246 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4247 
4248 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4249 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4250 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4251 
4252 		mtx_unlock(&sc->driver_mtx);
4253 		VLAN_CAPABILITIES(ifp);
4254 
4255 		break;
4256 
4257 	case SIOCGIFMEDIA:
4258 		mtx_lock(&sc->driver_mtx);
4259 		if (sc->dying) {
4260 			mtx_unlock(&sc->driver_mtx);
4261 			return (EINVAL);
4262 		}
4263 		mxge_media_probe(sc);
4264 		mtx_unlock(&sc->driver_mtx);
4265 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4266 				    &sc->media, command);
4267 		break;
4268 
4269 	case SIOCGI2C:
4270 		if (sc->connector != MXGE_XFP &&
4271 		    sc->connector != MXGE_SFP) {
4272 			err = ENXIO;
4273 			break;
4274 		}
4275 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4276 		if (err != 0)
4277 			break;
4278 		mtx_lock(&sc->driver_mtx);
4279 		if (sc->dying) {
4280 			mtx_unlock(&sc->driver_mtx);
4281 			return (EINVAL);
4282 		}
4283 		err = mxge_fetch_i2c(sc, &i2c);
4284 		mtx_unlock(&sc->driver_mtx);
4285 		if (err == 0)
4286 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4287 			    sizeof(i2c));
4288 		break;
4289 	default:
4290 		err = ether_ioctl(ifp, command, data);
4291 		break;
4292 	}
4293 	return err;
4294 }
4295 
4296 static void
4297 mxge_fetch_tunables(mxge_softc_t *sc)
4298 {
4299 
4300 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4301 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4302 			  &mxge_flow_control);
4303 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4304 			  &mxge_intr_coal_delay);
4305 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4306 			  &mxge_nvidia_ecrc_enable);
4307 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4308 			  &mxge_force_firmware);
4309 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4310 			  &mxge_deassert_wait);
4311 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4312 			  &mxge_verbose);
4313 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4314 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
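	/* both spellings of the rss hash type tunable are accepted */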
4315 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4316 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4317 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4318 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4319 
4320 	if (bootverbose)
4321 		mxge_verbose = 1;
4322 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4323 		mxge_intr_coal_delay = 30;
4324 	if (mxge_ticks == 0)
4325 		mxge_ticks = hz / 2;
4326 	sc->pause = mxge_flow_control;
4327 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4328 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4329 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4330 	}
4331 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4332 	    mxge_initial_mtu < ETHER_MIN_LEN)
4333 		mxge_initial_mtu = ETHERMTU_JUMBO;
4334 
4335 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4336 		mxge_throttle = MXGE_MAX_THROTTLE;
4337 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4338 		mxge_throttle = MXGE_MIN_THROTTLE;
4339 	sc->throttle = mxge_throttle;
4340 }
4341 
4342 static void
4343 mxge_free_slices(mxge_softc_t *sc)
4344 {
4345 	struct mxge_slice_state *ss;
4346 	int i;
4347 
4348 	if (sc->ss == NULL)
4349 		return;
4350 
4351 	for (i = 0; i < sc->num_slices; i++) {
4352 		ss = &sc->ss[i];
4353 		if (ss->fw_stats != NULL) {
4354 			mxge_dma_free(&ss->fw_stats_dma);
4355 			ss->fw_stats = NULL;
4356 #ifdef IFNET_BUF_RING
4357 			if (ss->tx.br != NULL) {
4358 				drbr_free(ss->tx.br, M_DEVBUF);
4359 				ss->tx.br = NULL;
4360 			}
4361 #endif
4362 			mtx_destroy(&ss->tx.mtx);
4363 		}
4364 		if (ss->rx_done.entry != NULL) {
4365 			mxge_dma_free(&ss->rx_done.dma);
4366 			ss->rx_done.entry = NULL;
4367 		}
4368 	}
4369 	free(sc->ss, M_DEVBUF);
4370 	sc->ss = NULL;
4371 }
4372 
4373 static int
4374 mxge_alloc_slices(mxge_softc_t *sc)
4375 {
4376 	mxge_cmd_t cmd;
4377 	struct mxge_slice_state *ss;
4378 	size_t bytes;
4379 	int err, i, max_intr_slots;
4380 
4381 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4382 	if (err != 0) {
4383 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4384 		return err;
4385 	}
4386 	sc->rx_ring_size = cmd.data0;
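	/* enough intr queue slots for both the small and big rx rings */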
4387 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4388 
4389 	bytes = sizeof (*sc->ss) * sc->num_slices;
4390 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4391 	if (sc->ss == NULL)
4392 		return (ENOMEM);
4393 	for (i = 0; i < sc->num_slices; i++) {
4394 		ss = &sc->ss[i];
4395 
4396 		ss->sc = sc;
4397 
4398 		/* allocate per-slice rx interrupt queues */
4399 
4400 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4401 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4402 		if (err != 0)
4403 			goto abort;
4404 		ss->rx_done.entry = ss->rx_done.dma.addr;
4405 		bzero(ss->rx_done.entry, bytes);
4406 
4407 		/*
4408 		 * allocate the per-slice firmware stats; stats
4409 		 * (including tx) are used only on the first
4410 		 * slice for now
4411 		 */
4412 #ifndef IFNET_BUF_RING
4413 		if (i > 0)
4414 			continue;
4415 #endif
4416 
4417 		bytes = sizeof (*ss->fw_stats);
4418 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4419 				     sizeof (*ss->fw_stats), 64);
4420 		if (err != 0)
4421 			goto abort;
4422 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4423 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4424 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4425 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4426 #ifdef IFNET_BUF_RING
4427 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4428 					   &ss->tx.mtx);
4429 #endif
4430 	}
4431 
4432 	return (0);
4433 
4434 abort:
4435 	mxge_free_slices(sc);
4436 	return (ENOMEM);
4437 }
4438 
4439 static void
4440 mxge_slice_probe(mxge_softc_t *sc)
4441 {
4442 	mxge_cmd_t cmd;
4443 	char *old_fw;
4444 	int msix_cnt, status, max_intr_slots;
4445 
4446 	sc->num_slices = 1;
4447 	/*
4448 	 *  don't enable multiple slices if they have been disabled by
4449 	 *  the mxge_max_slices tunable, or if this is not an SMP system
4450 	 */
4451 
4452 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4453 		return;
4454 
4455 	/* see how many MSI-X interrupts are available */
4456 	msix_cnt = pci_msix_count(sc->dev);
4457 	if (msix_cnt < 2)
4458 		return;
4459 
4460 	/* now load the slice-aware firmware to see what it supports */
4461 	old_fw = sc->fw_name;
4462 	if (old_fw == mxge_fw_aligned)
4463 		sc->fw_name = mxge_fw_rss_aligned;
4464 	else
4465 		sc->fw_name = mxge_fw_rss_unaligned;
4466 	status = mxge_load_firmware(sc, 0);
4467 	if (status != 0) {
4468 		device_printf(sc->dev, "Falling back to a single slice\n");
4469 		return;
4470 	}
4471 
4472 	/* try to send a reset command to the card to see if it
4473 	   is alive */
4474 	memset(&cmd, 0, sizeof (cmd));
4475 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4476 	if (status != 0) {
4477 		device_printf(sc->dev, "failed reset\n");
4478 		goto abort_with_fw;
4479 	}
4480 
4481 	/* get rx ring size */
4482 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4483 	if (status != 0) {
4484 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4485 		goto abort_with_fw;
4486 	}
4487 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4488 
4489 	/* tell it the size of the interrupt queues */
4490 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4491 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4492 	if (status != 0) {
4493 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4494 		goto abort_with_fw;
4495 	}
4496 
4497 	/* ask for the maximum number of slices it supports */
4498 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4499 	if (status != 0) {
4500 		device_printf(sc->dev,
4501 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4502 		goto abort_with_fw;
4503 	}
4504 	sc->num_slices = cmd.data0;
4505 	if (sc->num_slices > msix_cnt)
4506 		sc->num_slices = msix_cnt;
4507 
4508 	if (mxge_max_slices == -1) {
4509 		/* cap to number of CPUs in system */
4510 		if (sc->num_slices > mp_ncpus)
4511 			sc->num_slices = mp_ncpus;
4512 	} else {
4513 		if (sc->num_slices > mxge_max_slices)
4514 			sc->num_slices = mxge_max_slices;
4515 	}
4516 	/* make sure it is a power of two */
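	/* (decrementing until n & (n-1) == 0 leaves the largest
	   power of 2 <= n) */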
4517 	while (sc->num_slices & (sc->num_slices - 1))
4518 		sc->num_slices--;
4519 
4520 	if (mxge_verbose)
4521 		device_printf(sc->dev, "using %d slices\n",
4522 			      sc->num_slices);
4523 
4524 	return;
4525 
4526 abort_with_fw:
4527 	sc->fw_name = old_fw;
4528 	(void) mxge_load_firmware(sc, 0);
4529 }
4530 
4531 static int
4532 mxge_add_msix_irqs(mxge_softc_t *sc)
4533 {
4534 	size_t bytes;
4535 	int count, err, i, rid;
4536 
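	/* the MSI-X table is mapped through BAR 2 */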
4537 	rid = PCIR_BAR(2);
4538 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4539 						    &rid, RF_ACTIVE);
4540 
4541 	if (sc->msix_table_res == NULL) {
4542 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4543 		return ENXIO;
4544 	}
4545 
4546 	count = sc->num_slices;
4547 	err = pci_alloc_msix(sc->dev, &count);
4548 	if (err != 0) {
4549 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4550 			      "err = %d\n", sc->num_slices, err);
4551 		goto abort_with_msix_table;
4552 	}
4553 	if (count < sc->num_slices) {
4554 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4555 			      sc->num_slices, count);
4556 		device_printf(sc->dev,
4557 			      "Try setting hw.mxge.max_slices to %d\n",
4558 			      count);
4559 		err = ENOSPC;
4560 		goto abort_with_msix;
4561 	}
4562 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4563 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4564 	if (sc->msix_irq_res == NULL) {
4565 		err = ENOMEM;
4566 		goto abort_with_msix;
4567 	}
4568 
4569 	for (i = 0; i < sc->num_slices; i++) {
4570 		rid = i + 1;
4571 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4572 							  SYS_RES_IRQ,
4573 							  &rid, RF_ACTIVE);
4574 		if (sc->msix_irq_res[i] == NULL) {
4575 			device_printf(sc->dev, "couldn't allocate IRQ res"
4576 				      " for message %d\n", i);
4577 			err = ENXIO;
4578 			goto abort_with_res;
4579 		}
4580 	}
4581 
4582 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4583 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}

4585 	for (i = 0; i < sc->num_slices; i++) {
4586 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4587 				     INTR_TYPE_NET | INTR_MPSAFE,
4588 #if __FreeBSD_version > 700030
4589 				     NULL,
4590 #endif
4591 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4592 		if (err != 0) {
4593 			device_printf(sc->dev, "couldn't setup intr for "
4594 				      "message %d\n", i);
4595 			goto abort_with_intr;
4596 		}
4597 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4598 				  sc->msix_ih[i], "s%d", i);
4599 	}
4600 
4601 	if (mxge_verbose) {
4602 		device_printf(sc->dev, "using %d msix IRQs:",
4603 			      sc->num_slices);
4604 		for (i = 0; i < sc->num_slices; i++)
4605 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4606 		printf("\n");
4607 	}
4608 	return (0);
4609 
4610 abort_with_intr:
4611 	for (i = 0; i < sc->num_slices; i++) {
4612 		if (sc->msix_ih[i] != NULL) {
4613 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4614 					  sc->msix_ih[i]);
4615 			sc->msix_ih[i] = NULL;
4616 		}
4617 	}
4618 	free(sc->msix_ih, M_DEVBUF);
4619 
4620 abort_with_res:
4621 	for (i = 0; i < sc->num_slices; i++) {
4622 		rid = i + 1;
4623 		if (sc->msix_irq_res[i] != NULL)
4624 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4625 					     sc->msix_irq_res[i]);
4626 		sc->msix_irq_res[i] = NULL;
4627 	}
4628 	free(sc->msix_irq_res, M_DEVBUF);
4629 
4630 abort_with_msix:
4631 	pci_release_msi(sc->dev);
4632 
4633 abort_with_msix_table:
4634 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4635 			     sc->msix_table_res);
4636 
4637 	return err;
4638 }
4639 
4640 static int
4641 mxge_add_single_irq(mxge_softc_t *sc)
4642 {
4643 	int count, err, rid;
4644 
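	/* prefer a single MSI message (rid 1); fall back to INTx (rid 0) */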
4645 	count = pci_msi_count(sc->dev);
4646 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4647 		rid = 1;
4648 	} else {
4649 		rid = 0;
4650 		sc->legacy_irq = 1;
4651 	}
4652 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4653 					     RF_SHAREABLE | RF_ACTIVE);
4654 	if (sc->irq_res == NULL) {
4655 		device_printf(sc->dev, "could not alloc interrupt\n");
4656 		return ENXIO;
4657 	}
4658 	if (mxge_verbose)
4659 		device_printf(sc->dev, "using %s irq %jd\n",
4660 			      sc->legacy_irq ? "INTx" : "MSI",
4661 			      rman_get_start(sc->irq_res));
4662 	err = bus_setup_intr(sc->dev, sc->irq_res,
4663 			     INTR_TYPE_NET | INTR_MPSAFE,
4664 #if __FreeBSD_version > 700030
4665 			     NULL,
4666 #endif
4667 			     mxge_intr, &sc->ss[0], &sc->ih);
4668 	if (err != 0) {
4669 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4670 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4671 		if (!sc->legacy_irq)
4672 			pci_release_msi(sc->dev);
4673 	}
4674 	return err;
4675 }
4676 
4677 static void
4678 mxge_rem_msix_irqs(mxge_softc_t *sc)
4679 {
4680 	int i, rid;
4681 
4682 	for (i = 0; i < sc->num_slices; i++) {
4683 		if (sc->msix_ih[i] != NULL) {
4684 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4685 					  sc->msix_ih[i]);
4686 			sc->msix_ih[i] = NULL;
4687 		}
4688 	}
4689 	free(sc->msix_ih, M_DEVBUF);
4690 
4691 	for (i = 0; i < sc->num_slices; i++) {
4692 		rid = i + 1;
4693 		if (sc->msix_irq_res[i] != NULL)
4694 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4695 					     sc->msix_irq_res[i]);
4696 		sc->msix_irq_res[i] = NULL;
4697 	}
4698 	free(sc->msix_irq_res, M_DEVBUF);
4699 
4700 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4701 			     sc->msix_table_res);
4702 
4703 	pci_release_msi(sc->dev);
4704 	return;
4705 }
4706 
4707 static void
4708 mxge_rem_single_irq(mxge_softc_t *sc)
4709 {
4710 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4711 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4712 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4713 	if (!sc->legacy_irq)
4714 		pci_release_msi(sc->dev);
4715 }
4716 
4717 static void
4718 mxge_rem_irq(mxge_softc_t *sc)
4719 {
4720 	if (sc->num_slices > 1)
4721 		mxge_rem_msix_irqs(sc);
4722 	else
4723 		mxge_rem_single_irq(sc);
4724 }
4725 
4726 static int
4727 mxge_add_irq(mxge_softc_t *sc)
4728 {
4729 	int err;
4730 
4731 	if (sc->num_slices > 1)
4732 		err = mxge_add_msix_irqs(sc);
4733 	else
4734 		err = mxge_add_single_irq(sc);
4735 
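	/*
	 * note: deliberately disabled ("0 &&"); when enabled this
	 * would tear down and re-create the MSI-X interrupts
	 */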
4736 	if (0 && err == 0 && sc->num_slices > 1) {
4737 		mxge_rem_msix_irqs(sc);
4738 		err = mxge_add_msix_irqs(sc);
4739 	}
4740 	return err;
4741 }
4742 
4743 static int
4744 mxge_attach(device_t dev)
4745 {
4746 	mxge_cmd_t cmd;
4747 	mxge_softc_t *sc = device_get_softc(dev);
4748 	struct ifnet *ifp;
4749 	int err, rid;
4750 
4751 	sc->dev = dev;
4752 	mxge_fetch_tunables(sc);
4753 
4754 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4755 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4756 				  taskqueue_thread_enqueue, &sc->tq);
4757 	if (sc->tq == NULL) {
4758 		err = ENOMEM;
4759 		goto abort_with_nothing;
4760 	}
4761 
4762 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4763 				 1,			/* alignment */
4764 				 0,			/* boundary */
4765 				 BUS_SPACE_MAXADDR,	/* low */
4766 				 BUS_SPACE_MAXADDR,	/* high */
4767 				 NULL, NULL,		/* filter */
4768 				 65536 + 256,		/* maxsize */
4769 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4770 				 65536,			/* maxsegsize */
4771 				 0,			/* flags */
4772 				 NULL, NULL,		/* lock */
4773 				 &sc->parent_dmat);	/* tag */
4774 
4775 	if (err != 0) {
4776 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4777 			      err);
4778 		goto abort_with_tq;
4779 	}
4780 
4781 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4782 	if (ifp == NULL) {
4783 		device_printf(dev, "can not if_alloc()\n");
4784 		err = ENOSPC;
4785 		goto abort_with_parent_dmat;
4786 	}
4787 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4788 
4789 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4790 		 device_get_nameunit(dev));
4791 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4792 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4793 		 "%s:drv", device_get_nameunit(dev));
4794 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4795 		 MTX_NETWORK_LOCK, MTX_DEF);
4796 
4797 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4798 
4799 	mxge_setup_cfg_space(sc);
4800 
4801 	/* Map the board into the kernel */
4802 	rid = PCIR_BARS;
4803 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4804 					     RF_ACTIVE);
4805 	if (sc->mem_res == NULL) {
4806 		device_printf(dev, "could not map memory\n");
4807 		err = ENXIO;
4808 		goto abort_with_lock;
4809 	}
4810 	sc->sram = rman_get_virtual(sc->mem_res);
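	/* usable SRAM: 2MB of board space less firmware-reserved regions */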
4811 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4812 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4813 		device_printf(dev, "impossible memory region size %jd\n",
4814 			      rman_get_size(sc->mem_res));
4815 		err = ENXIO;
4816 		goto abort_with_mem_res;
4817 	}
4818 
4819 	/* make a NULL-terminated copy of the EEPROM strings section
4820 	   of LANai SRAM */
4821 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4822 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4823 				rman_get_bushandle(sc->mem_res),
4824 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4825 				sc->eeprom_strings,
4826 				MXGE_EEPROM_STRINGS_SIZE - 2);
4827 	err = mxge_parse_strings(sc);
4828 	if (err != 0)
4829 		goto abort_with_mem_res;
4830 
4831 	/* Enable write combining for efficient use of PCIe bus */
4832 	mxge_enable_wc(sc);
4833 
4834 	/* Allocate the out of band dma memory */
4835 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4836 			     sizeof (mxge_cmd_t), 64);
4837 	if (err != 0)
4838 		goto abort_with_mem_res;
4839 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4840 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4841 	if (err != 0)
4842 		goto abort_with_cmd_dma;
4843 
4844 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4845 	if (err != 0)
4846 		goto abort_with_zeropad_dma;
4847 
4848 	/* select & load the firmware */
4849 	err = mxge_select_firmware(sc);
4850 	if (err != 0)
4851 		goto abort_with_dmabench;
4852 	sc->intr_coal_delay = mxge_intr_coal_delay;
4853 
4854 	mxge_slice_probe(sc);
4855 	err = mxge_alloc_slices(sc);
4856 	if (err != 0)
4857 		goto abort_with_dmabench;
4858 
4859 	err = mxge_reset(sc, 0);
4860 	if (err != 0)
4861 		goto abort_with_slices;
4862 
4863 	err = mxge_alloc_rings(sc);
4864 	if (err != 0) {
4865 		device_printf(sc->dev, "failed to allocate rings\n");
4866 		goto abort_with_slices;
4867 	}
4868 
4869 	err = mxge_add_irq(sc);
4870 	if (err != 0) {
4871 		device_printf(sc->dev, "failed to add irq\n");
4872 		goto abort_with_rings;
4873 	}
4874 
4875 	ifp->if_baudrate = IF_Gbps(10);
4876 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4877 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4878 		IFCAP_RXCSUM_IPV6;
4879 #if defined(INET) || defined(INET6)
4880 	ifp->if_capabilities |= IFCAP_LRO;
4881 #endif
4882 
4883 #ifdef MXGE_NEW_VLAN_API
4884 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4885 
4886 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4887 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4888 	    sc->fw_ver_tiny >= 32)
4889 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4890 #endif
4891 	sc->max_mtu = mxge_max_mtu(sc);
4892 	if (sc->max_mtu >= 9000)
4893 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4894 	else
4895 		device_printf(dev, "MTU limited to %d.  Install "
4896 			      "latest firmware for 9000 byte jumbo support\n",
4897 			      sc->max_mtu - ETHER_HDR_LEN);
4898 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4899 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4900 	/* check to see if f/w supports TSO for IPv6 */
4901 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4902 		if (CSUM_TCP_IPV6)
4903 			ifp->if_capabilities |= IFCAP_TSO6;
4904 		sc->max_tso6_hlen = min(cmd.data0,
4905 					sizeof (sc->ss[0].scratch));
4906 	}
4907 	ifp->if_capenable = ifp->if_capabilities;
4908 	if (sc->lro_cnt == 0)
4909 		ifp->if_capenable &= ~IFCAP_LRO;
4910 	ifp->if_init = mxge_init;
4911 	ifp->if_softc = sc;
4912 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4913 	ifp->if_ioctl = mxge_ioctl;
4914 	ifp->if_start = mxge_start;
4915 	ifp->if_get_counter = mxge_get_counter;
4916 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4917 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4918 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4919 	/* Initialise the ifmedia structure */
4920 	ifmedia_init(&sc->media, 0, mxge_media_change,
4921 		     mxge_media_status);
4922 	mxge_media_init(sc);
4923 	mxge_media_probe(sc);
4924 	sc->dying = 0;
4925 	ether_ifattach(ifp, sc->mac_addr);
4926 	/* ether_ifattach sets mtu to ETHERMTU */
4927 	if (mxge_initial_mtu != ETHERMTU)
4928 		mxge_change_mtu(sc, mxge_initial_mtu);
4929 
4930 	mxge_add_sysctls(sc);
4931 #ifdef IFNET_BUF_RING
4932 	ifp->if_transmit = mxge_transmit;
4933 	ifp->if_qflush = mxge_qflush;
4934 #endif
4935 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4936 				device_get_nameunit(sc->dev));
4937 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4938 	return 0;
4939 
4940 abort_with_rings:
4941 	mxge_free_rings(sc);
4942 abort_with_slices:
4943 	mxge_free_slices(sc);
4944 abort_with_dmabench:
4945 	mxge_dma_free(&sc->dmabench_dma);
4946 abort_with_zeropad_dma:
4947 	mxge_dma_free(&sc->zeropad_dma);
4948 abort_with_cmd_dma:
4949 	mxge_dma_free(&sc->cmd_dma);
4950 abort_with_mem_res:
4951 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4952 abort_with_lock:
4953 	pci_disable_busmaster(dev);
4954 	mtx_destroy(&sc->cmd_mtx);
4955 	mtx_destroy(&sc->driver_mtx);
4956 	if_free(ifp);
4957 abort_with_parent_dmat:
4958 	bus_dma_tag_destroy(sc->parent_dmat);
4959 abort_with_tq:
4960 	if (sc->tq != NULL) {
4961 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4962 		taskqueue_free(sc->tq);
4963 		sc->tq = NULL;
4964 	}
4965 abort_with_nothing:
4966 	return err;
4967 }
4968 
4969 static int
4970 mxge_detach(device_t dev)
4971 {
4972 	mxge_softc_t *sc = device_get_softc(dev);
4973 
4974 	if (mxge_vlans_active(sc)) {
4975 		device_printf(sc->dev,
4976 			      "Detach vlans before removing module\n");
4977 		return EBUSY;
4978 	}
4979 	mtx_lock(&sc->driver_mtx);
4980 	sc->dying = 1;
4981 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4982 		mxge_close(sc, 0);
4983 	mtx_unlock(&sc->driver_mtx);
4984 	ether_ifdetach(sc->ifp);
4985 	if (sc->tq != NULL) {
4986 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4987 		taskqueue_free(sc->tq);
4988 		sc->tq = NULL;
4989 	}
4990 	callout_drain(&sc->co_hdl);
4991 	ifmedia_removeall(&sc->media);
4992 	mxge_dummy_rdma(sc, 0);
4993 	mxge_rem_sysctls(sc);
4994 	mxge_rem_irq(sc);
4995 	mxge_free_rings(sc);
4996 	mxge_free_slices(sc);
4997 	mxge_dma_free(&sc->dmabench_dma);
4998 	mxge_dma_free(&sc->zeropad_dma);
4999 	mxge_dma_free(&sc->cmd_dma);
5000 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5001 	pci_disable_busmaster(dev);
5002 	mtx_destroy(&sc->cmd_mtx);
5003 	mtx_destroy(&sc->driver_mtx);
5004 	if_free(sc->ifp);
5005 	bus_dma_tag_destroy(sc->parent_dmat);
5006 	return 0;
5007 }
5008 
5009 static int
5010 mxge_shutdown(device_t dev)
5011 {
5012 	return 0;
5013 }
5014 
5015 /*
5016   This file uses Myri10GE driver indentation.
5017 
5018   Local Variables:
5019   c-file-style:"linux"
5020   tab-width:8
5021   End:
5022 */
5023