xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 312809fe7fefbc8d5caa2b59089a5d9266378057)
/******************************************************************************
SPDX-License-Identifier: BSD-2-Clause-FreeBSD

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <contrib/zlib/zlib.h>
#include <dev/zlib/zcalloc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
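/*
 * Firmware image names.  The plain "eth" images assume aligned PCIe
 * completions; the "ethp" images work around unaligned completions
 * (see the block comment above mxge_firmware_probe() below).  The
 * "rss" variants are believed to be the multi-slice (RSS) capable
 * builds used when more than one slice is enabled.
 */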
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

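/*
 * Map the NIC's SRAM BAR write-combining (x86 only) so that bursts
 * of PIO writes, such as the copies done by mxge_pio_copy(), can be
 * merged into larger PCIe transactions.
 */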
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

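/*
 * Allocate a physically contiguous, DMA-able buffer in one segment.
 * For a page-aligned request larger than a page the boundary is left
 * unrestricted; otherwise both the boundary and the maximum segment
 * size are clamped to 4KB so no segment crosses a 4KB boundary.
 */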
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO), &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
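/*
 * e.g. (hypothetical values):
 *	"SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=10G-PCIE-8A-R\0\0"
 */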

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented out because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access.
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

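	/*
	 * ECAM-style layout: 1MB of config space per bus, 4KB per
	 * function, with 8 functions per device.
	 */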
	off = base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
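	/* For example, if the firmware reports 1000 transfers of
	 * len = 4096 bytes completed in 2000 ticks (1ms), the read
	 * bandwidth is (1000 * 4096 * 2) / 2000 = 4096 MB/s.
	 */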

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
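	/*
	 * (Bits 14:12 of the PCIe device control register hold the
	 * Max Read Request Size; the encoded value 5 means 4096 bytes.)
	 */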
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

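	/* e.g. a version string of "1.4.57" yields major 1, minor 4, tiny 57 */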
	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = zcalloc_nowait;
	zs.zfree = zcfree;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

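	/* ensure buf is aligned to 8 bytes */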
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

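/*
 * Issue a command to the firmware: PIO-copy an 8-byte-aligned
 * mcp_cmd_t into the NIC's command mailbox in SRAM, then poll the
 * response block (DMAed back into host memory) for up to 20ms.
 */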
static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

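	/* ensure buf is aligned to 8 bytes */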
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);
		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

struct mxge_add_maddr_ctx {
	mxge_softc_t *sc;
	int error;
};

static u_int
mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
{
	struct mxge_add_maddr_ctx *ctx = arg;
	mxge_cmd_t cmd;

	if (ctx->error != 0)
		return (0);
	bcopy(LLADDR(sdl), &cmd.data0, 4);
	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
	cmd.data0 = htonl(cmd.data0);
	cmd.data1 = htonl(cmd.data1);

	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);

	return (1);
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	struct mxge_add_maddr_ctx ctx;
	struct ifnet *ifp = sc->ifp;
	mxge_cmd_t cmd;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */
	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */
	ctx.sc = sc;
	ctx.error = 0;
	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
	if (ctx.error != 0) {
		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
		    "error status: %d\n", ctx.error);
		/* abort, leaving multicast filtering off */
		return;
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

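/*
 * Sysctl handler for the firmware counters, which are stored in
 * network byte order: swap the 32-bit value to host order before
 * exporting it.
 */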
static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control (pause frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
			       0, "number of bad csums preventing LRO");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}

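/*
 * Parse the Ethernet and IP/IPv6 headers of an outgoing frame,
 * copying them into the per-slice scratch buffer when they do not
 * fit in the first mbuf, and locate the TCP header for checksum
 * offload and TSO.
 */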
1769 static int
1770 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1771     struct mxge_pkt_info *pi)
1772 {
1773 	struct ether_vlan_header *eh;
1774 	uint16_t etype;
1775 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1776 #if IFCAP_TSO6 && defined(INET6)
1777 	int nxt;
1778 #endif
1779 
1780 	eh = mtod(m, struct ether_vlan_header *);
1781 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1782 		etype = ntohs(eh->evl_proto);
1783 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1784 	} else {
1785 		etype = ntohs(eh->evl_encap_proto);
1786 		pi->ip_off = ETHER_HDR_LEN;
1787 	}
1788 
1789 	switch (etype) {
1790 	case ETHERTYPE_IP:
1791 		/*
1792 		 * ensure ip header is in first mbuf, copy it to a
1793 		 * scratch buffer if not
1794 		 */
1795 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1796 		pi->ip6 = NULL;
1797 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1798 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1799 			    ss->scratch);
1800 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1801 		}
1802 		pi->ip_hlen = pi->ip->ip_hl << 2;
1803 		if (!tso)
1804 			return 0;
1805 
1806 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1807 		    sizeof(struct tcphdr))) {
1808 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1809 			    sizeof(struct tcphdr), ss->scratch);
1810 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1811 		}
1812 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1813 		break;
1814 #if IFCAP_TSO6 && defined(INET6)
1815 	case ETHERTYPE_IPV6:
1816 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1817 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1818 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1819 			    ss->scratch);
1820 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1821 		}
1822 		nxt = 0;
1823 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1824 		pi->ip_hlen -= pi->ip_off;
1825 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1826 			return EINVAL;
1827 
1828 		if (!tso)
1829 			return 0;
1830 
1831 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1832 			return EINVAL;
1833 
1834 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1835 		    sizeof(struct tcphdr))) {
1836 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1837 			    sizeof(struct tcphdr), ss->scratch);
1838 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1839 		}
1840 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1841 		break;
1842 #endif
1843 	default:
1844 		return EINVAL;
1845 	}
1846 	return 0;
1847 }
1848 
1849 #if IFCAP_TSO4
1850 
1851 static void
1852 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1853 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1854 {
1855 	mxge_tx_ring_t *tx;
1856 	mcp_kreq_ether_send_t *req;
1857 	bus_dma_segment_t *seg;
1858 	uint32_t low, high_swapped;
1859 	int len, seglen, cum_len, cum_len_next;
1860 	int next_is_first, chop, cnt, rdma_count, small;
1861 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1862 	uint8_t flags, flags_next;
1863 	static int once;
1864 
1865 	mss = m->m_pkthdr.tso_segsz;
1866 
1867 	/* negative cum_len signifies to the
1868 	 * send loop that we are still in the
1869 	 * header portion of the TSO packet.
1870 	 */
1871 
1872 	cksum_offset = pi->ip_off + pi->ip_hlen;
1873 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
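
	/*
	 * Worked example (illustrative): for a plain IPv4 TCP frame
	 * with no VLAN tag and no TCP options, ip_off = 14,
	 * ip_hlen = 20 and th_off = 5, so cum_len starts at
	 * -(34 + 20) = -54.  It crosses zero exactly at the first
	 * payload byte, which is what flips the loop below from
	 * header mode to payload mode.
	 */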
1874 
1875 	/* TSO implies checksum offload on this hardware */
1876 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1877 		/*
1878 		 * If packet has full TCP csum, replace it with pseudo hdr
1879 		 * sum that the NIC expects, otherwise the NIC will emit
1880 		 * packets with bad TCP checksums.
1881 		 */
1882 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1883 		if (pi->ip6) {
1884 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1885 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1886 			sum = in6_cksum_pseudo(pi->ip6,
1887 			    m->m_pkthdr.len - cksum_offset,
1888 			    IPPROTO_TCP, 0);
1889 #endif
1890 		} else {
1891 #ifdef INET
1892 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1893 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1894 			    pi->ip->ip_dst.s_addr,
1895 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1896 				    cksum_offset)));
1897 #endif
1898 		}
1899 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1900 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1901 	}
1902 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1903 
1904 
1906 	 * The firmware figures out where to put
1907 	 * the checksum by parsing the header. */
1908 	pseudo_hdr_offset = htobe16(mss);
1909 
1910 	if (pi->ip6) {
1911 		/*
1912 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1913 		 * to store the TCP header len
1914 		 */
1915 		cksum_offset = (pi->tcp->th_off << 2);
1916 	}
1917 
1918 	tx = &ss->tx;
1919 	req = tx->req_list;
1920 	seg = tx->seg_list;
1921 	cnt = 0;
1922 	rdma_count = 0;
1923 	/* "rdma_count" is the number of RDMAs belonging to the
1924 	 * current packet BEFORE the current send request. For
1925 	 * non-TSO packets, this is equal to "count".
1926 	 * For TSO packets, rdma_count needs to be reset
1927 	 * to 0 after a segment cut.
1928 	 *
1929 	 * The rdma_count field of the send request is
1930 	 * the number of RDMAs of the packet starting at
1931 	 * that request. For TSO send requests with one or more cuts
1932 	 * in the middle, this is the number of RDMAs starting
1933 	 * after the last cut in the request. All previous
1934 	 * segments before the last cut implicitly have 1 RDMA.
1935 	 *
1936 	 * Since the number of RDMAs is not known beforehand,
1937 	 * it must be filled-in retroactively - after each
1938 	 * segmentation cut or at the end of the entire packet.
1939 	 */
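	/*
	 * Worked example (illustrative): a packet emitted as three
	 * descriptors with no TSO cut ends the run with
	 * rdma_count == 3; the (req - rdma_count)->rdma_count stores
	 * above and below then back-fill 3 into the descriptor that
	 * started the run, while later descriptors keep the
	 * rdma_count of 1 they were emitted with.
	 */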
1940 
1941 	while (busdma_seg_cnt) {
1942 		/* Break the busdma segment up into pieces*/
1943 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1944 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1945 		len = seg->ds_len;
1946 
1947 		while (len) {
1948 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1949 			seglen = len;
1950 			cum_len_next = cum_len + seglen;
1951 			(req-rdma_count)->rdma_count = rdma_count + 1;
1952 			if (__predict_true(cum_len >= 0)) {
1953 				/* payload */
1954 				chop = (cum_len_next > mss);
1955 				cum_len_next = cum_len_next % mss;
1956 				next_is_first = (cum_len_next == 0);
1957 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1958 				flags_next |= next_is_first *
1959 					MXGEFW_FLAGS_FIRST;
1960 				rdma_count |= -(chop | next_is_first);
1961 				rdma_count += chop & !next_is_first;
1962 			} else if (cum_len_next >= 0) {
1963 				/* header ends */
1964 				rdma_count = -1;
1965 				cum_len_next = 0;
1966 				seglen = -cum_len;
1967 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1968 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1969 					MXGEFW_FLAGS_FIRST |
1970 					(small * MXGEFW_FLAGS_SMALL);
1971 			}
1972 
1973 			req->addr_high = high_swapped;
1974 			req->addr_low = htobe32(low);
1975 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1976 			req->pad = 0;
1977 			req->rdma_count = 1;
1978 			req->length = htobe16(seglen);
1979 			req->cksum_offset = cksum_offset;
1980 			req->flags = flags | ((cum_len & 1) *
1981 					      MXGEFW_FLAGS_ALIGN_ODD);
1982 			low += seglen;
1983 			len -= seglen;
1984 			cum_len = cum_len_next;
1985 			flags = flags_next;
1986 			req++;
1987 			cnt++;
1988 			rdma_count++;
1989 			if (cksum_offset != 0 && !pi->ip6) {
1990 				if (__predict_false(cksum_offset > seglen))
1991 					cksum_offset -= seglen;
1992 				else
1993 					cksum_offset = 0;
1994 			}
1995 			if (__predict_false(cnt > tx->max_desc))
1996 				goto drop;
1997 		}
1998 		busdma_seg_cnt--;
1999 		seg++;
2000 	}
2001 	(req-rdma_count)->rdma_count = rdma_count;
2002 
2003 	do {
2004 		req--;
2005 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2006 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2007 
2008 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2009 	mxge_submit_req(tx, tx->req_list, cnt);
2010 #ifdef IFNET_BUF_RING
2011 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2012 		/* tell the NIC to start polling this slice */
2013 		*tx->send_go = 1;
2014 		tx->queue_active = 1;
2015 		tx->activate++;
2016 		wmb();
2017 	}
2018 #endif
2019 	return;
2020 
2021 drop:
2022 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2023 	m_freem(m);
2024 	ss->oerrors++;
2025 	if (!once) {
2026 		printf("tx->max_desc exceeded via TSO!\n");
2027 		printf("mss = %d, %ld, %d!\n", mss,
2028 		       (long)seg - (long)tx->seg_list, tx->max_desc);
2029 		once = 1;
2030 	}
2031 	return;
2033 }
2034 
2035 #endif /* IFCAP_TSO4 */
2036 
2037 #ifdef MXGE_NEW_VLAN_API
2038 /*
2039  * We reproduce the software vlan tag insertion from
2040  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2041  * vlan tag insertion. We need to advertise this in order to have the
2042  * vlan interface respect our csum offload flags.
2043  */
2044 static struct mbuf *
2045 mxge_vlan_tag_insert(struct mbuf *m)
2046 {
2047 	struct ether_vlan_header *evl;
2048 
2049 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2050 	if (__predict_false(m == NULL))
2051 		return NULL;
2052 	if (m->m_len < sizeof(*evl)) {
2053 		m = m_pullup(m, sizeof(*evl));
2054 		if (__predict_false(m == NULL))
2055 			return NULL;
2056 	}
2057 	/*
2058 	 * Transform the Ethernet header into an Ethernet header
2059 	 * with 802.1Q encapsulation.
2060 	 */
2061 	evl = mtod(m, struct ether_vlan_header *);
2062 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2063 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2064 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2065 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2066 	m->m_flags &= ~M_VLANTAG;
2067 	return m;
2068 }
2069 #endif /* MXGE_NEW_VLAN_API */
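
/*
 * Illustrative layout of what mxge_vlan_tag_insert() does: M_PREPEND
 * makes 4 bytes of room at the front, then the two 6-byte addresses
 * are slid down and the 802.1q fields spliced in:
 *
 *   before: | dst(6) | src(6) | type(2) | payload ...
 *   after:  | dst(6) | src(6) | 0x8100(2) | tag(2) | type(2) | payload ...
 */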
2070 
2071 static void
2072 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2073 {
2074 	struct mxge_pkt_info pi = {0,0,0,0};
2075 	mxge_softc_t *sc;
2076 	mcp_kreq_ether_send_t *req;
2077 	bus_dma_segment_t *seg;
2078 	struct mbuf *m_tmp;
2079 	struct ifnet *ifp;
2080 	mxge_tx_ring_t *tx;
2081 	int cnt, cum_len, err, i, idx, odd_flag;
2082 	uint16_t pseudo_hdr_offset;
2083 	uint8_t flags, cksum_offset;
2084 
2086 	sc = ss->sc;
2087 	ifp = sc->ifp;
2088 	tx = &ss->tx;
2089 
2090 #ifdef MXGE_NEW_VLAN_API
2091 	if (m->m_flags & M_VLANTAG) {
2092 		m = mxge_vlan_tag_insert(m);
2093 		if (__predict_false(m == NULL))
2094 			goto drop_without_m;
2095 	}
2096 #endif
2097 	if (m->m_pkthdr.csum_flags &
2098 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2099 		if (mxge_parse_tx(ss, m, &pi))
2100 			goto drop;
2101 	}
2102 
2103 	/* (try to) map the frame for DMA */
2104 	idx = tx->req & tx->mask;
2105 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2106 				      m, tx->seg_list, &cnt,
2107 				      BUS_DMA_NOWAIT);
2108 	if (__predict_false(err == EFBIG)) {
2109 		/* Too many segments in the chain.  Try
2110 		   to defrag */
2111 		m_tmp = m_defrag(m, M_NOWAIT);
2112 		if (m_tmp == NULL) {
2113 			goto drop;
2114 		}
2115 		ss->tx.defrag++;
2116 		m = m_tmp;
2117 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2118 					      tx->info[idx].map,
2119 					      m, tx->seg_list, &cnt,
2120 					      BUS_DMA_NOWAIT);
2121 	}
2122 	if (__predict_false(err != 0)) {
2123 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2124 			      " packet len = %d\n", err, m->m_pkthdr.len);
2125 		goto drop;
2126 	}
2127 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2128 			BUS_DMASYNC_PREWRITE);
2129 	tx->info[idx].m = m;
2130 
2131 #if IFCAP_TSO4
2132 	/* TSO is different enough, we handle it in another routine */
2133 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2134 		mxge_encap_tso(ss, m, cnt, &pi);
2135 		return;
2136 	}
2137 #endif
2138 
2139 	req = tx->req_list;
2140 	cksum_offset = 0;
2141 	pseudo_hdr_offset = 0;
2142 	flags = MXGEFW_FLAGS_NO_TSO;
2143 
2144 	/* checksum offloading? */
2145 	if (m->m_pkthdr.csum_flags &
2146 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2147 		/* the headers were already parsed (and copied to a
2148 		   scratch buffer if needed) by mxge_parse_tx() */
2149 		cksum_offset = pi.ip_off + pi.ip_hlen;
2150 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2151 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2152 		req->cksum_offset = cksum_offset;
2153 		flags |= MXGEFW_FLAGS_CKSUM;
2154 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2155 	} else {
2156 		odd_flag = 0;
2157 	}
2158 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2159 		flags |= MXGEFW_FLAGS_SMALL;
2160 
2161 	/* convert segments into a request list */
2162 	cum_len = 0;
2163 	seg = tx->seg_list;
2164 	req->flags = MXGEFW_FLAGS_FIRST;
2165 	for (i = 0; i < cnt; i++) {
2166 		req->addr_low =
2167 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2168 		req->addr_high =
2169 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2170 		req->length = htobe16(seg->ds_len);
2171 		req->cksum_offset = cksum_offset;
2172 		if (cksum_offset > seg->ds_len)
2173 			cksum_offset -= seg->ds_len;
2174 		else
2175 			cksum_offset = 0;
2176 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2177 		req->pad = 0; /* complete solid 16-byte block */
2178 		req->rdma_count = 1;
2179 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2180 		cum_len += seg->ds_len;
2181 		seg++;
2182 		req++;
2183 		req->flags = 0;
2184 	}
2185 	req--;
2186 	/* pad runts to 60 bytes */
2187 	if (cum_len < 60) {
2188 		req++;
2189 		req->addr_low =
2190 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2191 		req->addr_high =
2192 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2193 		req->length = htobe16(60 - cum_len);
2194 		req->cksum_offset = 0;
2195 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2196 		req->pad = 0; /* complete solid 16-byte block */
2197 		req->rdma_count = 1;
2198 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2199 		cnt++;
2200 	}
2201 
2202 	tx->req_list[0].rdma_count = cnt;
2203 #if 0
2204 	/* print what the firmware will see */
2205 	for (i = 0; i < cnt; i++) {
2206 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2207 		    "cso:%d, flags:0x%x, rdma:%d\n",
2208 		    i, (int)ntohl(tx->req_list[i].addr_high),
2209 		    (int)ntohl(tx->req_list[i].addr_low),
2210 		    (int)ntohs(tx->req_list[i].length),
2211 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2212 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2213 		    tx->req_list[i].rdma_count);
2214 	}
2215 	printf("--------------\n");
2216 #endif
2217 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2218 	mxge_submit_req(tx, tx->req_list, cnt);
2219 #ifdef IFNET_BUF_RING
2220 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2221 		/* tell the NIC to start polling this slice */
2222 		*tx->send_go = 1;
2223 		tx->queue_active = 1;
2224 		tx->activate++;
2225 		wmb();
2226 	}
2227 #endif
2228 	return;
2229 
2230 drop:
2231 	m_freem(m);
2232 drop_without_m:
2233 	ss->oerrors++;
2234 	return;
2235 }
2236 
2237 #ifdef IFNET_BUF_RING
2238 static void
2239 mxge_qflush(struct ifnet *ifp)
2240 {
2241 	mxge_softc_t *sc = ifp->if_softc;
2242 	mxge_tx_ring_t *tx;
2243 	struct mbuf *m;
2244 	int slice;
2245 
2246 	for (slice = 0; slice < sc->num_slices; slice++) {
2247 		tx = &sc->ss[slice].tx;
2248 		mtx_lock(&tx->mtx);
2249 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2250 			m_freem(m);
2251 		mtx_unlock(&tx->mtx);
2252 	}
2253 	if_qflush(ifp);
2254 }
2255 
2256 static inline void
2257 mxge_start_locked(struct mxge_slice_state *ss)
2258 {
2259 	mxge_softc_t *sc;
2260 	struct mbuf *m;
2261 	struct ifnet *ifp;
2262 	mxge_tx_ring_t *tx;
2263 
2264 	sc = ss->sc;
2265 	ifp = sc->ifp;
2266 	tx = &ss->tx;
2267 
2268 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2269 		m = drbr_dequeue(ifp, tx->br);
2270 		if (m == NULL) {
2271 			return;
2272 		}
2273 		/* let BPF see it */
2274 		BPF_MTAP(ifp, m);
2275 
2276 		/* give it to the nic */
2277 		mxge_encap(ss, m);
2278 	}
2279 	/* ran out of transmit slots */
2280 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2281 	    && (!drbr_empty(ifp, tx->br))) {
2282 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2283 		tx->stall++;
2284 	}
2285 }
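
/*
 * Ring-space arithmetic above (illustrative): tx->req counts
 * descriptors ever submitted and tx->done counts descriptors reaped,
 * so tx->req - tx->done is the number in flight.  With mask = 511
 * (a 512-entry ring) and max_desc of, say, 128, the dequeue loop
 * keeps going only while fewer than 383 descriptors are in flight,
 * so a worst-case packet always has room.
 */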
2286 
2287 static int
2288 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2289 {
2290 	mxge_softc_t *sc;
2291 	struct ifnet *ifp;
2292 	mxge_tx_ring_t *tx;
2293 	int err;
2294 
2295 	sc = ss->sc;
2296 	ifp = sc->ifp;
2297 	tx = &ss->tx;
2298 
2299 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2300 	    IFF_DRV_RUNNING) {
2301 		err = drbr_enqueue(ifp, tx->br, m);
2302 		return (err);
2303 	}
2304 
2305 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2306 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2307 		/* let BPF see it */
2308 		BPF_MTAP(ifp, m);
2309 		/* give it to the nic */
2310 		mxge_encap(ss, m);
2311 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2312 		return (err);
2313 	}
2314 	if (!drbr_empty(ifp, tx->br))
2315 		mxge_start_locked(ss);
2316 	return (0);
2317 }
2318 
2319 static int
2320 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2321 {
2322 	mxge_softc_t *sc = ifp->if_softc;
2323 	struct mxge_slice_state *ss;
2324 	mxge_tx_ring_t *tx;
2325 	int err = 0;
2326 	int slice;
2327 
2328 	slice = m->m_pkthdr.flowid;
2329 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2330 
2331 	ss = &sc->ss[slice];
2332 	tx = &ss->tx;
2333 
2334 	if (mtx_trylock(&tx->mtx)) {
2335 		err = mxge_transmit_locked(ss, m);
2336 		mtx_unlock(&tx->mtx);
2337 	} else {
2338 		err = drbr_enqueue(ifp, tx->br, m);
2339 	}
2340 
2341 	return (err);
2342 }
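
/*
 * Example of the flowid mapping above (illustrative): with 4 slices,
 * a packet whose m->m_pkthdr.flowid is 13 lands on slice 13 & 3 == 1.
 * Because num_slices is always a power of 2, the AND is a cheap
 * modulo.
 */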
2343 
2344 #else
2345 
2346 static inline void
2347 mxge_start_locked(struct mxge_slice_state *ss)
2348 {
2349 	mxge_softc_t *sc;
2350 	struct mbuf *m;
2351 	struct ifnet *ifp;
2352 	mxge_tx_ring_t *tx;
2353 
2354 	sc = ss->sc;
2355 	ifp = sc->ifp;
2356 	tx = &ss->tx;
2357 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2358 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2359 		if (m == NULL) {
2360 			return;
2361 		}
2362 		/* let BPF see it */
2363 		BPF_MTAP(ifp, m);
2364 
2365 		/* give it to the nic */
2366 		mxge_encap(ss, m);
2367 	}
2368 	/* ran out of transmit slots */
2369 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2370 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2371 		tx->stall++;
2372 	}
2373 }
2374 #endif
2375 static void
2376 mxge_start(struct ifnet *ifp)
2377 {
2378 	mxge_softc_t *sc = ifp->if_softc;
2379 	struct mxge_slice_state *ss;
2380 
2381 	/* only use the first slice for now */
2382 	ss = &sc->ss[0];
2383 	mtx_lock(&ss->tx.mtx);
2384 	mxge_start_locked(ss);
2385 	mtx_unlock(&ss->tx.mtx);
2386 }
2387 
2388 /*
2389  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2390  * at most 32 bytes at a time, so as to avoid involving the software
2391  * pio handler in the nic.  We re-write the first segment's low
2392  * DMA address to mark it valid only after we write the entire chunk
2393  * in a burst
2394  */
2395 static inline void
2396 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2397 		mcp_kreq_ether_recv_t *src)
2398 {
2399 	uint32_t low;
2400 
2401 	low = src->addr_low;
2402 	src->addr_low = 0xffffffff;
2403 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2404 	wmb();
2405 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2406 	wmb();
2407 	src->addr_low = low;
2408 	dst->addr_low = low;
2409 	wmb();
2410 }
2411 
2412 static int
2413 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2414 {
2415 	bus_dma_segment_t seg;
2416 	struct mbuf *m;
2417 	mxge_rx_ring_t *rx = &ss->rx_small;
2418 	int cnt, err;
2419 
2420 	m = m_gethdr(M_NOWAIT, MT_DATA);
2421 	if (m == NULL) {
2422 		rx->alloc_fail++;
2423 		err = ENOBUFS;
2424 		goto done;
2425 	}
2426 	m->m_len = MHLEN;
2427 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2428 				      &seg, &cnt, BUS_DMA_NOWAIT);
2429 	if (err != 0) {
2430 		m_free(m);
2431 		goto done;
2432 	}
2433 	rx->info[idx].m = m;
2434 	rx->shadow[idx].addr_low =
2435 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2436 	rx->shadow[idx].addr_high =
2437 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2438 
2439 done:
2440 	if ((idx & 7) == 7)
2441 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2442 	return err;
2443 }
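
/*
 * Note (illustrative): receive buffers are handed to the NIC in
 * batches.  The shadow ring is filled one entry at a time, but
 * mxge_submit_8rx() fires only when (idx & 7) == 7, so e.g. filling
 * slots 0..7 triggers a single 8-entry burst for those slots.
 */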
2444 
2445 static int
2446 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2447 {
2448 	bus_dma_segment_t seg[3];
2449 	struct mbuf *m;
2450 	mxge_rx_ring_t *rx = &ss->rx_big;
2451 	int cnt, err, i;
2452 
2453 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2454 	if (m == NULL) {
2455 		rx->alloc_fail++;
2456 		err = ENOBUFS;
2457 		goto done;
2458 	}
2459 	m->m_len = rx->mlen;
2460 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2461 				      seg, &cnt, BUS_DMA_NOWAIT);
2462 	if (err != 0) {
2463 		m_free(m);
2464 		goto done;
2465 	}
2466 	rx->info[idx].m = m;
2467 	rx->shadow[idx].addr_low =
2468 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2469 	rx->shadow[idx].addr_high =
2470 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2471 
2472 #if MXGE_VIRT_JUMBOS
2473 	for (i = 1; i < cnt; i++) {
2474 		rx->shadow[idx + i].addr_low =
2475 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2476 		rx->shadow[idx + i].addr_high =
2477 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2478 	}
2479 #endif
2480 
2481 done:
2482 	for (i = 0; i < rx->nbufs; i++) {
2483 		if ((idx & 7) == 7) {
2484 			mxge_submit_8rx(&rx->lanai[idx - 7],
2485 					&rx->shadow[idx - 7]);
2486 		}
2487 		idx++;
2488 	}
2489 	return err;
2490 }
2491 
2492 #ifdef INET6
2493 
2494 static uint16_t
2495 mxge_csum_generic(uint16_t *raw, int len)
2496 {
2497 	uint32_t csum;
2498 
2500 	csum = 0;
2501 	while (len > 0) {
2502 		csum += *raw;
2503 		raw++;
2504 		len -= 2;
2505 	}
2506 	csum = (csum >> 16) + (csum & 0xffff);
2507 	csum = (csum >> 16) + (csum & 0xffff);
2508 	return (uint16_t)csum;
2509 }
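
/*
 * Folding example for the two fold steps above (illustrative):
 * an accumulator of 0x1ffff folds once to 0x0001 + 0xffff = 0x10000,
 * which has carried again, and a second time to 0x0001 + 0x0000 =
 * 0x0001; this is why a single fold is not enough.
 */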
2510 
2511 static inline uint16_t
2512 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2513 {
2514 	uint32_t partial;
2515 	int nxt, cksum_offset;
2516 	struct ip6_hdr *ip6 = p;
2517 	uint16_t c;
2518 
2519 	nxt = ip6->ip6_nxt;
2520 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2521 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2522 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2523 					   IPPROTO_IPV6, &nxt);
2524 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2525 			return (1);
2526 	}
2527 
2528 	/*
2529 	 * IPv6 headers do not contain a checksum, and hence
2530 	 * do not checksum to zero, so they don't "fall out"
2531 	 * of the partial checksum calculation like IPv4
2532 	 * headers do.  We need to fix the partial checksum by
2533 	 * subtracting the checksum of the IPv6 header.
2534 	 */
2535 
2536 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2537 				    ETHER_HDR_LEN);
2538 	csum += ~partial;
2539 	csum += (csum < ~partial);
2540 	csum = (csum >> 16) + (csum & 0xFFFF);
2541 	csum = (csum >> 16) + (csum & 0xFFFF);
2542 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2543 			     csum);
2544 	c ^= 0xffff;
2545 	return (c);
2546 }
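
/*
 * The fixup above is 1s-complement subtraction (illustrative):
 * adding ~partial plus the end-around carry subtracts partial from
 * csum.  E.g. with csum = 0x1234 and partial = 0x0034, the steps
 * yield 0x1200, exactly 0x1234 - 0x0034.
 */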
2547 #endif /* INET6 */
2548 /*
2549  *  Myri10GE hardware checksums are not valid if the sender
2550  *  padded the frame with non-zero padding.  This is because
2551  *  the firmware just does a simple 16-bit 1s complement
2552  *  checksum across the entire frame, excluding the first 14
2553  *  bytes.  It is best to simply check the checksum and
2554  *  tell the stack about it only if the checksum is good
2555  */
2556 
2557 static inline uint16_t
2558 mxge_rx_csum(struct mbuf *m, int csum)
2559 {
2560 	struct ether_header *eh;
2561 #ifdef INET
2562 	struct ip *ip;
2563 #endif
2564 #if defined(INET) || defined(INET6)
2565 	int cap = m->m_pkthdr.rcvif->if_capenable;
2566 #endif
2567 	uint16_t c, etype;
2568 
2570 	eh = mtod(m, struct ether_header *);
2571 	etype = ntohs(eh->ether_type);
2572 	switch (etype) {
2573 #ifdef INET
2574 	case ETHERTYPE_IP:
2575 		if ((cap & IFCAP_RXCSUM) == 0)
2576 			return (1);
2577 		ip = (struct ip *)(eh + 1);
2578 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2579 			return (1);
2580 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2581 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2582 				    (ip->ip_hl << 2) + ip->ip_p));
2583 		c ^= 0xffff;
2584 		break;
2585 #endif
2586 #ifdef INET6
2587 	case ETHERTYPE_IPV6:
2588 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2589 			return (1);
2590 		c = mxge_rx_csum6((eh + 1), m, csum);
2591 		break;
2592 #endif
2593 	default:
2594 		c = 1;
2595 	}
2596 	return (c);
2597 }
2598 
2599 static void
2600 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2601 {
2602 	struct ether_vlan_header *evl;
2603 	struct ether_header *eh;
2604 	uint32_t partial;
2605 
2606 	evl = mtod(m, struct ether_vlan_header *);
2607 	eh = mtod(m, struct ether_header *);
2608 
2609 	/*
2610 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2611 	 * after what the firmware thought was the end of the ethernet
2612 	 * header.
2613 	 */
2614 
2615 	/* put checksum into host byte order */
2616 	*csum = ntohs(*csum);
2617 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2618 	(*csum) += ~partial;
2619 	(*csum) +=  ((*csum) < ~partial);
2620 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2621 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2622 
2623 	/* restore checksum to network byte order;
2624 	   later consumers expect this */
2625 	*csum = htons(*csum);
2626 
2627 	/* save the tag */
2628 #ifdef MXGE_NEW_VLAN_API
2629 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2630 #else
2631 	{
2632 		struct m_tag *mtag;
2633 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2634 				   M_NOWAIT);
2635 		if (mtag == NULL)
2636 			return;
2637 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2638 		m_tag_prepend(m, mtag);
2639 	}
2640 
2641 #endif
2642 	m->m_flags |= M_VLANTAG;
2643 
2644 	/*
2645 	 * Remove the 802.1q header by copying the Ethernet
2646 	 * addresses over it and adjusting the beginning of
2647 	 * the data in the mbuf.  The encapsulated Ethernet
2648 	 * type field is already in place.
2649 	 */
2650 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2651 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2652 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2653 }
2654 
2656 static inline void
2657 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2658 		 uint32_t csum, int lro)
2659 {
2660 	mxge_softc_t *sc;
2661 	struct ifnet *ifp;
2662 	struct mbuf *m;
2663 	struct ether_header *eh;
2664 	mxge_rx_ring_t *rx;
2665 	bus_dmamap_t old_map;
2666 	int idx;
2667 
2668 	sc = ss->sc;
2669 	ifp = sc->ifp;
2670 	rx = &ss->rx_big;
2671 	idx = rx->cnt & rx->mask;
2672 	rx->cnt += rx->nbufs;
2673 	/* save a pointer to the received mbuf */
2674 	m = rx->info[idx].m;
2675 	/* try to replace the received mbuf */
2676 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2677 		/* drop the frame -- the old mbuf is re-cycled */
2678 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2679 		return;
2680 	}
2681 
2682 	/* unmap the received buffer */
2683 	old_map = rx->info[idx].map;
2684 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2685 	bus_dmamap_unload(rx->dmat, old_map);
2686 
2687 	/* swap the bus_dmamap_t's */
2688 	rx->info[idx].map = rx->extra_map;
2689 	rx->extra_map = old_map;
2690 
2691 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2692 	 * aligned */
2693 	m->m_data += MXGEFW_PAD;
2694 
2695 	m->m_pkthdr.rcvif = ifp;
2696 	m->m_len = m->m_pkthdr.len = len;
2697 	ss->ipackets++;
2698 	eh = mtod(m, struct ether_header *);
2699 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2700 		mxge_vlan_tag_remove(m, &csum);
2701 	}
2702 	/* flowid only valid if RSS hashing is enabled */
2703 	if (sc->num_slices > 1) {
2704 		m->m_pkthdr.flowid = (ss - sc->ss);
2705 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2706 	}
2707 	/* if the checksum is valid, mark it in the mbuf header */
2708 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2709 	    (0 == mxge_rx_csum(m, csum))) {
2710 		/* Tell the stack that the checksum is good */
2711 		m->m_pkthdr.csum_data = 0xffff;
2712 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2713 			CSUM_DATA_VALID;
2714 
2715 #if defined(INET) || defined (INET6)
2716 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2717 			return;
2718 #endif
2719 	}
2720 	/* pass the frame up the stack */
2721 	(*ifp->if_input)(ifp, m);
2722 }
2723 
2724 static inline void
2725 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2726 		   uint32_t csum, int lro)
2727 {
2728 	mxge_softc_t *sc;
2729 	struct ifnet *ifp;
2730 	struct ether_header *eh;
2731 	struct mbuf *m;
2732 	mxge_rx_ring_t *rx;
2733 	bus_dmamap_t old_map;
2734 	int idx;
2735 
2736 	sc = ss->sc;
2737 	ifp = sc->ifp;
2738 	rx = &ss->rx_small;
2739 	idx = rx->cnt & rx->mask;
2740 	rx->cnt++;
2741 	/* save a pointer to the received mbuf */
2742 	m = rx->info[idx].m;
2743 	/* try to replace the received mbuf */
2744 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2745 		/* drop the frame -- the old mbuf is re-cycled */
2746 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2747 		return;
2748 	}
2749 
2750 	/* unmap the received buffer */
2751 	old_map = rx->info[idx].map;
2752 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2753 	bus_dmamap_unload(rx->dmat, old_map);
2754 
2755 	/* swap the bus_dmamap_t's */
2756 	rx->info[idx].map = rx->extra_map;
2757 	rx->extra_map = old_map;
2758 
2759 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2760 	 * aligned */
2761 	m->m_data += MXGEFW_PAD;
2762 
2763 	m->m_pkthdr.rcvif = ifp;
2764 	m->m_len = m->m_pkthdr.len = len;
2765 	ss->ipackets++;
2766 	eh = mtod(m, struct ether_header *);
2767 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2768 		mxge_vlan_tag_remove(m, &csum);
2769 	}
2770 	/* flowid only valid if RSS hashing is enabled */
2771 	if (sc->num_slices > 1) {
2772 		m->m_pkthdr.flowid = (ss - sc->ss);
2773 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2774 	}
2775 	/* if the checksum is valid, mark it in the mbuf header */
2776 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2777 	    (0 == mxge_rx_csum(m, csum))) {
2778 		/* Tell the stack that the checksum is good */
2779 		m->m_pkthdr.csum_data = 0xffff;
2780 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2781 			CSUM_DATA_VALID;
2782 
2783 #if defined(INET) || defined (INET6)
2784 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2785 			return;
2786 #endif
2787 	}
2788 	/* pass the frame up the stack */
2789 	(*ifp->if_input)(ifp, m);
2790 }
2791 
2792 static inline void
2793 mxge_clean_rx_done(struct mxge_slice_state *ss)
2794 {
2795 	mxge_rx_done_t *rx_done = &ss->rx_done;
2796 	int limit = 0;
2797 	uint16_t length;
2798 	uint16_t checksum;
2799 	int lro;
2800 
2801 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2802 	while (rx_done->entry[rx_done->idx].length != 0) {
2803 		length = ntohs(rx_done->entry[rx_done->idx].length);
2804 		rx_done->entry[rx_done->idx].length = 0;
2805 		checksum = rx_done->entry[rx_done->idx].checksum;
2806 		if (length <= (MHLEN - MXGEFW_PAD))
2807 			mxge_rx_done_small(ss, length, checksum, lro);
2808 		else
2809 			mxge_rx_done_big(ss, length, checksum, lro);
2810 		rx_done->cnt++;
2811 		rx_done->idx = rx_done->cnt & rx_done->mask;
2812 
2813 		/* limit potential for livelock */
2814 		if (__predict_false(++limit > rx_done->mask / 2))
2815 			break;
2816 	}
2817 #if defined(INET)  || defined (INET6)
2818 	tcp_lro_flush_all(&ss->lc);
2819 #endif
2820 }
2821 
2823 static inline void
2824 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2825 {
2826 	struct ifnet *ifp;
2827 	mxge_tx_ring_t *tx;
2828 	struct mbuf *m;
2829 	bus_dmamap_t map;
2830 	int idx;
2831 	int *flags;
2832 
2833 	tx = &ss->tx;
2834 	ifp = ss->sc->ifp;
2835 	while (tx->pkt_done != mcp_idx) {
2836 		idx = tx->done & tx->mask;
2837 		tx->done++;
2838 		m = tx->info[idx].m;
2839 		/* mbuf and DMA map only attached to the first
2840 		   segment per-mbuf */
2841 		if (m != NULL) {
2842 			ss->obytes += m->m_pkthdr.len;
2843 			if (m->m_flags & M_MCAST)
2844 				ss->omcasts++;
2845 			ss->opackets++;
2846 			tx->info[idx].m = NULL;
2847 			map = tx->info[idx].map;
2848 			bus_dmamap_unload(tx->dmat, map);
2849 			m_freem(m);
2850 		}
2851 		if (tx->info[idx].flag) {
2852 			tx->info[idx].flag = 0;
2853 			tx->pkt_done++;
2854 		}
2855 	}
2856 
2857 	/* If we have space, clear IFF_DRV_OACTIVE to tell the stack
2858 	   that it's OK to send packets */
2859 #ifdef IFNET_BUF_RING
2860 	flags = &ss->if_drv_flags;
2861 #else
2862 	flags = &ifp->if_drv_flags;
2863 #endif
2864 	mtx_lock(&ss->tx.mtx);
2865 	if ((*flags) & IFF_DRV_OACTIVE &&
2866 	    tx->req - tx->done < (tx->mask + 1)/4) {
2867 		*(flags) &= ~IFF_DRV_OACTIVE;
2868 		ss->tx.wake++;
2869 		mxge_start_locked(ss);
2870 	}
2871 #ifdef IFNET_BUF_RING
2872 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2873 		/* let the NIC stop polling this queue, since there
2874 		 * are no more transmits pending */
2876 		*tx->send_stop = 1;
2877 		tx->queue_active = 0;
2878 		tx->deactivate++;
2879 		wmb();
2881 	}
2882 #endif
2883 	mtx_unlock(&ss->tx.mtx);
2885 }
2886 
2887 static struct mxge_media_type mxge_xfp_media_types[] =
2888 {
2889 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2890 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2891 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2892 	{0,		(1 << 5),	"10GBASE-ER"},
2893 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2894 	{0,		(1 << 3),	"10GBASE-SW"},
2895 	{0,		(1 << 2),	"10GBASE-LW"},
2896 	{0,		(1 << 1),	"10GBASE-EW"},
2897 	{0,		(1 << 0),	"Reserved"}
2898 };
2899 static struct mxge_media_type mxge_sfp_media_types[] =
2900 {
2901 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2902 	{0,		(1 << 7),	"Reserved"},
2903 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2904 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2905 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2906 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2907 };
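
/*
 * Each row above maps one bit of the module's compliance byte to a
 * media type.  For example (illustrative): an SFP+ module returning
 * 0x10 in byte 3 has bit 4 set and is reported as 10GBASE-SR, while
 * a direct-attach cable returns 0 and matches the first (Twinax)
 * row in mxge_media_probe().
 */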
2908 
2909 static void
2910 mxge_media_set(mxge_softc_t *sc, int media_type)
2911 {
2912 
2914 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2915 		    0, NULL);
2916 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2917 	sc->current_media = media_type;
2918 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2919 }
2920 
2921 static void
2922 mxge_media_init(mxge_softc_t *sc)
2923 {
2924 	char *ptr;
2925 	int i;
2926 
2927 	ifmedia_removeall(&sc->media);
2928 	mxge_media_set(sc, IFM_AUTO);
2929 
2930 	/*
2931 	 * parse the product code to determine the interface type
2932 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2933 	 * after the 3rd dash in the driver's cached copy of the
2934 	 * EEPROM's product code string.
2935 	 */
2936 	ptr = sc->product_code_string;
2937 	if (ptr == NULL) {
2938 		device_printf(sc->dev, "Missing product code\n");
2939 		return;
2940 	}
2941 
2942 	for (i = 0; i < 3; i++, ptr++) {
2943 		ptr = strchr(ptr, '-');
2944 		if (ptr == NULL) {
2945 			device_printf(sc->dev,
2946 				      "only %d dashes in PC?!?\n", i);
2947 			return;
2948 		}
2949 	}
2950 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2951 		/* -C is CX4 */
2952 		sc->connector = MXGE_CX4;
2953 		mxge_media_set(sc, IFM_10G_CX4);
2954 	} else if (*ptr == 'Q') {
2955 		/* -Q is Quad Ribbon Fiber */
2956 		sc->connector = MXGE_QRF;
2957 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2958 		/* FreeBSD has no media type for Quad ribbon fiber */
2959 	} else if (*ptr == 'R') {
2960 		/* -R is XFP */
2961 		sc->connector = MXGE_XFP;
2962 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2963 		/* -S or -2S is SFP+ */
2964 		sc->connector = MXGE_SFP;
2965 	} else {
2966 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2967 	}
2968 }
2969 
2970 /*
2971  * Determine the media type for a NIC.  Some XFPs will identify
2972  * themselves only when their link is up, so this is initiated via a
2973  * link up interrupt.  However, this can potentially take up to
2974  * several milliseconds, so it is run via the watchdog routine, rather
2975  * than in the interrupt handler itself.
2976  */
2977 static void
2978 mxge_media_probe(mxge_softc_t *sc)
2979 {
2980 	mxge_cmd_t cmd;
2981 	char *cage_type;
2982 
2983 	struct mxge_media_type *mxge_media_types = NULL;
2984 	int i, err, ms, mxge_media_type_entries;
2985 	uint32_t byte;
2986 
2987 	sc->need_media_probe = 0;
2988 
2989 	if (sc->connector == MXGE_XFP) {
2990 		/* -R is XFP */
2991 		mxge_media_types = mxge_xfp_media_types;
2992 		mxge_media_type_entries =
2993 			nitems(mxge_xfp_media_types);
2994 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2995 		cage_type = "XFP";
2996 	} else if (sc->connector == MXGE_SFP) {
2997 		/* -S or -2S is SFP+ */
2998 		mxge_media_types = mxge_sfp_media_types;
2999 		mxge_media_type_entries =
3000 			nitems(mxge_sfp_media_types);
3001 		cage_type = "SFP+";
3002 		byte = 3;
3003 	} else {
3004 		/* nothing to do; media type cannot change */
3005 		return;
3006 	}
3007 
3008 	/*
3009 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
3010 	 * we try to determine what is in the cage by using the
3011 	 * firmware's I2C commands to read the module's 10GbE compliance
3012 	 * register.  We read just one byte, which may take over
3013 	 * a millisecond.
3014 	 */
3015 
3016 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3017 	cmd.data1 = byte;
3018 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3019 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3020 		device_printf(sc->dev, "failed to read XFP\n");
3021 	}
3022 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3023 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3024 	}
3025 	if (err != MXGEFW_CMD_OK) {
3026 		return;
3027 	}
3028 
3029 	/* now we wait for the data to be cached */
3030 	cmd.data0 = byte;
3031 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3032 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3033 		DELAY(1000);
3034 		cmd.data0 = byte;
3035 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3036 	}
3037 	if (err != MXGEFW_CMD_OK) {
3038 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3039 			      cage_type, err, ms);
3040 		return;
3041 	}
3042 
3043 	if (cmd.data0 == mxge_media_types[0].bitmask) {
3044 		if (mxge_verbose)
3045 			device_printf(sc->dev, "%s:%s\n", cage_type,
3046 				      mxge_media_types[0].name);
3047 		if (sc->current_media != mxge_media_types[0].flag) {
3048 			mxge_media_init(sc);
3049 			mxge_media_set(sc, mxge_media_types[0].flag);
3050 		}
3051 		return;
3052 	}
3053 	for (i = 1; i < mxge_media_type_entries; i++) {
3054 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3055 			if (mxge_verbose)
3056 				device_printf(sc->dev, "%s:%s\n",
3057 					      cage_type,
3058 					      mxge_media_types[i].name);
3059 
3060 			if (sc->current_media != mxge_media_types[i].flag) {
3061 				mxge_media_init(sc);
3062 				mxge_media_set(sc, mxge_media_types[i].flag);
3063 			}
3064 			return;
3065 		}
3066 	}
3067 	if (mxge_verbose)
3068 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3069 			      cage_type, cmd.data0);
3070 
3071 	return;
3072 }
3073 
3074 static void
3075 mxge_intr(void *arg)
3076 {
3077 	struct mxge_slice_state *ss = arg;
3078 	mxge_softc_t *sc = ss->sc;
3079 	mcp_irq_data_t *stats = ss->fw_stats;
3080 	mxge_tx_ring_t *tx = &ss->tx;
3081 	mxge_rx_done_t *rx_done = &ss->rx_done;
3082 	uint32_t send_done_count;
3083 	uint8_t valid;
3084 
3085 
3086 #ifndef IFNET_BUF_RING
3087 	/* an interrupt on a non-zero slice is implicitly valid
3088 	   since MSI-X irqs are not shared */
3089 	if (ss != sc->ss) {
3090 		mxge_clean_rx_done(ss);
3091 		*ss->irq_claim = be32toh(3);
3092 		return;
3093 	}
3094 #endif
3095 
3096 	/* make sure the DMA has finished */
3097 	if (!stats->valid) {
3098 		return;
3099 	}
3100 	valid = stats->valid;
3101 
3102 	if (sc->legacy_irq) {
3103 		/* lower legacy IRQ  */
3104 		*sc->irq_deassert = 0;
3105 		if (!mxge_deassert_wait)
3106 			/* don't wait for conf. that irq is low */
3107 			stats->valid = 0;
3108 	} else {
3109 		stats->valid = 0;
3110 	}
3111 
3112 	/* loop while waiting for legacy irq deassertion */
3113 	do {
3114 		/* check for transmit completes and receives */
3115 		send_done_count = be32toh(stats->send_done_count);
3116 		while ((send_done_count != tx->pkt_done) ||
3117 		       (rx_done->entry[rx_done->idx].length != 0)) {
3118 			if (send_done_count != tx->pkt_done)
3119 				mxge_tx_done(ss, (int)send_done_count);
3120 			mxge_clean_rx_done(ss);
3121 			send_done_count = be32toh(stats->send_done_count);
3122 		}
3123 		if (sc->legacy_irq && mxge_deassert_wait)
3124 			wmb();
3125 	} while (*((volatile uint8_t *) &stats->valid));
3126 
3127 	/* fw link & error stats meaningful only on the first slice */
3128 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3129 		if (sc->link_state != stats->link_up) {
3130 			sc->link_state = stats->link_up;
3131 			if (sc->link_state) {
3132 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3133 				if (mxge_verbose)
3134 					device_printf(sc->dev, "link up\n");
3135 			} else {
3136 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3137 				if (mxge_verbose)
3138 					device_printf(sc->dev, "link down\n");
3139 			}
3140 			sc->need_media_probe = 1;
3141 		}
3142 		if (sc->rdma_tags_available !=
3143 		    be32toh(stats->rdma_tags_available)) {
3144 			sc->rdma_tags_available =
3145 				be32toh(stats->rdma_tags_available);
3146 			device_printf(sc->dev, "RDMA timed out! %d tags "
3147 				      "left\n", sc->rdma_tags_available);
3148 		}
3149 
3150 		if (stats->link_down) {
3151 			sc->down_cnt += stats->link_down;
3152 			sc->link_state = 0;
3153 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3154 		}
3155 	}
3156 
3157 	/* check to see if we have rx token to pass back */
3158 	if (valid & 0x1)
3159 		*ss->irq_claim = be32toh(3);
3160 	*(ss->irq_claim + 1) = be32toh(3);
3161 }
3162 
3163 static void
3164 mxge_init(void *arg)
3165 {
3166 	mxge_softc_t *sc = arg;
3167 	struct ifnet *ifp = sc->ifp;
3168 
3170 	mtx_lock(&sc->driver_mtx);
3171 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3172 		(void) mxge_open(sc);
3173 	mtx_unlock(&sc->driver_mtx);
3174 }
3175 
3178 static void
3179 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3180 {
3181 	int i;
3182 
3183 #if defined(INET) || defined(INET6)
3184 	tcp_lro_free(&ss->lc);
3185 #endif
3186 	for (i = 0; i <= ss->rx_big.mask; i++) {
3187 		if (ss->rx_big.info[i].m == NULL)
3188 			continue;
3189 		bus_dmamap_unload(ss->rx_big.dmat,
3190 				  ss->rx_big.info[i].map);
3191 		m_freem(ss->rx_big.info[i].m);
3192 		ss->rx_big.info[i].m = NULL;
3193 	}
3194 
3195 	for (i = 0; i <= ss->rx_small.mask; i++) {
3196 		if (ss->rx_small.info[i].m == NULL)
3197 			continue;
3198 		bus_dmamap_unload(ss->rx_small.dmat,
3199 				  ss->rx_small.info[i].map);
3200 		m_freem(ss->rx_small.info[i].m);
3201 		ss->rx_small.info[i].m = NULL;
3202 	}
3203 
3204 	/* transmit ring used only on the first slice */
3205 	if (ss->tx.info == NULL)
3206 		return;
3207 
3208 	for (i = 0; i <= ss->tx.mask; i++) {
3209 		ss->tx.info[i].flag = 0;
3210 		if (ss->tx.info[i].m == NULL)
3211 			continue;
3212 		bus_dmamap_unload(ss->tx.dmat,
3213 				  ss->tx.info[i].map);
3214 		m_freem(ss->tx.info[i].m);
3215 		ss->tx.info[i].m = NULL;
3216 	}
3217 }
3218 
3219 static void
3220 mxge_free_mbufs(mxge_softc_t *sc)
3221 {
3222 	int slice;
3223 
3224 	for (slice = 0; slice < sc->num_slices; slice++)
3225 		mxge_free_slice_mbufs(&sc->ss[slice]);
3226 }
3227 
3228 static void
3229 mxge_free_slice_rings(struct mxge_slice_state *ss)
3230 {
3231 	int i;
3232 
3234 	if (ss->rx_done.entry != NULL)
3235 		mxge_dma_free(&ss->rx_done.dma);
3236 	ss->rx_done.entry = NULL;
3237 
3238 	if (ss->tx.req_bytes != NULL)
3239 		free(ss->tx.req_bytes, M_DEVBUF);
3240 	ss->tx.req_bytes = NULL;
3241 
3242 	if (ss->tx.seg_list != NULL)
3243 		free(ss->tx.seg_list, M_DEVBUF);
3244 	ss->tx.seg_list = NULL;
3245 
3246 	if (ss->rx_small.shadow != NULL)
3247 		free(ss->rx_small.shadow, M_DEVBUF);
3248 	ss->rx_small.shadow = NULL;
3249 
3250 	if (ss->rx_big.shadow != NULL)
3251 		free(ss->rx_big.shadow, M_DEVBUF);
3252 	ss->rx_big.shadow = NULL;
3253 
3254 	if (ss->tx.info != NULL) {
3255 		if (ss->tx.dmat != NULL) {
3256 			for (i = 0; i <= ss->tx.mask; i++) {
3257 				bus_dmamap_destroy(ss->tx.dmat,
3258 						   ss->tx.info[i].map);
3259 			}
3260 			bus_dma_tag_destroy(ss->tx.dmat);
3261 		}
3262 		free(ss->tx.info, M_DEVBUF);
3263 	}
3264 	ss->tx.info = NULL;
3265 
3266 	if (ss->rx_small.info != NULL) {
3267 		if (ss->rx_small.dmat != NULL) {
3268 			for (i = 0; i <= ss->rx_small.mask; i++) {
3269 				bus_dmamap_destroy(ss->rx_small.dmat,
3270 						   ss->rx_small.info[i].map);
3271 			}
3272 			bus_dmamap_destroy(ss->rx_small.dmat,
3273 					   ss->rx_small.extra_map);
3274 			bus_dma_tag_destroy(ss->rx_small.dmat);
3275 		}
3276 		free(ss->rx_small.info, M_DEVBUF);
3277 	}
3278 	ss->rx_small.info = NULL;
3279 
3280 	if (ss->rx_big.info != NULL) {
3281 		if (ss->rx_big.dmat != NULL) {
3282 			for (i = 0; i <= ss->rx_big.mask; i++) {
3283 				bus_dmamap_destroy(ss->rx_big.dmat,
3284 						   ss->rx_big.info[i].map);
3285 			}
3286 			bus_dmamap_destroy(ss->rx_big.dmat,
3287 					   ss->rx_big.extra_map);
3288 			bus_dma_tag_destroy(ss->rx_big.dmat);
3289 		}
3290 		free(ss->rx_big.info, M_DEVBUF);
3291 	}
3292 	ss->rx_big.info = NULL;
3293 }
3294 
3295 static void
3296 mxge_free_rings(mxge_softc_t *sc)
3297 {
3298 	int slice;
3299 
3300 	for (slice = 0; slice < sc->num_slices; slice++)
3301 		mxge_free_slice_rings(&sc->ss[slice]);
3302 }
3303 
3304 static int
3305 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3306 		       int tx_ring_entries)
3307 {
3308 	mxge_softc_t *sc = ss->sc;
3309 	size_t bytes;
3310 	int err, i;
3311 
3312 	/* allocate per-slice receive resources */
3313 
3314 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3315 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3316 
3317 	/* allocate the rx shadow rings */
3318 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3319 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3320 
3321 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3322 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3323 
3324 	/* allocate the rx host info rings */
3325 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3326 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3327 
3328 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3329 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3330 
3331 	/* allocate the rx busdma resources */
3332 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3333 				 1,			/* alignment */
3334 				 4096,			/* boundary */
3335 				 BUS_SPACE_MAXADDR,	/* low */
3336 				 BUS_SPACE_MAXADDR,	/* high */
3337 				 NULL, NULL,		/* filter */
3338 				 MHLEN,			/* maxsize */
3339 				 1,			/* num segs */
3340 				 MHLEN,			/* maxsegsize */
3341 				 BUS_DMA_ALLOCNOW,	/* flags */
3342 				 NULL, NULL,		/* lock */
3343 				 &ss->rx_small.dmat);	/* tag */
3344 	if (err != 0) {
3345 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3346 			      err);
3347 		return err;
3348 	}
3349 
3350 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3351 				 1,			/* alignment */
3352 #if MXGE_VIRT_JUMBOS
3353 				 4096,			/* boundary */
3354 #else
3355 				 0,			/* boundary */
3356 #endif
3357 				 BUS_SPACE_MAXADDR,	/* low */
3358 				 BUS_SPACE_MAXADDR,	/* high */
3359 				 NULL, NULL,		/* filter */
3360 				 3*4096,		/* maxsize */
3361 #if MXGE_VIRT_JUMBOS
3362 				 3,			/* num segs */
3363 				 4096,			/* maxsegsize*/
3364 #else
3365 				 1,			/* num segs */
3366 				 MJUM9BYTES,		/* maxsegsize*/
3367 #endif
3368 				 BUS_DMA_ALLOCNOW,	/* flags */
3369 				 NULL, NULL,		/* lock */
3370 				 &ss->rx_big.dmat);	/* tag */
3371 	if (err != 0) {
3372 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3373 			      err);
3374 		return err;
3375 	}
3376 	for (i = 0; i <= ss->rx_small.mask; i++) {
3377 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3378 					&ss->rx_small.info[i].map);
3379 		if (err != 0) {
3380 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3381 				      err);
3382 			return err;
3383 		}
3384 	}
3385 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3386 				&ss->rx_small.extra_map);
3387 	if (err != 0) {
3388 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3389 			      err);
3390 		return err;
3391 	}
3392 
3393 	for (i = 0; i <= ss->rx_big.mask; i++) {
3394 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3395 					&ss->rx_big.info[i].map);
3396 		if (err != 0) {
3397 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3398 				      err);
3399 			return err;
3400 		}
3401 	}
3402 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3403 				&ss->rx_big.extra_map);
3404 	if (err != 0) {
3405 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3406 			      err);
3407 		return err;
3408 	}
3409 
3410 	/* now allocate TX resources */
3411 
3412 #ifndef IFNET_BUF_RING
3413 	/* only use a single TX ring for now */
3414 	if (ss != ss->sc->ss)
3415 		return 0;
3416 #endif
3417 
3418 	ss->tx.mask = tx_ring_entries - 1;
3419 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3420 
3422 	/* allocate the tx request copy block */
3423 	bytes = 8 +
3424 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3425 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3426 	/* ensure req_list entries are aligned to 8 bytes */
3427 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3428 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3429 
3430 	/* allocate the tx busdma segment list */
3431 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3432 	ss->tx.seg_list = (bus_dma_segment_t *)
3433 		malloc(bytes, M_DEVBUF, M_WAITOK);
3434 
3435 	/* allocate the tx host info ring */
3436 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3437 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3438 
3439 	/* allocate the tx busdma resources */
3440 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3441 				 1,			/* alignment */
3442 				 sc->tx_boundary,	/* boundary */
3443 				 BUS_SPACE_MAXADDR,	/* low */
3444 				 BUS_SPACE_MAXADDR,	/* high */
3445 				 NULL, NULL,		/* filter */
3446 				 65536 + 256,		/* maxsize */
3447 				 ss->tx.max_desc - 2,	/* num segs */
3448 				 sc->tx_boundary,	/* maxsegsz */
3449 				 BUS_DMA_ALLOCNOW,	/* flags */
3450 				 NULL, NULL,		/* lock */
3451 				 &ss->tx.dmat);		/* tag */
3452 
3453 	if (err != 0) {
3454 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3455 			      err);
3456 		return err;
3457 	}
3458 
3459 	/* now use these tags to setup dmamaps for each slot
3460 	   in the ring */
3461 	for (i = 0; i <= ss->tx.mask; i++) {
3462 		err = bus_dmamap_create(ss->tx.dmat, 0,
3463 					&ss->tx.info[i].map);
3464 		if (err != 0) {
3465 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3466 				      err);
3467 			return err;
3468 		}
3469 	}
3470 	return 0;
3472 }
3473 
3474 static int
3475 mxge_alloc_rings(mxge_softc_t *sc)
3476 {
3477 	mxge_cmd_t cmd;
3478 	int tx_ring_size;
3479 	int tx_ring_entries, rx_ring_entries;
3480 	int err, slice;
3481 
3482 	/* get ring sizes */
3483 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3484 	tx_ring_size = cmd.data0;
3485 	if (err != 0) {
3486 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3487 		goto abort;
3488 	}
3489 
3490 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3491 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3492 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3493 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3494 	IFQ_SET_READY(&sc->ifp->if_snd);
3495 
3496 	for (slice = 0; slice < sc->num_slices; slice++) {
3497 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3498 					     rx_ring_entries,
3499 					     tx_ring_entries);
3500 		if (err != 0)
3501 			goto abort;
3502 	}
3503 	return 0;
3504 
3505 abort:
3506 	mxge_free_rings(sc);
3507 	return err;
3509 }
3510 
3512 static void
3513 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3514 {
3515 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3516 
3517 	if (bufsize < MCLBYTES) {
3518 		/* easy, everything fits in a single buffer */
3519 		*big_buf_size = MCLBYTES;
3520 		*cl_size = MCLBYTES;
3521 		*nbufs = 1;
3522 		return;
3523 	}
3524 
3525 	if (bufsize < MJUMPAGESIZE) {
3526 		/* still easy, everything still fits in a single buffer */
3527 		*big_buf_size = MJUMPAGESIZE;
3528 		*cl_size = MJUMPAGESIZE;
3529 		*nbufs = 1;
3530 		return;
3531 	}
3532 #if MXGE_VIRT_JUMBOS
3533 	/* now we need to use virtually contiguous buffers */
3534 	*cl_size = MJUM9BYTES;
3535 	*big_buf_size = 4096;
3536 	*nbufs = mtu / 4096 + 1;
3537 	/* needs to be a power of two, so round up */
3538 	if (*nbufs == 3)
3539 		*nbufs = 4;
3540 #else
3541 	*cl_size = MJUM9BYTES;
3542 	*big_buf_size = MJUM9BYTES;
3543 	*nbufs = 1;
3544 #endif
3545 }
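
/*
 * Worked example (illustrative, assuming MXGEFW_PAD == 2): a
 * 1500-byte MTU needs 1500 + 14 + 4 + 2 = 1520 bytes and fits a
 * standard 2K cluster; a 9000-byte MTU needs 9020 bytes, larger than
 * MJUMPAGESIZE (typically the 4K page size), so it falls through to
 * a single MJUM9BYTES (9K) cluster when MXGE_VIRT_JUMBOS is unset.
 */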
3546 
3547 static int
3548 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3549 {
3550 	mxge_softc_t *sc;
3551 	mxge_cmd_t cmd;
3552 	bus_dmamap_t map;
3553 	int err, i, slice;
3554 
3556 	sc = ss->sc;
3557 	slice = ss - sc->ss;
3558 
3559 #if defined(INET) || defined(INET6)
3560 	(void)tcp_lro_init(&ss->lc);
3561 #endif
3562 	ss->lc.ifp = sc->ifp;
3563 
3564 	/* get the lanai pointers to the send and receive rings */
3565 
3566 	err = 0;
3567 #ifndef IFNET_BUF_RING
3568 	/* We currently only send from the first slice */
3569 	if (slice == 0) {
3570 #endif
3571 		cmd.data0 = slice;
3572 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3573 		ss->tx.lanai =
3574 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3575 		ss->tx.send_go = (volatile uint32_t *)
3576 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3577 		ss->tx.send_stop = (volatile uint32_t *)
3578 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3579 #ifndef IFNET_BUF_RING
3580 	}
3581 #endif
3582 	cmd.data0 = slice;
3583 	err |= mxge_send_cmd(sc,
3584 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3585 	ss->rx_small.lanai =
3586 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3587 	cmd.data0 = slice;
3588 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3589 	ss->rx_big.lanai =
3590 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3591 
3592 	if (err != 0) {
3593 		device_printf(sc->dev,
3594 			      "failed to get ring sizes or locations\n");
3595 		return EIO;
3596 	}
3597 
3598 	/* stock receive rings */
3599 	for (i = 0; i <= ss->rx_small.mask; i++) {
3600 		map = ss->rx_small.info[i].map;
3601 		err = mxge_get_buf_small(ss, map, i);
3602 		if (err) {
3603 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3604 				      i, ss->rx_small.mask + 1);
3605 			return ENOMEM;
3606 		}
3607 	}
3608 	for (i = 0; i <= ss->rx_big.mask; i++) {
3609 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3610 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3611 	}
3612 	ss->rx_big.nbufs = nbufs;
3613 	ss->rx_big.cl_size = cl_size;
3614 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3615 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3616 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3617 		map = ss->rx_big.info[i].map;
3618 		err = mxge_get_buf_big(ss, map, i);
3619 		if (err) {
3620 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3621 				      i, ss->rx_big.mask + 1);
3622 			return ENOMEM;
3623 		}
3624 	}
3625 	return 0;
3626 }
3627 
3628 static int
3629 mxge_open(mxge_softc_t *sc)
3630 {
3631 	mxge_cmd_t cmd;
3632 	int err, big_bytes, nbufs, slice, cl_size, i;
3633 	bus_addr_t bus;
3634 	volatile uint8_t *itable;
3635 	struct mxge_slice_state *ss;
3636 
3637 	/* Copy the MAC address in case it was overridden */
3638 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3639 
3640 	err = mxge_reset(sc, 1);
3641 	if (err != 0) {
3642 		device_printf(sc->dev, "failed to reset\n");
3643 		return EIO;
3644 	}
3645 
3646 	if (sc->num_slices > 1) {
3647 		/* setup the indirection table */
3648 		cmd.data0 = sc->num_slices;
3649 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3650 				    &cmd);
3651 
3652 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3653 				     &cmd);
3654 		if (err != 0) {
3655 			device_printf(sc->dev,
3656 				      "failed to setup rss tables\n");
3657 			return err;
3658 		}
3659 
3660 		/* just enable an identity mapping */
3661 		itable = sc->sram + cmd.data0;
3662 		for (i = 0; i < sc->num_slices; i++)
3663 			itable[i] = (uint8_t)i;
3664 
3665 		cmd.data0 = 1;
3666 		cmd.data1 = mxge_rss_hash_type;
3667 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3668 		if (err != 0) {
3669 			device_printf(sc->dev, "failed to enable slices\n");
3670 			return err;
3671 		}
3672 	}
3673 
3675 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3676 
3677 	cmd.data0 = nbufs;
3678 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3679 			    &cmd);
3680 	/* error is only meaningful if we're trying to set
3681 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3682 	if (err && nbufs > 1) {
3683 		device_printf(sc->dev,
3684 			      "Failed to set alway-use-n to %d\n",
3685 			      nbufs);
3686 		return EIO;
3687 	}
3688 	/* Give the firmware the mtu and the big and small buffer
3689 	   sizes.  The firmware wants the big buf size to be a power
3690 	   of two. Luckily, FreeBSD's clusters are powers of two */
3691 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3692 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3693 	cmd.data0 = MHLEN - MXGEFW_PAD;
3694 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3695 			     &cmd);
3696 	cmd.data0 = big_bytes;
3697 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3698 
3699 	if (err != 0) {
3700 		device_printf(sc->dev, "failed to setup params\n");
3701 		goto abort;
3702 	}
3703 
3704 	/* Now give the firmware the pointer to the stats block */
3705 	for (slice = 0;
3706 #ifdef IFNET_BUF_RING
3707 	     slice < sc->num_slices;
3708 #else
3709 	     slice < 1;
3710 #endif
3711 	     slice++) {
3712 		ss = &sc->ss[slice];
3713 		cmd.data0 =
3714 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3715 		cmd.data1 =
3716 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3717 		cmd.data2 = sizeof(struct mcp_irq_data);
3718 		cmd.data2 |= (slice << 16);
3719 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3720 	}
3721 
3722 	if (err != 0) {
3723 		bus = sc->ss->fw_stats_dma.bus_addr;
3724 		bus += offsetof(struct mcp_irq_data, send_done_count);
3725 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3726 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3727 		err = mxge_send_cmd(sc,
3728 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3729 				    &cmd);
3730 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3731 		sc->fw_multicast_support = 0;
3732 	} else {
3733 		sc->fw_multicast_support = 1;
3734 	}
3735 
3736 	if (err != 0) {
3737 		device_printf(sc->dev, "failed to setup params\n");
3738 		goto abort;
3739 	}
3740 
3741 	for (slice = 0; slice < sc->num_slices; slice++) {
3742 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3743 		if (err != 0) {
3744 			device_printf(sc->dev, "couldn't open slice %d\n",
3745 				      slice);
3746 			goto abort;
3747 		}
3748 	}
3749 
3750 	/* Finally, start the firmware running */
3751 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3752 	if (err) {
3753 		device_printf(sc->dev, "Couldn't bring up link\n");
3754 		goto abort;
3755 	}
3756 #ifdef IFNET_BUF_RING
3757 	for (slice = 0; slice < sc->num_slices; slice++) {
3758 		ss = &sc->ss[slice];
3759 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3760 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3761 	}
3762 #endif
3763 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3764 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3765 
3766 	return 0;
3767 
3768 
3769 abort:
3770 	mxge_free_mbufs(sc);
3771 
3772 	return err;
3773 }
3774 
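/*
 * Bring the interface down.  Unless the NIC is already down
 * (down != 0, e.g. after a reboot seen by the watchdog), send
 * MXGEFW_CMD_ETHERNET_DOWN and wait for the firmware to confirm
 * it via the down interrupt before freeing the mbufs.
 */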
3775 static int
3776 mxge_close(mxge_softc_t *sc, int down)
3777 {
3778 	mxge_cmd_t cmd;
3779 	int err, old_down_cnt;
3780 #ifdef IFNET_BUF_RING
3781 	struct mxge_slice_state *ss;
3782 	int slice;
3783 #endif
3784 
3785 #ifdef IFNET_BUF_RING
3786 	for (slice = 0; slice < sc->num_slices; slice++) {
3787 		ss = &sc->ss[slice];
3788 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3789 	}
3790 #endif
3791 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3792 	if (!down) {
3793 		old_down_cnt = sc->down_cnt;
3794 		wmb();
3795 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3796 		if (err) {
3797 			device_printf(sc->dev,
3798 				      "Couldn't bring down link\n");
3799 		}
3800 		if (old_down_cnt == sc->down_cnt) {
3801 			/* wait for down irq */
3802 			DELAY(10 * sc->intr_coal_delay);
3803 		}
3804 		wmb();
3805 		if (old_down_cnt == sc->down_cnt) {
3806 			device_printf(sc->dev, "never got down irq\n");
3807 		}
3808 	}
3809 	mxge_free_mbufs(sc);
3810 
3811 	return 0;
3812 }
3813 
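/*
 * Record the negotiated PCIe link width and set the max read
 * request size to 4KB (encoding 5 in bits 14:12 of the PCIe
 * device control register).  The value is cached in sc->pectl so
 * a watchdog reset can restore it, and busmastering is enabled.
 */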
3814 static void
3815 mxge_setup_cfg_space(mxge_softc_t *sc)
3816 {
3817 	device_t dev = sc->dev;
3818 	int reg;
3819 	uint16_t lnk, pectl;
3820 
3821 	/* find the PCIe link width and set max read request to 4KB */
3822 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3823 		lnk = pci_read_config(dev, reg + 0x12, 2);
3824 		sc->link_width = (lnk >> 4) & 0x3f;
3825 
3826 		if (sc->pectl == 0) {
3827 			pectl = pci_read_config(dev, reg + 0x8, 2);
3828 			pectl = (pectl & ~0x7000) | (5 << 12);
3829 			pci_write_config(dev, reg + 0x8, pectl, 2);
3830 			sc->pectl = pectl;
3831 		} else {
3832 			/* restore saved pectl after watchdog reset */
3833 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3834 		}
3835 	}
3836 
3837 	/* Enable DMA and Memory space access */
3838 	pci_enable_busmaster(dev);
3839 }
3840 
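/*
 * Read the firmware's reboot status through the vendor-specific
 * capability: enable 32-bit read mode, point the window at the
 * status word, and fetch it via config space.  Used when the NIC
 * appears to have rebooted and its BARs may not be usable.
 */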
3841 static uint32_t
3842 mxge_read_reboot(mxge_softc_t *sc)
3843 {
3844 	device_t dev = sc->dev;
3845 	uint32_t vs;
3846 
3847 	/* find the vendor specific offset */
3848 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3849 		device_printf(sc->dev,
3850 			      "could not find vendor specific offset\n");
3851 		return (uint32_t)-1;
3852 	}
3853 	/* enable read32 mode */
3854 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3855 	/* tell NIC which register to read */
3856 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3857 	return (pci_read_config(dev, vs + 0x14, 4));
3858 }
3859 
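/*
 * Recover from a hang detected by the watchdog.  If config space
 * reads back as all-ones the device has disappeared; if the
 * busmaster bit is clear the NIC rebooted and lost its config
 * space, so quiesce transmit, restore config space, reload the
 * firmware and re-open the interface.
 */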
3860 static void
3861 mxge_watchdog_reset(mxge_softc_t *sc)
3862 {
3863 	struct pci_devinfo *dinfo;
3864 	struct mxge_slice_state *ss;
3865 	int err, running, s, num_tx_slices = 1;
3866 	uint32_t reboot;
3867 	uint16_t cmd;
3868 
3869 	err = ENXIO;
3870 
3871 	device_printf(sc->dev, "Watchdog reset!\n");
3872 
3873 	/*
3874 	 * check to see if the NIC rebooted.  If it did, then all of
3875 	 * PCI config space has been reset, and things like the
3876 	 * busmaster bit will be zero.  If this is the case, then we
3877 	 * must restore PCI config space before the NIC can be used
3878 	 * again
3879 	 */
3880 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3881 	if (cmd == 0xffff) {
3882 		/*
3883 		 * maybe the watchdog caught the NIC rebooting; wait
3884 		 * up to 100ms for it to finish.  If it does not come
3885 		 * back, then give up
3886 		 */
3887 		DELAY(1000*100);
3888 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3889 		if (cmd == 0xffff) {
3890 			device_printf(sc->dev, "NIC disappeared!\n");
3891 		}
3892 	}
3893 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3894 		/* print the reboot status */
3895 		reboot = mxge_read_reboot(sc);
3896 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3897 			      reboot);
3898 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3899 		if (running) {
3900 
3901 			/*
3902 			 * quiesce NIC so that TX routines will not try to
3903 			 * xmit after restoration of BAR
3904 			 */
3905 
3906 			/* Mark the link as down */
3907 			if (sc->link_state) {
3908 				sc->link_state = 0;
3909 				if_link_state_change(sc->ifp,
3910 						     LINK_STATE_DOWN);
3911 			}
3912 #ifdef IFNET_BUF_RING
3913 			num_tx_slices = sc->num_slices;
3914 #endif
3915 			/* grab all TX locks to ensure no transmits are in flight */
3916 			for (s = 0; s < num_tx_slices; s++) {
3917 				ss = &sc->ss[s];
3918 				mtx_lock(&ss->tx.mtx);
3919 			}
3920 			mxge_close(sc, 1);
3921 		}
3922 		/* restore PCI configuration space */
3923 		dinfo = device_get_ivars(sc->dev);
3924 		pci_cfg_restore(sc->dev, dinfo);
3925 
3926 		/* and redo any changes we made to our config space */
3927 		mxge_setup_cfg_space(sc);
3928 
3929 		/* reload f/w */
3930 		err = mxge_load_firmware(sc, 0);
3931 		if (err) {
3932 			device_printf(sc->dev,
3933 				      "Unable to re-load f/w\n");
3934 		}
3935 		if (running) {
3936 			if (!err)
3937 				err = mxge_open(sc);
3938 			/* release all TX locks */
3939 			for (s = 0; s < num_tx_slices; s++) {
3940 				ss = &sc->ss[s];
3941 #ifdef IFNET_BUF_RING
3942 				mxge_start_locked(ss);
3943 #endif
3944 				mtx_unlock(&ss->tx.mtx);
3945 			}
3946 		}
3947 		sc->watchdog_resets++;
3948 	} else {
3949 		device_printf(sc->dev,
3950 			      "NIC did not reboot, not resetting\n");
3951 		err = 0;
3952 	}
3953 	if (err) {
3954 		device_printf(sc->dev, "watchdog reset failed\n");
3955 	} else {
3956 		if (sc->dying == 2)
3957 			sc->dying = 0;
3958 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3959 	}
3960 }
3961 
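/*
 * Taskqueue wrapper that runs the (slow) watchdog reset with the
 * driver lock held.
 */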
3962 static void
3963 mxge_watchdog_task(void *arg, int pending)
3964 {
3965 	mxge_softc_t *sc = arg;
3966 
3967 
3968 	mtx_lock(&sc->driver_mtx);
3969 	mxge_watchdog_reset(sc);
3970 	mtx_unlock(&sc->driver_mtx);
3971 }
3972 
3973 static void
3974 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3975 {
3976 	tx = &sc->ss[slice].tx;
3977 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3978 	device_printf(sc->dev,
3979 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3980 		      tx->req, tx->done, tx->queue_active);
3981 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3982 			      tx->activate, tx->deactivate);
3983 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3984 		      tx->pkt_done,
3985 		      be32toh(sc->ss->fw_stats->send_done_count));
3986 }
3987 
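/*
 * Called from mxge_tick().  A slice is considered stuck when
 * transmits have been outstanding across two checks with no
 * completions in between and the NIC was not blocked by received
 * pause frames; in that case a watchdog reset is scheduled.
 */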
3988 static int
3989 mxge_watchdog(mxge_softc_t *sc)
3990 {
3991 	mxge_tx_ring_t *tx;
3992 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3993 	int i, err = 0;
3994 
3995 	/* see if we have outstanding transmits, which
3996 	   have been pending for more than mxge_ticks */
3997 	for (i = 0;
3998 #ifdef IFNET_BUF_RING
3999 	     (i < sc->num_slices) && (err == 0);
4000 #else
4001 	     (i < 1) && (err == 0);
4002 #endif
4003 	     i++) {
4004 		tx = &sc->ss[i].tx;
4005 		if (tx->req != tx->done &&
4006 		    tx->watchdog_req != tx->watchdog_done &&
4007 		    tx->done == tx->watchdog_done) {
4008 			/* check for pause blocking before resetting */
4009 			if (tx->watchdog_rx_pause == rx_pause) {
4010 				mxge_warn_stuck(sc, tx, i);
4011 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4012 				return (ENXIO);
4013 			}
4014 			else
4015 				device_printf(sc->dev, "Flow control blocking "
4016 					      "xmits, check link partner\n");
4017 		}
4018 
4019 		tx->watchdog_req = tx->req;
4020 		tx->watchdog_done = tx->done;
4021 		tx->watchdog_rx_pause = rx_pause;
4022 	}
4023 
4024 	if (sc->need_media_probe)
4025 		mxge_media_probe(sc);
4026 	return (err);
4027 }
4028 
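/*
 * if_get_counter method: sum the per-slice software counters for
 * the statistics this driver maintains itself and fall back to
 * the generic ifnet counters for the rest.
 */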
4029 static uint64_t
4030 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4031 {
4032 	struct mxge_softc *sc;
4033 	uint64_t rv;
4034 
4035 	sc = if_getsoftc(ifp);
4036 	rv = 0;
4037 
4038 	switch (cnt) {
4039 	case IFCOUNTER_IPACKETS:
4040 		for (int s = 0; s < sc->num_slices; s++)
4041 			rv += sc->ss[s].ipackets;
4042 		return (rv);
4043 	case IFCOUNTER_OPACKETS:
4044 		for (int s = 0; s < sc->num_slices; s++)
4045 			rv += sc->ss[s].opackets;
4046 		return (rv);
4047 	case IFCOUNTER_OERRORS:
4048 		for (int s = 0; s < sc->num_slices; s++)
4049 			rv += sc->ss[s].oerrors;
4050 		return (rv);
4051 #ifdef IFNET_BUF_RING
4052 	case IFCOUNTER_OBYTES:
4053 		for (int s = 0; s < sc->num_slices; s++)
4054 			rv += sc->ss[s].obytes;
4055 		return (rv);
4056 	case IFCOUNTER_OMCASTS:
4057 		for (int s = 0; s < sc->num_slices; s++)
4058 			rv += sc->ss[s].omcasts;
4059 		return (rv);
4060 	case IFCOUNTER_OQDROPS:
4061 		for (int s = 0; s < sc->num_slices; s++)
4062 			rv += sc->ss[s].tx.br->br_drops;
4063 		return (rv);
4064 #endif
4065 	default:
4066 		return (if_get_counter_default(ifp, cnt));
4067 	}
4068 }
4069 
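/*
 * Periodic housekeeping callout: run the transmit watchdog on
 * every 4th tick while the interface is running, and check that
 * an idle NIC has not lost its busmaster bit to a hardware
 * fault.  Note that pkts is never updated below, so the idle
 * path (4x slower polling) is currently always taken.
 */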
4070 static void
4071 mxge_tick(void *arg)
4072 {
4073 	mxge_softc_t *sc = arg;
4074 	u_long pkts = 0;
4075 	int err = 0;
4076 	int running, ticks;
4077 	uint16_t cmd;
4078 
4079 	ticks = mxge_ticks;
4080 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4081 	if (running) {
4082 		if (!sc->watchdog_countdown) {
4083 			err = mxge_watchdog(sc);
4084 			sc->watchdog_countdown = 4;
4085 		}
4086 		sc->watchdog_countdown--;
4087 	}
4088 	if (pkts == 0) {
4089 		/* ensure NIC did not suffer h/w fault while idle */
4090 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4091 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4092 			sc->dying = 2;
4093 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4094 			err = ENXIO;
4095 		}
4096 		/* look less often if NIC is idle */
4097 		ticks *= 4;
4098 	}
4099 
4100 	if (err == 0)
4101 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4102 
4103 }
4104 
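/* Media changes are not supported; the ifmedia change callback always fails. */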
4105 static int
4106 mxge_media_change(struct ifnet *ifp)
4107 {
4108 	return EINVAL;
4109 }
4110 
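/*
 * Change the MTU.  The full frame (MTU plus ethernet and VLAN
 * headers) must fit within the firmware limit; if re-opening
 * with the new MTU fails, fall back to the old one.  From
 * userland this is reached via e.g. "ifconfig mxge0 mtu 9000"
 * (hypothetical unit number).
 */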
4111 static int
4112 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4113 {
4114 	struct ifnet *ifp = sc->ifp;
4115 	int real_mtu, old_mtu;
4116 	int err = 0;
4117 
4118 
4119 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4120 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4121 		return EINVAL;
4122 	mtx_lock(&sc->driver_mtx);
4123 	old_mtu = ifp->if_mtu;
4124 	ifp->if_mtu = mtu;
4125 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4126 		mxge_close(sc, 0);
4127 		err = mxge_open(sc);
4128 		if (err != 0) {
4129 			ifp->if_mtu = old_mtu;
4130 			mxge_close(sc, 0);
4131 			(void) mxge_open(sc);
4132 		}
4133 	}
4134 	mtx_unlock(&sc->driver_mtx);
4135 	return err;
4136 }
4137 
4138 static void
4139 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4140 {
4141 	mxge_softc_t *sc = ifp->if_softc;
4142 
4143 
4144 	if (sc == NULL)
4145 		return;
4146 	ifmr->ifm_status = IFM_AVALID;
4147 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4148 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4149 	ifmr->ifm_active |= sc->current_media;
4150 }
4151 
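/*
 * Read SFP/XFP module EEPROM bytes (i2c device 0xA0 or 0xA2) for
 * SIOCGI2C.  Each byte is fetched by asking the firmware to read
 * it and then polling MXGEFW_CMD_I2C_BYTE for up to 50ms until
 * the byte has been cached.
 */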
4152 static int
4153 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4154 {
4155 	mxge_cmd_t cmd;
4156 	uint32_t i2c_args;
4157 	int i, ms, err;
4158 
4159 
4160 	if (i2c->dev_addr != 0xA0 &&
4161 	    i2c->dev_addr != 0xA2)
4162 		return (EINVAL);
4163 	if (i2c->len > sizeof(i2c->data))
4164 		return (EINVAL);
4165 
4166 	for (i = 0; i < i2c->len; i++) {
4167 		i2c_args = i2c->dev_addr << 0x8;
4168 		i2c_args |= i2c->offset + i;
4169 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4170 		cmd.data1 = i2c_args;
4171 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4172 
4173 		if (err != MXGEFW_CMD_OK)
4174 			return (EIO);
4175 		/* now we wait for the data to be cached */
4176 		cmd.data0 = i2c_args & 0xff;
4177 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4178 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4179 			cmd.data0 = i2c_args & 0xff;
4180 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4181 			if (err == EBUSY)
4182 				DELAY(1000);
4183 		}
4184 		if (err != MXGEFW_CMD_OK)
4185 			return (EIO);
4186 		i2c->data[i] = cmd.data0;
4187 	}
4188 	return (0);
4189 }
4190 
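/*
 * ifnet ioctl handler.  State-changing requests take the driver
 * lock and are refused once the device is marked dying.
 */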
4191 static int
4192 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4193 {
4194 	mxge_softc_t *sc = ifp->if_softc;
4195 	struct ifreq *ifr = (struct ifreq *)data;
4196 	struct ifi2creq i2c;
4197 	int err, mask;
4198 
4199 	err = 0;
4200 	switch (command) {
4201 	case SIOCSIFMTU:
4202 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4203 		break;
4204 
4205 	case SIOCSIFFLAGS:
4206 		mtx_lock(&sc->driver_mtx);
4207 		if (sc->dying) {
4208 			mtx_unlock(&sc->driver_mtx);
4209 			return EINVAL;
4210 		}
4211 		if (ifp->if_flags & IFF_UP) {
4212 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4213 				err = mxge_open(sc);
4214 			} else {
4215 				/* take care of promisc and allmulti
4216 				   flag changes */
4217 				mxge_change_promisc(sc,
4218 						    ifp->if_flags & IFF_PROMISC);
4219 				mxge_set_multicast_list(sc);
4220 			}
4221 		} else {
4222 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4223 				mxge_close(sc, 0);
4224 			}
4225 		}
4226 		mtx_unlock(&sc->driver_mtx);
4227 		break;
4228 
4229 	case SIOCADDMULTI:
4230 	case SIOCDELMULTI:
4231 		mtx_lock(&sc->driver_mtx);
4232 		if (sc->dying) {
4233 			mtx_unlock(&sc->driver_mtx);
4234 			return (EINVAL);
4235 		}
4236 		mxge_set_multicast_list(sc);
4237 		mtx_unlock(&sc->driver_mtx);
4238 		break;
4239 
4240 	case SIOCSIFCAP:
4241 		mtx_lock(&sc->driver_mtx);
4242 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4243 		if (mask & IFCAP_TXCSUM) {
4244 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4245 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4246 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4247 			} else {
4248 				ifp->if_capenable |= IFCAP_TXCSUM;
4249 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4250 			}
4251 		} else if (mask & IFCAP_RXCSUM) {
4252 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4253 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4254 			} else {
4255 				ifp->if_capenable |= IFCAP_RXCSUM;
4256 			}
4257 		}
4258 		if (mask & IFCAP_TSO4) {
4259 			if (IFCAP_TSO4 & ifp->if_capenable) {
4260 				ifp->if_capenable &= ~IFCAP_TSO4;
4261 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4262 				ifp->if_capenable |= IFCAP_TSO4;
4263 				ifp->if_hwassist |= CSUM_TSO;
4264 			} else {
4265 				printf("mxge requires tx checksum offload"
4266 				       " be enabled to use TSO\n");
4267 				err = EINVAL;
4268 			}
4269 		}
4270 #if IFCAP_TSO6
4271 		if (mask & IFCAP_TXCSUM_IPV6) {
4272 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4273 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4274 						       | IFCAP_TSO6);
4275 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4276 						      | CSUM_UDP_IPV6);
4277 			} else {
4278 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4279 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4280 						     | CSUM_UDP_IPV6);
4281 			}
4282 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4283 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4284 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4285 			} else {
4286 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4287 			}
4288 		}
4289 		if (mask & IFCAP_TSO6) {
4290 			if (IFCAP_TSO6 & ifp->if_capenable) {
4291 				ifp->if_capenable &= ~IFCAP_TSO6;
4292 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4293 				ifp->if_capenable |= IFCAP_TSO6;
4294 				ifp->if_hwassist |= CSUM_TSO;
4295 			} else {
4296 				printf("mxge requires tx checksum offload"
4297 				       " be enabled to use TSO\n");
4298 				err = EINVAL;
4299 			}
4300 		}
4301 #endif /*IFCAP_TSO6 */
4302 
4303 		if (mask & IFCAP_LRO)
4304 			ifp->if_capenable ^= IFCAP_LRO;
4305 		if (mask & IFCAP_VLAN_HWTAGGING)
4306 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4307 		if (mask & IFCAP_VLAN_HWTSO)
4308 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4309 
4310 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4311 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4312 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4313 
4314 		mtx_unlock(&sc->driver_mtx);
4315 		VLAN_CAPABILITIES(ifp);
4316 
4317 		break;
4318 
4319 	case SIOCGIFMEDIA:
4320 		mtx_lock(&sc->driver_mtx);
4321 		if (sc->dying) {
4322 			mtx_unlock(&sc->driver_mtx);
4323 			return (EINVAL);
4324 		}
4325 		mxge_media_probe(sc);
4326 		mtx_unlock(&sc->driver_mtx);
4327 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4328 				    &sc->media, command);
4329 		break;
4330 
4331 	case SIOCGI2C:
4332 		if (sc->connector != MXGE_XFP &&
4333 		    sc->connector != MXGE_SFP) {
4334 			err = ENXIO;
4335 			break;
4336 		}
4337 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4338 		if (err != 0)
4339 			break;
4340 		mtx_lock(&sc->driver_mtx);
4341 		if (sc->dying) {
4342 			mtx_unlock(&sc->driver_mtx);
4343 			return (EINVAL);
4344 		}
4345 		err = mxge_fetch_i2c(sc, &i2c);
4346 		mtx_unlock(&sc->driver_mtx);
4347 		if (err == 0)
4348 			err = copyout(&i2c, ifr->ifr_ifru.ifru_data,
4349 			    sizeof(i2c));
4350 		break;
4351 	default:
4352 		err = ether_ioctl(ifp, command, data);
4353 		break;
4354 	}
4355 	return err;
4356 }
4357 
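/*
 * Fetch the hw.mxge.* loader tunables and clamp them to sane
 * ranges.  A minimal sketch of /boot/loader.conf usage (the
 * values shown are only examples):
 *
 *	hw.mxge.max_slices=4
 *	hw.mxge.intr_coal_delay=30
 *	hw.mxge.flow_control_enabled=1
 */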
4358 static void
4359 mxge_fetch_tunables(mxge_softc_t *sc)
4360 {
4361 
4362 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4363 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4364 			  &mxge_flow_control);
4365 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4366 			  &mxge_intr_coal_delay);
4367 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4368 			  &mxge_nvidia_ecrc_enable);
4369 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4370 			  &mxge_force_firmware);
4371 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4372 			  &mxge_deassert_wait);
4373 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4374 			  &mxge_verbose);
4375 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4376 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4377 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4378 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4379 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4380 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4381 
4382 	if (bootverbose)
4383 		mxge_verbose = 1;
4384 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4385 		mxge_intr_coal_delay = 30;
4386 	if (mxge_ticks == 0)
4387 		mxge_ticks = hz / 2;
4388 	sc->pause = mxge_flow_control;
4389 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4390 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4391 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4392 	}
4393 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4394 	    mxge_initial_mtu < ETHER_MIN_LEN)
4395 		mxge_initial_mtu = ETHERMTU_JUMBO;
4396 
4397 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4398 		mxge_throttle = MXGE_MAX_THROTTLE;
4399 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4400 		mxge_throttle = MXGE_MIN_THROTTLE;
4401 	sc->throttle = mxge_throttle;
4402 }
4403 
4404 
4405 static void
4406 mxge_free_slices(mxge_softc_t *sc)
4407 {
4408 	struct mxge_slice_state *ss;
4409 	int i;
4410 
4411 
4412 	if (sc->ss == NULL)
4413 		return;
4414 
4415 	for (i = 0; i < sc->num_slices; i++) {
4416 		ss = &sc->ss[i];
4417 		if (ss->fw_stats != NULL) {
4418 			mxge_dma_free(&ss->fw_stats_dma);
4419 			ss->fw_stats = NULL;
4420 #ifdef IFNET_BUF_RING
4421 			if (ss->tx.br != NULL) {
4422 				drbr_free(ss->tx.br, M_DEVBUF);
4423 				ss->tx.br = NULL;
4424 			}
4425 #endif
4426 			mtx_destroy(&ss->tx.mtx);
4427 		}
4428 		if (ss->rx_done.entry != NULL) {
4429 			mxge_dma_free(&ss->rx_done.dma);
4430 			ss->rx_done.entry = NULL;
4431 		}
4432 	}
4433 	free(sc->ss, M_DEVBUF);
4434 	sc->ss = NULL;
4435 }
4436 
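/*
 * Allocate the per-slice state: a receive completion queue for
 * every slice, plus the firmware stats block, tx mutex and (with
 * IFNET_BUF_RING) a 2048-entry buf_ring for each slice that
 * transmits.
 */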
4437 static int
4438 mxge_alloc_slices(mxge_softc_t *sc)
4439 {
4440 	mxge_cmd_t cmd;
4441 	struct mxge_slice_state *ss;
4442 	size_t bytes;
4443 	int err, i, max_intr_slots;
4444 
4445 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4446 	if (err != 0) {
4447 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4448 		return err;
4449 	}
4450 	sc->rx_ring_size = cmd.data0;
4451 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4452 
4453 	bytes = sizeof (*sc->ss) * sc->num_slices;
4454 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4455 	if (sc->ss == NULL)
4456 		return (ENOMEM);
4457 	for (i = 0; i < sc->num_slices; i++) {
4458 		ss = &sc->ss[i];
4459 
4460 		ss->sc = sc;
4461 
4462 		/* allocate per-slice rx interrupt queues */
4463 
4464 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4465 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4466 		if (err != 0)
4467 			goto abort;
4468 		ss->rx_done.entry = ss->rx_done.dma.addr;
4469 		bzero(ss->rx_done.entry, bytes);
4470 
4471 		/*
4472 		 * allocate the per-slice firmware stats; stats
4473 		 * (including tx) are used only on the first
4474 		 * slice for now
4475 		 */
4476 #ifndef IFNET_BUF_RING
4477 		if (i > 0)
4478 			continue;
4479 #endif
4480 
4481 		bytes = sizeof (*ss->fw_stats);
4482 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4483 				     sizeof (*ss->fw_stats), 64);
4484 		if (err != 0)
4485 			goto abort;
4486 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4487 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4488 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4489 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4490 #ifdef IFNET_BUF_RING
4491 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4492 					   &ss->tx.mtx);
4493 #endif
4494 	}
4495 
4496 	return (0);
4497 
4498 abort:
4499 	mxge_free_slices(sc);
4500 	return (ENOMEM);
4501 }
4502 
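/*
 * Decide how many slices (queue pairs) to use.  Multiple slices
 * require the tunable to allow it, an SMP system, at least two
 * MSI-X vectors and the RSS firmware; the count is then capped
 * by the MSI-X count and the CPU count (or hw.mxge.max_slices)
 * and rounded down to a power of two.
 */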
4503 static void
4504 mxge_slice_probe(mxge_softc_t *sc)
4505 {
4506 	mxge_cmd_t cmd;
4507 	char *old_fw;
4508 	int msix_cnt, status, max_intr_slots;
4509 
4510 	sc->num_slices = 1;
4511 	/*
4512 	 *  don't enable multiple slices if they are disabled by the
4513 	 *  tunable, or if this is not an SMP system
4514 	 */
4515 
4516 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4517 		return;
4518 
4519 	/* see how many MSI-X interrupts are available */
4520 	msix_cnt = pci_msix_count(sc->dev);
4521 	if (msix_cnt < 2)
4522 		return;
4523 
4524 	/* now load the slice-aware firmware to see what it supports */
4525 	old_fw = sc->fw_name;
4526 	if (old_fw == mxge_fw_aligned)
4527 		sc->fw_name = mxge_fw_rss_aligned;
4528 	else
4529 		sc->fw_name = mxge_fw_rss_unaligned;
4530 	status = mxge_load_firmware(sc, 0);
4531 	if (status != 0) {
4532 		device_printf(sc->dev, "Falling back to a single slice\n");
4533 		return;
4534 	}
4535 
4536 	/* try to send a reset command to the card to see if it
4537 	   is alive */
4538 	memset(&cmd, 0, sizeof (cmd));
4539 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4540 	if (status != 0) {
4541 		device_printf(sc->dev, "failed reset\n");
4542 		goto abort_with_fw;
4543 	}
4544 
4545 	/* get rx ring size */
4546 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4547 	if (status != 0) {
4548 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4549 		goto abort_with_fw;
4550 	}
4551 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4552 
4553 	/* tell it the size of the interrupt queues */
4554 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4555 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4556 	if (status != 0) {
4557 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4558 		goto abort_with_fw;
4559 	}
4560 
4561 	/* ask the maximum number of slices it supports */
4562 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4563 	if (status != 0) {
4564 		device_printf(sc->dev,
4565 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4566 		goto abort_with_fw;
4567 	}
4568 	sc->num_slices = cmd.data0;
4569 	if (sc->num_slices > msix_cnt)
4570 		sc->num_slices = msix_cnt;
4571 
4572 	if (mxge_max_slices == -1) {
4573 		/* cap to number of CPUs in system */
4574 		if (sc->num_slices > mp_ncpus)
4575 			sc->num_slices = mp_ncpus;
4576 	} else {
4577 		if (sc->num_slices > mxge_max_slices)
4578 			sc->num_slices = mxge_max_slices;
4579 	}
4580 	/* make sure it is a power of two */
4581 	while (sc->num_slices & (sc->num_slices - 1))
4582 		sc->num_slices--;
4583 
4584 	if (mxge_verbose)
4585 		device_printf(sc->dev, "using %d slices\n",
4586 			      sc->num_slices);
4587 
4588 	return;
4589 
4590 abort_with_fw:
4591 	sc->fw_name = old_fw;
4592 	(void) mxge_load_firmware(sc, 0);
4593 }
4594 
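/*
 * Allocate one MSI-X vector per slice (the MSI-X table lives in
 * BAR2) and wire each vector to mxge_intr() with its slice as
 * the argument.
 */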
4595 static int
4596 mxge_add_msix_irqs(mxge_softc_t *sc)
4597 {
4598 	size_t bytes;
4599 	int count, err, i, rid;
4600 
4601 	rid = PCIR_BAR(2);
4602 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4603 						    &rid, RF_ACTIVE);
4604 
4605 	if (sc->msix_table_res == NULL) {
4606 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4607 		return ENXIO;
4608 	}
4609 
4610 	count = sc->num_slices;
4611 	err = pci_alloc_msix(sc->dev, &count);
4612 	if (err != 0) {
4613 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4614 			      "err = %d\n", sc->num_slices, err);
4615 		goto abort_with_msix_table;
4616 	}
4617 	if (count < sc->num_slices) {
4618 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4619 			      count, sc->num_slices);
4620 		device_printf(sc->dev,
4621 			      "Try setting hw.mxge.max_slices to %d\n",
4622 			      count);
4623 		err = ENOSPC;
4624 		goto abort_with_msix;
4625 	}
4626 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4627 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4628 	if (sc->msix_irq_res == NULL) {
4629 		err = ENOMEM;
4630 		goto abort_with_msix;
4631 	}
4632 
4633 	for (i = 0; i < sc->num_slices; i++) {
4634 		rid = i + 1;
4635 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4636 							  SYS_RES_IRQ,
4637 							  &rid, RF_ACTIVE);
4638 		if (sc->msix_irq_res[i] == NULL) {
4639 			device_printf(sc->dev, "couldn't allocate IRQ res"
4640 				      " for message %d\n", i);
4641 			err = ENXIO;
4642 			goto abort_with_res;
4643 		}
4644 	}
4645 
4646 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4647 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4648 
4649 	for (i = 0; i < sc->num_slices; i++) {
4650 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4651 				     INTR_TYPE_NET | INTR_MPSAFE,
4652 #if __FreeBSD_version > 700030
4653 				     NULL,
4654 #endif
4655 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4656 		if (err != 0) {
4657 			device_printf(sc->dev, "couldn't setup intr for "
4658 				      "message %d\n", i);
4659 			goto abort_with_intr;
4660 		}
4661 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4662 				  sc->msix_ih[i], "s%d", i);
4663 	}
4664 
4665 	if (mxge_verbose) {
4666 		device_printf(sc->dev, "using %d msix IRQs:",
4667 			      sc->num_slices);
4668 		for (i = 0; i < sc->num_slices; i++)
4669 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4670 		printf("\n");
4671 	}
4672 	return (0);
4673 
4674 abort_with_intr:
4675 	for (i = 0; i < sc->num_slices; i++) {
4676 		if (sc->msix_ih[i] != NULL) {
4677 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4678 					  sc->msix_ih[i]);
4679 			sc->msix_ih[i] = NULL;
4680 		}
4681 	}
4682 	free(sc->msix_ih, M_DEVBUF);
4683 
4684 
4685 abort_with_res:
4686 	for (i = 0; i < sc->num_slices; i++) {
4687 		rid = i + 1;
4688 		if (sc->msix_irq_res[i] != NULL)
4689 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4690 					     sc->msix_irq_res[i]);
4691 		sc->msix_irq_res[i] = NULL;
4692 	}
4693 	free(sc->msix_irq_res, M_DEVBUF);
4694 
4695 
4696 abort_with_msix:
4697 	pci_release_msi(sc->dev);
4698 
4699 abort_with_msix_table:
4700 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4701 			     sc->msix_table_res);
4702 
4703 	return err;
4704 }
4705 
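/*
 * Single-interrupt fallback for the one-slice case: prefer MSI,
 * and drop back to a shared INTx line when MSI is unavailable.
 */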
4706 static int
4707 mxge_add_single_irq(mxge_softc_t *sc)
4708 {
4709 	int count, err, rid;
4710 
4711 	count = pci_msi_count(sc->dev);
4712 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4713 		rid = 1;
4714 	} else {
4715 		rid = 0;
4716 		sc->legacy_irq = 1;
4717 	}
4718 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4719 					     RF_SHAREABLE | RF_ACTIVE);
4720 	if (sc->irq_res == NULL) {
4721 		device_printf(sc->dev, "could not alloc interrupt\n");
4722 		return ENXIO;
4723 	}
4724 	if (mxge_verbose)
4725 		device_printf(sc->dev, "using %s irq %jd\n",
4726 			      sc->legacy_irq ? "INTx" : "MSI",
4727 			      rman_get_start(sc->irq_res));
4728 	err = bus_setup_intr(sc->dev, sc->irq_res,
4729 			     INTR_TYPE_NET | INTR_MPSAFE,
4730 #if __FreeBSD_version > 700030
4731 			     NULL,
4732 #endif
4733 			     mxge_intr, &sc->ss[0], &sc->ih);
4734 	if (err != 0) {
4735 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4736 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4737 		if (!sc->legacy_irq)
4738 			pci_release_msi(sc->dev);
4739 	}
4740 	return err;
4741 }
4742 
4743 static void
4744 mxge_rem_msix_irqs(mxge_softc_t *sc)
4745 {
4746 	int i, rid;
4747 
4748 	for (i = 0; i < sc->num_slices; i++) {
4749 		if (sc->msix_ih[i] != NULL) {
4750 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4751 					  sc->msix_ih[i]);
4752 			sc->msix_ih[i] = NULL;
4753 		}
4754 	}
4755 	free(sc->msix_ih, M_DEVBUF);
4756 
4757 	for (i = 0; i < sc->num_slices; i++) {
4758 		rid = i + 1;
4759 		if (sc->msix_irq_res[i] != NULL)
4760 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4761 					     sc->msix_irq_res[i]);
4762 		sc->msix_irq_res[i] = NULL;
4763 	}
4764 	free(sc->msix_irq_res, M_DEVBUF);
4765 
4766 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4767 			     sc->msix_table_res);
4768 
4769 	pci_release_msi(sc->dev);
4770 	return;
4771 }
4772 
4773 static void
4774 mxge_rem_single_irq(mxge_softc_t *sc)
4775 {
4776 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4777 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4778 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4779 	if (!sc->legacy_irq)
4780 		pci_release_msi(sc->dev);
4781 }
4782 
4783 static void
4784 mxge_rem_irq(mxge_softc_t *sc)
4785 {
4786 	if (sc->num_slices > 1)
4787 		mxge_rem_msix_irqs(sc);
4788 	else
4789 		mxge_rem_single_irq(sc);
4790 }
4791 
4792 static int
4793 mxge_add_irq(mxge_softc_t *sc)
4794 {
4795 	int err;
4796 
4797 	if (sc->num_slices > 1)
4798 		err = mxge_add_msix_irqs(sc);
4799 	else
4800 		err = mxge_add_single_irq(sc);
4801 
4802 	if (0 && err == 0 && sc->num_slices > 1) {
4803 		mxge_rem_msix_irqs(sc);
4804 		err = mxge_add_msix_irqs(sc);
4805 	}
4806 	return err;
4807 }
4808 
4809 
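/*
 * Newbus attach: map the board, parse the EEPROM strings, select
 * and load the firmware, size the slices, allocate rings and
 * interrupts, and attach the ifnet with whatever capabilities
 * the firmware supports.
 */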
4810 static int
4811 mxge_attach(device_t dev)
4812 {
4813 	mxge_cmd_t cmd;
4814 	mxge_softc_t *sc = device_get_softc(dev);
4815 	struct ifnet *ifp;
4816 	int err, rid;
4817 
4818 	sc->dev = dev;
4819 	mxge_fetch_tunables(sc);
4820 
4821 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4822 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4823 				  taskqueue_thread_enqueue, &sc->tq);
4824 	if (sc->tq == NULL) {
4825 		err = ENOMEM;
4826 		goto abort_with_nothing;
4827 	}
4828 
4829 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4830 				 1,			/* alignment */
4831 				 0,			/* boundary */
4832 				 BUS_SPACE_MAXADDR,	/* low */
4833 				 BUS_SPACE_MAXADDR,	/* high */
4834 				 NULL, NULL,		/* filter */
4835 				 65536 + 256,		/* maxsize */
4836 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4837 				 65536,			/* maxsegsize */
4838 				 0,			/* flags */
4839 				 NULL, NULL,		/* lock */
4840 				 &sc->parent_dmat);	/* tag */
4841 
4842 	if (err != 0) {
4843 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4844 			      err);
4845 		goto abort_with_tq;
4846 	}
4847 
4848 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4849 	if (ifp == NULL) {
4850 		device_printf(dev, "can not if_alloc()\n");
4851 		err = ENOSPC;
4852 		goto abort_with_parent_dmat;
4853 	}
4854 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4855 
4856 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4857 		 device_get_nameunit(dev));
4858 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4859 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4860 		 "%s:drv", device_get_nameunit(dev));
4861 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4862 		 MTX_NETWORK_LOCK, MTX_DEF);
4863 
4864 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4865 
4866 	mxge_setup_cfg_space(sc);
4867 
4868 	/* Map the board into the kernel */
4869 	rid = PCIR_BARS;
4870 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4871 					     RF_ACTIVE);
4872 	if (sc->mem_res == NULL) {
4873 		device_printf(dev, "could not map memory\n");
4874 		err = ENXIO;
4875 		goto abort_with_lock;
4876 	}
4877 	sc->sram = rman_get_virtual(sc->mem_res);
4878 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4879 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4880 		device_printf(dev, "impossible memory region size %jd\n",
4881 			      rman_get_size(sc->mem_res));
4882 		err = ENXIO;
4883 		goto abort_with_mem_res;
4884 	}
4885 
4886 	/* make a NULL-terminated copy of the EEPROM strings section of
4887 	   the lanai SRAM */
4888 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4889 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4890 				rman_get_bushandle(sc->mem_res),
4891 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4892 				sc->eeprom_strings,
4893 				MXGE_EEPROM_STRINGS_SIZE - 2);
4894 	err = mxge_parse_strings(sc);
4895 	if (err != 0)
4896 		goto abort_with_mem_res;
4897 
4898 	/* Enable write combining for efficient use of PCIe bus */
4899 	mxge_enable_wc(sc);
4900 
4901 	/* Allocate the out of band dma memory */
4902 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4903 			     sizeof (mxge_cmd_t), 64);
4904 	if (err != 0)
4905 		goto abort_with_mem_res;
4906 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4907 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4908 	if (err != 0)
4909 		goto abort_with_cmd_dma;
4910 
4911 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4912 	if (err != 0)
4913 		goto abort_with_zeropad_dma;
4914 
4915 	/* select & load the firmware */
4916 	err = mxge_select_firmware(sc);
4917 	if (err != 0)
4918 		goto abort_with_dmabench;
4919 	sc->intr_coal_delay = mxge_intr_coal_delay;
4920 
4921 	mxge_slice_probe(sc);
4922 	err = mxge_alloc_slices(sc);
4923 	if (err != 0)
4924 		goto abort_with_dmabench;
4925 
4926 	err = mxge_reset(sc, 0);
4927 	if (err != 0)
4928 		goto abort_with_slices;
4929 
4930 	err = mxge_alloc_rings(sc);
4931 	if (err != 0) {
4932 		device_printf(sc->dev, "failed to allocate rings\n");
4933 		goto abort_with_slices;
4934 	}
4935 
4936 	err = mxge_add_irq(sc);
4937 	if (err != 0) {
4938 		device_printf(sc->dev, "failed to add irq\n");
4939 		goto abort_with_rings;
4940 	}
4941 
4942 	ifp->if_baudrate = IF_Gbps(10);
4943 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4944 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4945 		IFCAP_RXCSUM_IPV6;
4946 #if defined(INET) || defined(INET6)
4947 	ifp->if_capabilities |= IFCAP_LRO;
4948 #endif
4949 
4950 #ifdef MXGE_NEW_VLAN_API
4951 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4952 
4953 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4954 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4955 	    sc->fw_ver_tiny >= 32)
4956 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4957 #endif
4958 	sc->max_mtu = mxge_max_mtu(sc);
4959 	if (sc->max_mtu >= 9000)
4960 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4961 	else
4962 		device_printf(dev, "MTU limited to %d.  Install "
4963 			      "latest firmware for 9000 byte jumbo support\n",
4964 			      sc->max_mtu - ETHER_HDR_LEN);
4965 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4966 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4967 	/* check to see if f/w supports TSO for IPv6 */
4968 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4969 		if (CSUM_TCP_IPV6)
4970 			ifp->if_capabilities |= IFCAP_TSO6;
4971 		sc->max_tso6_hlen = min(cmd.data0,
4972 					sizeof (sc->ss[0].scratch));
4973 	}
4974 	ifp->if_capenable = ifp->if_capabilities;
4975 	if (sc->lro_cnt == 0)
4976 		ifp->if_capenable &= ~IFCAP_LRO;
4977 	ifp->if_init = mxge_init;
4978 	ifp->if_softc = sc;
4979 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4980 	ifp->if_ioctl = mxge_ioctl;
4981 	ifp->if_start = mxge_start;
4982 	ifp->if_get_counter = mxge_get_counter;
4983 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4984 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4985 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4986 	/* Initialise the ifmedia structure */
4987 	ifmedia_init(&sc->media, 0, mxge_media_change,
4988 		     mxge_media_status);
4989 	mxge_media_init(sc);
4990 	mxge_media_probe(sc);
4991 	sc->dying = 0;
4992 	ether_ifattach(ifp, sc->mac_addr);
4993 	/* ether_ifattach sets mtu to ETHERMTU */
4994 	if (mxge_initial_mtu != ETHERMTU)
4995 		mxge_change_mtu(sc, mxge_initial_mtu);
4996 
4997 	mxge_add_sysctls(sc);
4998 #ifdef IFNET_BUF_RING
4999 	ifp->if_transmit = mxge_transmit;
5000 	ifp->if_qflush = mxge_qflush;
5001 #endif
5002 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
5003 				device_get_nameunit(sc->dev));
5004 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
5005 	return 0;
5006 
5007 abort_with_rings:
5008 	mxge_free_rings(sc);
5009 abort_with_slices:
5010 	mxge_free_slices(sc);
5011 abort_with_dmabench:
5012 	mxge_dma_free(&sc->dmabench_dma);
5013 abort_with_zeropad_dma:
5014 	mxge_dma_free(&sc->zeropad_dma);
5015 abort_with_cmd_dma:
5016 	mxge_dma_free(&sc->cmd_dma);
5017 abort_with_mem_res:
5018 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5019 abort_with_lock:
5020 	pci_disable_busmaster(dev);
5021 	mtx_destroy(&sc->cmd_mtx);
5022 	mtx_destroy(&sc->driver_mtx);
5023 	if_free(ifp);
5024 abort_with_parent_dmat:
5025 	bus_dma_tag_destroy(sc->parent_dmat);
5026 abort_with_tq:
5027 	if (sc->tq != NULL) {
5028 		taskqueue_drain(sc->tq, &sc->watchdog_task);
5029 		taskqueue_free(sc->tq);
5030 		sc->tq = NULL;
5031 	}
5032 abort_with_nothing:
5033 	return err;
5034 }
5035 
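/*
 * Newbus detach: refuse while vlans are attached, then tear
 * everything down in roughly the reverse order of mxge_attach().
 */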
5036 static int
5037 mxge_detach(device_t dev)
5038 {
5039 	mxge_softc_t *sc = device_get_softc(dev);
5040 
5041 	if (mxge_vlans_active(sc)) {
5042 		device_printf(sc->dev,
5043 			      "Detach vlans before removing module\n");
5044 		return EBUSY;
5045 	}
5046 	mtx_lock(&sc->driver_mtx);
5047 	sc->dying = 1;
5048 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
5049 		mxge_close(sc, 0);
5050 	mtx_unlock(&sc->driver_mtx);
5051 	ether_ifdetach(sc->ifp);
5052 	if (sc->tq != NULL) {
5053 		taskqueue_drain(sc->tq, &sc->watchdog_task);
5054 		taskqueue_free(sc->tq);
5055 		sc->tq = NULL;
5056 	}
5057 	callout_drain(&sc->co_hdl);
5058 	ifmedia_removeall(&sc->media);
5059 	mxge_dummy_rdma(sc, 0);
5060 	mxge_rem_sysctls(sc);
5061 	mxge_rem_irq(sc);
5062 	mxge_free_rings(sc);
5063 	mxge_free_slices(sc);
5064 	mxge_dma_free(&sc->dmabench_dma);
5065 	mxge_dma_free(&sc->zeropad_dma);
5066 	mxge_dma_free(&sc->cmd_dma);
5067 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5068 	pci_disable_busmaster(dev);
5069 	mtx_destroy(&sc->cmd_mtx);
5070 	mtx_destroy(&sc->driver_mtx);
5071 	if_free(sc->ifp);
5072 	bus_dma_tag_destroy(sc->parent_dmat);
5073 	return 0;
5074 }
5075 
5076 static int
5077 mxge_shutdown(device_t dev)
5078 {
5079 	return 0;
5080 }
5081 
5082 /*
5083   This file uses Myri10GE driver indentation.
5084 
5085   Local Variables:
5086   c-file-style:"linux"
5087   tab-width:8
5088   End:
5089 */
5090