xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 22cf89c938886d14f5796fc49f9f020c23ea8eaf)
/******************************************************************************
SPDX-License-Identifier: BSD-2-Clause

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <contrib/zlib/zlib.h>
#include <dev/zlib/zcalloc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#include <sys/buf_ring.h>

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
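/*
 * Firmware image names: the "eth" images assume aligned PCIe
 * completions, the "ethp" images work around unaligned completions,
 * and the "rss" variants support multiple slices (see the firmware
 * selection comments further below).
 */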
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

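	/*
	 * Large, page-aligned allocations (e.g. rings) may cross 4KB
	 * boundaries as a single segment; smaller allocations are
	 * confined to one 4KB page so they never straddle a boundary.
	 */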
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
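/*
 * For example (hypothetical contents, for illustration only), a string
 * block of "MAC=00:60:dd:47:ab:cd\0SN=123456\0PC=10G-PCIE-8AL\0\0"
 * would be parsed below into mac_addr 00:60:dd:47:ab:cd, serial number
 * "123456" and product code "10G-PCIE-8AL".
 */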

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented out because it is believed that doing
	   a config read/write beyond 0xff will access the config space
	   of the next function.  Uncomment this and remove the hacky
	   pmap_mapdev() way of accessing config space when FreeBSD
	   grows support for extended pcie config space access.
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, which means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);
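	/* This matches the standard PCIe ECAM layout: 1MB of config
	   space per bus and 4KB per (slot, function) pair. */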

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev(va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev(va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev(va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
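	/*
	 * Illustrative (made-up) numbers: with len = 4096 and a result
	 * of cmd.data0 = (100 << 16) | 400, 100 transfers of 4KB took
	 * 400 ticks (200us), so the rate computed below is
	 * 100 * 4096 * 2 / 400 = 2048 MB/s; the factor of 2 converts
	 * 0.5us ticks into microseconds.
	 */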

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
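/*
 * In short: aligned completions (verified or forced) select
 * mxge_fw_aligned with tx_boundary = 4096; anything else selects
 * mxge_fw_unaligned with tx_boundary = 2048.
 */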

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up-to-date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = zcalloc_nowait;
	zs.zfree = zcfree;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL)
		goto abort_with_zs;
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		(void)*sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

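/*
 * Issue a command to the firmware: the 8-byte-aligned command block is
 * PIO-copied into the MXGEFW_ETH_CMD window, and the firmware DMAs its
 * response into sc->cmd, which is polled for up to 20ms.
 */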
static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

struct mxge_add_maddr_ctx {
	mxge_softc_t *sc;
	int error;
};

static u_int
mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
{
	struct mxge_add_maddr_ctx *ctx = arg;
	mxge_cmd_t cmd;

	if (ctx->error != 0)
		return (0);
	bcopy(LLADDR(sdl), &cmd.data0, 4);
	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
	cmd.data0 = htonl(cmd.data0);
	cmd.data1 = htonl(cmd.data1);

	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);

	return (1);
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	struct mxge_add_maddr_ctx ctx;
	if_t ifp = sc->ifp;
	mxge_cmd_t cmd;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (if_getflags(ifp) & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */
	ctx.sc = sc;
	ctx.error = 0;
	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
	if (ctx.error != 0) {
		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
		    "error status: %d\n", ctx.error);
		/* abort, leaving multicast filtering off */
		return;
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, if_getflags(sc->ifp) & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

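/*
 * Sysctl handler for read-only counters kept in network byte order:
 * the byte-swapped value is passed via arg2 with a NULL arg1, so
 * sysctl_handle_int() exports it as a constant.
 */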
1389 static int
1390 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1391 {
1392 	int err;
1393 
1394 	if (arg1 == NULL)
1395 		return EFAULT;
1396 	arg2 = be32toh(*(int *)arg1);
1397 	arg1 = NULL;
1398 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1399 
1400 	return err;
1401 }
1402 
1403 static void
1404 mxge_rem_sysctls(mxge_softc_t *sc)
1405 {
1406 	struct mxge_slice_state *ss;
1407 	int slice;
1408 
1409 	if (sc->slice_sysctl_tree == NULL)
1410 		return;
1411 
1412 	for (slice = 0; slice < sc->num_slices; slice++) {
1413 		ss = &sc->ss[slice];
1414 		if (ss == NULL || ss->sysctl_tree == NULL)
1415 			continue;
1416 		sysctl_ctx_free(&ss->sysctl_ctx);
1417 		ss->sysctl_tree = NULL;
1418 	}
1419 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1420 	sc->slice_sysctl_tree = NULL;
1421 }
1422 
1423 static void
1424 mxge_add_sysctls(mxge_softc_t *sc)
1425 {
1426 	struct sysctl_ctx_list *ctx;
1427 	struct sysctl_oid_list *children;
1428 	mcp_irq_data_t *fw;
1429 	struct mxge_slice_state *ss;
1430 	int slice;
1431 	char slice_num[8];
1432 
1433 	ctx = device_get_sysctl_ctx(sc->dev);
1434 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1435 	fw = sc->ss[0].fw_stats;
1436 
1437 	/* random information */
1438 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1439 		       "firmware_version",
1440 		       CTLFLAG_RD, sc->fw_version,
1441 		       0, "firmware version");
1442 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1443 		       "serial_number",
1444 		       CTLFLAG_RD, sc->serial_number_string,
1445 		       0, "serial number");
1446 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1447 		       "product_code",
1448 		       CTLFLAG_RD, sc->product_code_string,
1449 		       0, "product_code");
1450 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1451 		       "pcie_link_width",
1452 		       CTLFLAG_RD, &sc->link_width,
1453 		       0, "tx_boundary");
1454 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1455 		       "tx_boundary",
1456 		       CTLFLAG_RD, &sc->tx_boundary,
1457 		       0, "tx_boundary");
1458 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1459 		       "write_combine",
1460 		       CTLFLAG_RD, &sc->wc,
1461 		       0, "write combining PIO?");
1462 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1463 		       "read_dma_MBs",
1464 		       CTLFLAG_RD, &sc->read_dma,
1465 		       0, "DMA Read speed in MB/s");
1466 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1467 		       "write_dma_MBs",
1468 		       CTLFLAG_RD, &sc->write_dma,
1469 		       0, "DMA Write speed in MB/s");
1470 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1471 		       "read_write_dma_MBs",
1472 		       CTLFLAG_RD, &sc->read_write_dma,
1473 		       0, "DMA concurrent Read/Write speed in MB/s");
1474 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1475 		       "watchdog_resets",
1476 		       CTLFLAG_RD, &sc->watchdog_resets,
1477 		       0, "Number of times NIC was reset");
1478 
1479 	/* performance related tunables */
1480 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1481 	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1482 	    sc, 0, mxge_change_intr_coal, "I",
1483 	    "interrupt coalescing delay in usecs");
1484 
1485 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1486 	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1487 	    mxge_change_throttle, "I", "transmit throttling");
1488 
1489 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1490 	    "flow_control_enabled",
1491 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1492 	    mxge_change_flow_control, "I",
1493 	    "interrupt coalescing delay in usecs");
1494 
1495 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1496 		       "deassert_wait",
1497 		       CTLFLAG_RW, &mxge_deassert_wait,
1498 		       0, "Wait for IRQ line to go low in ihandler");
1499 
1500 	/* stats block from firmware is in network byte order.
1501 	   Need to swap it */
1502 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1503 	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1504 	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
1505 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1506 	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1507 	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
1508 	    "rdma_tags_available");
1509 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1510 	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1511 	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
1512 	    "dropped_bad_crc32");
1513 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1514 	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1515 	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
1516 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1517 	    "dropped_link_error_or_filtered",
1518 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1519 	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
1520 	    "dropped_link_error_or_filtered");
1521 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1522 	    "dropped_link_overflow",
1523 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1524 	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
1525 	    "dropped_link_overflow");
1526 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1527 	    "dropped_multicast_filtered",
1528 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1529 	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
1530 	    "dropped_multicast_filtered");
1531 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1532 	    "dropped_no_big_buffer",
1533 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1534 	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
1535 	    "dropped_no_big_buffer");
1536 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1537 	    "dropped_no_small_buffer",
1538 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1539 	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
1540 	    "dropped_no_small_buffer");
1541 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1542 	    "dropped_overrun",
1543 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1544 	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
1545 	    "dropped_overrun");
1546 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1547 	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1548 	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
1549 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1550 	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1551 	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");
1552 
1553 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 	    "dropped_unicast_filtered",
1555 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1556 	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
1557 	    "dropped_unicast_filtered");
1558 
1559 	/* verbose printing? */
1560 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1561 		       "verbose",
1562 		       CTLFLAG_RW, &mxge_verbose,
1563 		       0, "verbose printing");
1564 
1565 	/* add counters exported for debugging from all slices */
1566 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1567 	sc->slice_sysctl_tree =
1568 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1569 		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1570 
1571 	for (slice = 0; slice < sc->num_slices; slice++) {
1572 		ss = &sc->ss[slice];
1573 		sysctl_ctx_init(&ss->sysctl_ctx);
1574 		ctx = &ss->sysctl_ctx;
1575 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1576 		sprintf(slice_num, "%d", slice);
1577 		ss->sysctl_tree =
1578 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1579 			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1580 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1581 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1582 			       "rx_small_cnt",
1583 			       CTLFLAG_RD, &ss->rx_small.cnt,
1584 			       0, "rx_small_cnt");
1585 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1586 			       "rx_big_cnt",
1587 			       CTLFLAG_RD, &ss->rx_big.cnt,
1588 			       0, "rx_small_cnt");
1589 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1590 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1591 			       0, "number of lro merge queues flushed");
1592 
1593 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1594 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1595 			       0, "number of bad csums preventing LRO");
1596 
1597 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1598 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1599 			       0, "number of frames appended to lro merge"
1600 			       "queues");
1601 
1602 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1603 			       "tx_req",
1604 			       CTLFLAG_RD, &ss->tx.req,
1605 			       0, "tx_req");
1606 
1607 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1608 			       "tx_done",
1609 			       CTLFLAG_RD, &ss->tx.done,
1610 			       0, "tx_done");
1611 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612 			       "tx_pkt_done",
1613 			       CTLFLAG_RD, &ss->tx.pkt_done,
1614 			       0, "tx_done");
1615 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1616 			       "tx_stall",
1617 			       CTLFLAG_RD, &ss->tx.stall,
1618 			       0, "tx_stall");
1619 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1620 			       "tx_wake",
1621 			       CTLFLAG_RD, &ss->tx.wake,
1622 			       0, "tx_wake");
1623 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1624 			       "tx_defrag",
1625 			       CTLFLAG_RD, &ss->tx.defrag,
1626 			       0, "tx_defrag");
1627 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1628 			       "tx_queue_active",
1629 			       CTLFLAG_RD, &ss->tx.queue_active,
1630 			       0, "tx_queue_active");
1631 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632 			       "tx_activate",
1633 			       CTLFLAG_RD, &ss->tx.activate,
1634 			       0, "tx_activate");
1635 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 			       "tx_deactivate",
1637 			       CTLFLAG_RD, &ss->tx.deactivate,
1638 			       0, "tx_deactivate");
1639 	}
1640 }
1641 
1642 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1643    backwards one at a time and handle ring wraps */
1644 
1645 static inline void
1646 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1647 			    mcp_kreq_ether_send_t *src, int cnt)
1648 {
1649 	int idx, starting_slot;
1650 	starting_slot = tx->req;
1651 	while (cnt > 1) {
1652 		cnt--;
1653 		idx = (starting_slot + cnt) & tx->mask;
1654 		mxge_pio_copy(&tx->lanai[idx],
1655 			      &src[cnt], sizeof(*src));
1656 		wmb();
1657 	}
1658 }
1659 
1660 /*
1661  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1662  * at most 32 bytes at a time, so as to avoid involving the software
1663  * pio handler in the nic.   We re-write the first segment's flags
1664  * to mark them valid only after writing the entire chain
1665  */
1666 
1667 static inline void
1668 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1669 		  int cnt)
1670 {
1671 	int idx, i;
1672 	uint32_t *src_ints;
1673 	volatile uint32_t *dst_ints;
1674 	mcp_kreq_ether_send_t *srcp;
1675 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1676 	uint8_t last_flags;
1677 
1678 	idx = tx->req & tx->mask;
1679 
1680 	last_flags = src->flags;
1681 	src->flags = 0;
1682 	wmb();
1683 	dst = dstp = &tx->lanai[idx];
1684 	srcp = src;
1685 
1686 	if ((idx + cnt) < tx->mask) {
1687 		for (i = 0; i < (cnt - 1); i += 2) {
1688 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1689 			wmb(); /* force write every 32 bytes */
1690 			srcp += 2;
1691 			dstp += 2;
1692 		}
1693 	} else {
1694 		/* submit all but the first request, and ensure
1695 		   that it is submitted below */
1696 		mxge_submit_req_backwards(tx, src, cnt);
1697 		i = 0;
1698 	}
1699 	if (i < cnt) {
1700 		/* submit the first request */
1701 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1702 		wmb(); /* barrier before setting valid flag */
1703 	}
1704 
1705 	/* re-write the last 32-bits with the valid flags */
1706 	src->flags = last_flags;
1707 	src_ints = (uint32_t *)src;
1708 	src_ints+=3;
1709 	dst_ints = (volatile uint32_t *)dst;
1710 	dst_ints+=3;
1711 	*dst_ints =  *src_ints;
1712 	tx->req += cnt;
1713 	wmb();
1714 }
1715 
1716 static int
1717 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1718     struct mxge_pkt_info *pi)
1719 {
1720 	struct ether_vlan_header *eh;
1721 	uint16_t etype;
1722 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1723 #if IFCAP_TSO6 && defined(INET6)
1724 	int nxt;
1725 #endif
1726 
1727 	eh = mtod(m, struct ether_vlan_header *);
1728 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1729 		etype = ntohs(eh->evl_proto);
1730 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1731 	} else {
1732 		etype = ntohs(eh->evl_encap_proto);
1733 		pi->ip_off = ETHER_HDR_LEN;
1734 	}
1735 
1736 	switch (etype) {
1737 	case ETHERTYPE_IP:
1738 		/*
1739 		 * ensure ip header is in first mbuf, copy it to a
1740 		 * scratch buffer if not
1741 		 */
1742 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1743 		pi->ip6 = NULL;
1744 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1745 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1746 			    ss->scratch);
1747 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1748 		}
1749 		pi->ip_hlen = pi->ip->ip_hl << 2;
1750 		if (!tso)
1751 			return 0;
1752 
1753 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1754 		    sizeof(struct tcphdr))) {
1755 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1756 			    sizeof(struct tcphdr), ss->scratch);
1757 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1758 		}
1759 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1760 		break;
1761 #if IFCAP_TSO6 && defined(INET6)
1762 	case ETHERTYPE_IPV6:
1763 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1764 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1765 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1766 			    ss->scratch);
1767 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1768 		}
1769 		nxt = 0;
1770 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1771 		pi->ip_hlen -= pi->ip_off;
1772 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1773 			return EINVAL;
1774 
1775 		if (!tso)
1776 			return 0;
1777 
1778 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1779 			return EINVAL;
1780 
1781 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1782 		    sizeof(struct tcphdr))) {
1783 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1784 			    sizeof(struct tcphdr), ss->scratch);
1785 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1786 		}
1787 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1788 		break;
1789 #endif
1790 	default:
1791 		return EINVAL;
1792 	}
1793 	return 0;
1794 }
1795 
1796 #if IFCAP_TSO4
1797 
1798 static void
1799 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1800 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1801 {
1802 	mxge_tx_ring_t *tx;
1803 	mcp_kreq_ether_send_t *req;
1804 	bus_dma_segment_t *seg;
1805 	uint32_t low, high_swapped;
1806 	int len, seglen, cum_len, cum_len_next;
1807 	int next_is_first, chop, cnt, rdma_count, small;
1808 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1809 	uint8_t flags, flags_next;
1810 	static int once;
1811 
1812 	mss = m->m_pkthdr.tso_segsz;
1813 
1814 	/* negative cum_len signifies to the
1815 	 * send loop that we are still in the
1816 	 * header portion of the TSO packet.
1817 	 */
1818 
1819 	cksum_offset = pi->ip_off + pi->ip_hlen;
1820 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1821 
1822 	/* TSO implies checksum offload on this hardware */
1823 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1824 		/*
1825 		 * If packet has full TCP csum, replace it with pseudo hdr
1826 		 * sum that the NIC expects, otherwise the NIC will emit
1827 		 * packets with bad TCP checksums.
1828 		 */
1829 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1830 		if (pi->ip6) {
1831 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1832 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1833 			sum = in6_cksum_pseudo(pi->ip6,
1834 			    m->m_pkthdr.len - cksum_offset,
1835 			    IPPROTO_TCP, 0);
1836 #endif
1837 		} else {
1838 #ifdef INET
1839 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1840 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1841 			    pi->ip->ip_dst.s_addr,
1842 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1843 				    cksum_offset)));
1844 #endif
1845 		}
1846 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1847 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1848 	}
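	/*
	 * Worked example (IPv4, illustrative): for m_pkthdr.len = 1514
	 * and cksum_offset = 34, the pseudo header sum above covers the
	 * source and destination addresses plus
	 * htons(IPPROTO_TCP + 1480), i.e. the protocol and the TCP
	 * segment length, which is what the NIC folds into th_sum.
	 */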
1849 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1850 
1851 	/* for TSO, pseudo_hdr_offset holds mss.
1852 	 * The firmware figures out where to put
1853 	 * the checksum by parsing the header. */
1854 	pseudo_hdr_offset = htobe16(mss);
1855 
1856 	if (pi->ip6) {
1857 		/*
1858 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1859 		 * to store the TCP header len
1860 		 */
1861 		cksum_offset = (pi->tcp->th_off << 2);
1862 	}
1863 
1864 	tx = &ss->tx;
1865 	req = tx->req_list;
1866 	seg = tx->seg_list;
1867 	cnt = 0;
1868 	rdma_count = 0;
1869 	/* "rdma_count" is the number of RDMAs belonging to the
1870 	 * current packet BEFORE the current send request. For
1871 	 * non-TSO packets, this is equal to "count".
1872 	 * For TSO packets, rdma_count needs to be reset
1873 	 * to 0 after a segment cut.
1874 	 *
1875 	 * The rdma_count field of the send request is
1876 	 * the number of RDMAs of the packet starting at
1877 	 * that request. For TSO send requests with one or more cuts
1878 	 * in the middle, this is the number of RDMAs starting
1879 	 * after the last cut in the request. All previous
1880 	 * segments before the last cut implicitly have 1 RDMA.
1881 	 *
1882 	 * Since the number of RDMAs is not known beforehand,
1883 	 * it must be filled-in retroactively - after each
1884 	 * segmentation cut or at the end of the entire packet.
1885 	 */
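	/* Illustrative trace (not from the driver): with mss = 1448 and
	 * a 66-byte header, cum_len starts at -66, so the first 66
	 * bytes are emitted as header descriptors.  Once cum_len turns
	 * non-negative it counts payload, and each time cum_len_next
	 * crosses a multiple of mss the descriptor is marked
	 * MXGEFW_FLAGS_TSO_CHOP and rdma_count restarts for the next
	 * TSO segment. */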
1886 
1887 	while (busdma_seg_cnt) {
1888 		/* Break the busdma segment up into pieces*/
1889 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1890 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1891 		len = seg->ds_len;
1892 
1893 		while (len) {
1894 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1895 			seglen = len;
1896 			cum_len_next = cum_len + seglen;
1897 			(req-rdma_count)->rdma_count = rdma_count + 1;
1898 			if (__predict_true(cum_len >= 0)) {
1899 				/* payload */
1900 				chop = (cum_len_next > mss);
1901 				cum_len_next = cum_len_next % mss;
1902 				next_is_first = (cum_len_next == 0);
1903 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1904 				flags_next |= next_is_first *
1905 					MXGEFW_FLAGS_FIRST;
1906 				rdma_count |= -(chop | next_is_first);
1907 				rdma_count += chop & !next_is_first;
1908 			} else if (cum_len_next >= 0) {
1909 				/* header ends */
1910 				rdma_count = -1;
1911 				cum_len_next = 0;
1912 				seglen = -cum_len;
1913 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1914 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1915 					MXGEFW_FLAGS_FIRST |
1916 					(small * MXGEFW_FLAGS_SMALL);
1917 			}
1918 
1919 			req->addr_high = high_swapped;
1920 			req->addr_low = htobe32(low);
1921 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1922 			req->pad = 0;
1923 			req->rdma_count = 1;
1924 			req->length = htobe16(seglen);
1925 			req->cksum_offset = cksum_offset;
1926 			req->flags = flags | ((cum_len & 1) *
1927 					      MXGEFW_FLAGS_ALIGN_ODD);
1928 			low += seglen;
1929 			len -= seglen;
1930 			cum_len = cum_len_next;
1931 			flags = flags_next;
1932 			req++;
1933 			cnt++;
1934 			rdma_count++;
1935 			if (cksum_offset != 0 && !pi->ip6) {
1936 				if (__predict_false(cksum_offset > seglen))
1937 					cksum_offset -= seglen;
1938 				else
1939 					cksum_offset = 0;
1940 			}
1941 			if (__predict_false(cnt > tx->max_desc))
1942 				goto drop;
1943 		}
1944 		busdma_seg_cnt--;
1945 		seg++;
1946 	}
1947 	(req-rdma_count)->rdma_count = rdma_count;
1948 
1949 	do {
1950 		req--;
1951 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1952 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1953 
1954 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1955 	mxge_submit_req(tx, tx->req_list, cnt);
1956 
1957 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1958 		/* tell the NIC to start polling this slice */
1959 		*tx->send_go = 1;
1960 		tx->queue_active = 1;
1961 		tx->activate++;
1962 		wmb();
1963 	}
1964 
1965 	return;
1966 
1967 drop:
1968 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1969 	m_freem(m);
1970 	ss->oerrors++;
1971 	if (!once) {
1972 		printf("tx->max_desc exceeded via TSO!\n");
1973 		printf("mss = %d, seg offset = %ld, max_desc = %d!\n", mss,
1974 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1975 		once = 1;
1976 	}
1977 	return;
1978 
1979 }
1980 
1981 #endif /* IFCAP_TSO4 */
1982 
1983 #ifdef MXGE_NEW_VLAN_API
1984 /*
1985  * We reproduce the software vlan tag insertion from
1986  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1987  * vlan tag insertion. We need to advertise this in order to have the
1988  * vlan interface respect our csum offload flags.
1989  */
1990 static struct mbuf *
1991 mxge_vlan_tag_insert(struct mbuf *m)
1992 {
1993 	struct ether_vlan_header *evl;
1994 
1995 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
1996 	if (__predict_false(m == NULL))
1997 		return NULL;
1998 	if (m->m_len < sizeof(*evl)) {
1999 		m = m_pullup(m, sizeof(*evl));
2000 		if (__predict_false(m == NULL))
2001 			return NULL;
2002 	}
2003 	/*
2004 	 * Transform the Ethernet header into an Ethernet header
2005 	 * with 802.1Q encapsulation.
2006 	 */
2007 	evl = mtod(m, struct ether_vlan_header *);
2008 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2009 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2010 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2011 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2012 	m->m_flags &= ~M_VLANTAG;
2013 	return m;
2014 }
2015 #endif /* MXGE_NEW_VLAN_API */
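/*
 * Example (illustrative): after mxge_vlan_tag_insert() on a frame with
 * ether_vtag = 5, the header begins
 *
 *	dhost[6] | shost[6] | 0x8100 | 0x0005 | original ethertype
 *
 * i.e. the 4-byte 802.1Q tag is spliced in after the MAC addresses,
 * just as net/if_vlan.c would do in software.
 */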
2016 
2017 static void
2018 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2019 {
2020 	struct mxge_pkt_info pi = {0,0,0,0};
2021 	mxge_softc_t *sc;
2022 	mcp_kreq_ether_send_t *req;
2023 	bus_dma_segment_t *seg;
2024 	struct mbuf *m_tmp;
2025 	mxge_tx_ring_t *tx;
2026 	int cnt, cum_len, err, i, idx, odd_flag;
2027 	uint16_t pseudo_hdr_offset;
2028 	uint8_t flags, cksum_offset;
2029 
2030 	sc = ss->sc;
2031 	tx = &ss->tx;
2032 
2033 #ifdef MXGE_NEW_VLAN_API
2034 	if (m->m_flags & M_VLANTAG) {
2035 		m = mxge_vlan_tag_insert(m);
2036 		if (__predict_false(m == NULL))
2037 			goto drop_without_m;
2038 	}
2039 #endif
2040 	if (m->m_pkthdr.csum_flags &
2041 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2042 		if (mxge_parse_tx(ss, m, &pi))
2043 			goto drop;
2044 	}
2045 
2046 	/* (try to) map the frame for DMA */
2047 	idx = tx->req & tx->mask;
2048 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2049 				      m, tx->seg_list, &cnt,
2050 				      BUS_DMA_NOWAIT);
2051 	if (__predict_false(err == EFBIG)) {
2052 		/* Too many segments in the chain.  Try
2053 		   to defrag */
2054 		m_tmp = m_defrag(m, M_NOWAIT);
2055 		if (m_tmp == NULL) {
2056 			goto drop;
2057 		}
2058 		ss->tx.defrag++;
2059 		m = m_tmp;
2060 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2061 					      tx->info[idx].map,
2062 					      m, tx->seg_list, &cnt,
2063 					      BUS_DMA_NOWAIT);
2064 	}
2065 	if (__predict_false(err != 0)) {
2066 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d,"
2067 			      " packet len = %d\n", err, m->m_pkthdr.len);
2068 		goto drop;
2069 	}
2070 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2071 			BUS_DMASYNC_PREWRITE);
2072 	tx->info[idx].m = m;
2073 
2074 #if IFCAP_TSO4
2075 	/* TSO is different enough, we handle it in another routine */
2076 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2077 		mxge_encap_tso(ss, m, cnt, &pi);
2078 		return;
2079 	}
2080 #endif
2081 
2082 	req = tx->req_list;
2083 	cksum_offset = 0;
2084 	pseudo_hdr_offset = 0;
2085 	flags = MXGEFW_FLAGS_NO_TSO;
2086 
2087 	/* checksum offloading? */
2088 	if (m->m_pkthdr.csum_flags &
2089 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2090 		/* the checksum offsets were already found by
2091 		   mxge_parse_tx() above */
2092 		cksum_offset = pi.ip_off + pi.ip_hlen;
2093 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2094 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2095 		req->cksum_offset = cksum_offset;
2096 		flags |= MXGEFW_FLAGS_CKSUM;
2097 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2098 	} else {
2099 		odd_flag = 0;
2100 	}
2101 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2102 		flags |= MXGEFW_FLAGS_SMALL;
2103 
2104 	/* convert segments into a request list */
2105 	cum_len = 0;
2106 	seg = tx->seg_list;
2107 	req->flags = MXGEFW_FLAGS_FIRST;
2108 	for (i = 0; i < cnt; i++) {
2109 		req->addr_low =
2110 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2111 		req->addr_high =
2112 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2113 		req->length = htobe16(seg->ds_len);
2114 		req->cksum_offset = cksum_offset;
2115 		if (cksum_offset > seg->ds_len)
2116 			cksum_offset -= seg->ds_len;
2117 		else
2118 			cksum_offset = 0;
2119 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2120 		req->pad = 0; /* complete solid 16-byte block */
2121 		req->rdma_count = 1;
2122 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2123 		cum_len += seg->ds_len;
2124 		seg++;
2125 		req++;
2126 		req->flags = 0;
2127 	}
2128 	req--;
2129 	/* pad runts to 60 bytes */
2130 	if (cum_len < 60) {
2131 		req++;
2132 		req->addr_low =
2133 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2134 		req->addr_high =
2135 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2136 		req->length = htobe16(60 - cum_len);
2137 		req->cksum_offset = 0;
2138 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2139 		req->pad = 0; /* complete solid 16-byte block */
2140 		req->rdma_count = 1;
2141 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2142 		cnt++;
2143 	}
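	/*
	 * Example: a 42-byte ARP frame gets one extra descriptor of
	 * length 60 - 42 = 18 pointing at zeropad_dma, bringing the
	 * frame up to the 60-byte Ethernet minimum (FCS excluded).
	 */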
2144 
2145 	tx->req_list[0].rdma_count = cnt;
2146 #if 0
2147 	/* print what the firmware will see */
2148 	for (i = 0; i < cnt; i++) {
2149 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2150 		    "cso:%d, flags:0x%x, rdma:%d\n",
2151 		    i, (int)ntohl(tx->req_list[i].addr_high),
2152 		    (int)ntohl(tx->req_list[i].addr_low),
2153 		    (int)ntohs(tx->req_list[i].length),
2154 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2155 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2156 		    tx->req_list[i].rdma_count);
2157 	}
2158 	printf("--------------\n");
2159 #endif
2160 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2161 	mxge_submit_req(tx, tx->req_list, cnt);
2162 
2163 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2164 		/* tell the NIC to start polling this slice */
2165 		*tx->send_go = 1;
2166 		tx->queue_active = 1;
2167 		tx->activate++;
2168 		wmb();
2169 	}
2170 
2171 	return;
2172 
2173 drop:
2174 	m_freem(m);
2175 drop_without_m:
2176 	ss->oerrors++;
2177 	return;
2178 }
2179 
2180 static void
2181 mxge_qflush(if_t ifp)
2182 {
2183 	mxge_softc_t *sc = if_getsoftc(ifp);
2184 	mxge_tx_ring_t *tx;
2185 	struct mbuf *m;
2186 	int slice;
2187 
2188 	for (slice = 0; slice < sc->num_slices; slice++) {
2189 		tx = &sc->ss[slice].tx;
2190 		mtx_lock(&tx->mtx);
2191 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2192 			m_freem(m);
2193 		mtx_unlock(&tx->mtx);
2194 	}
2195 	if_qflush(ifp);
2196 }
2197 
2198 static inline void
2199 mxge_start_locked(struct mxge_slice_state *ss)
2200 {
2201 	mxge_softc_t *sc;
2202 	struct mbuf *m;
2203 	if_t ifp;
2204 	mxge_tx_ring_t *tx;
2205 
2206 	sc = ss->sc;
2207 	ifp = sc->ifp;
2208 	tx = &ss->tx;
2209 
2210 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2211 		m = drbr_dequeue(ifp, tx->br);
2212 		if (m == NULL) {
2213 			return;
2214 		}
2215 		/* let BPF see it */
2216 		BPF_MTAP(ifp, m);
2217 
2218 		/* give it to the nic */
2219 		mxge_encap(ss, m);
2220 	}
2221 	/* ran out of transmit slots */
2222 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2223 	    && (!drbr_empty(ifp, tx->br))) {
2224 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2225 		tx->stall++;
2226 	}
2227 }
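/*
 * Worked example of the ring-space test above (illustrative): with a
 * 1024-entry ring (mask = 1023), tx->req = 1500, tx->done = 900 and
 * tx->max_desc = 128, the free space is 1023 - (1500 - 900) = 423 >
 * 128, so another frame may be encapsulated.  req and done are
 * free-running counters, so the subtraction stays correct across
 * wrap-around.
 */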
2228 
2229 static int
2230 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2231 {
2232 	mxge_softc_t *sc;
2233 	if_t ifp;
2234 	mxge_tx_ring_t *tx;
2235 	int err;
2236 
2237 	sc = ss->sc;
2238 	ifp = sc->ifp;
2239 	tx = &ss->tx;
2240 
2241 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2242 	    IFF_DRV_RUNNING) {
2243 		err = drbr_enqueue(ifp, tx->br, m);
2244 		return (err);
2245 	}
2246 
2247 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2248 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2249 		/* let BPF see it */
2250 		BPF_MTAP(ifp, m);
2251 		/* give it to the nic */
2252 		mxge_encap(ss, m);
2253 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2254 		return (err);
2255 	}
2256 	if (!drbr_empty(ifp, tx->br))
2257 		mxge_start_locked(ss);
2258 	return (0);
2259 }
2260 
2261 static int
2262 mxge_transmit(if_t ifp, struct mbuf *m)
2263 {
2264 	mxge_softc_t *sc = if_getsoftc(ifp);
2265 	struct mxge_slice_state *ss;
2266 	mxge_tx_ring_t *tx;
2267 	int err = 0;
2268 	int slice;
2269 
2270 	slice = m->m_pkthdr.flowid;
2271 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2272 
2273 	ss = &sc->ss[slice];
2274 	tx = &ss->tx;
2275 
2276 	if (mtx_trylock(&tx->mtx)) {
2277 		err = mxge_transmit_locked(ss, m);
2278 		mtx_unlock(&tx->mtx);
2279 	} else {
2280 		err = drbr_enqueue(ifp, tx->br, m);
2281 	}
2282 
2283 	return (err);
2284 }
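/*
 * Example: with four slices, a flowid of 13 selects slice
 * 13 & (4 - 1) = 1; keeping num_slices a power of two turns the modulo
 * into a simple mask.
 */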
2285 
2286 static void
2287 mxge_start(if_t ifp)
2288 {
2289 	mxge_softc_t *sc = if_getsoftc(ifp);
2290 	struct mxge_slice_state *ss;
2291 
2292 	/* only use the first slice for now */
2293 	ss = &sc->ss[0];
2294 	mtx_lock(&ss->tx.mtx);
2295 	mxge_start_locked(ss);
2296 	mtx_unlock(&ss->tx.mtx);
2297 }
2298 
2299 /*
2300  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2301  * at most 32 bytes at a time, so as to avoid involving the software
2302  * pio handler in the nic.  We re-write the first segment's low
2303  * DMA address to mark it valid only after we write the entire chunk
2304  * in a burst.
2305  */
2306 static inline void
2307 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2308 		mcp_kreq_ether_recv_t *src)
2309 {
2310 	uint32_t low;
2311 
2312 	low = src->addr_low;
2313 	src->addr_low = 0xffffffff;
2314 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2315 	wmb();
2316 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2317 	wmb();
2318 	src->addr_low = low;
2319 	dst->addr_low = low;
2320 	wmb();
2321 }
2322 
2323 static int
2324 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2325 {
2326 	bus_dma_segment_t seg;
2327 	struct mbuf *m;
2328 	mxge_rx_ring_t *rx = &ss->rx_small;
2329 	int cnt, err;
2330 
2331 	m = m_gethdr(M_NOWAIT, MT_DATA);
2332 	if (m == NULL) {
2333 		rx->alloc_fail++;
2334 		err = ENOBUFS;
2335 		goto done;
2336 	}
2337 	m->m_len = MHLEN;
2338 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2339 				      &seg, &cnt, BUS_DMA_NOWAIT);
2340 	if (err != 0) {
2341 		m_free(m);
2342 		goto done;
2343 	}
2344 	rx->info[idx].m = m;
2345 	rx->shadow[idx].addr_low =
2346 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2347 	rx->shadow[idx].addr_high =
2348 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2349 
2350 done:
2351 	if ((idx & 7) == 7)
2352 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2353 	return err;
2354 }
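/*
 * Note the (idx & 7) == 7 test above: receive buffers are posted to the
 * NIC only in aligned groups of eight, so mxge_submit_8rx() can burst
 * all eight 8-byte descriptors in two 32-byte PIO copies.
 */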
2355 
2356 static int
2357 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2358 {
2359 	bus_dma_segment_t seg[3];
2360 	struct mbuf *m;
2361 	mxge_rx_ring_t *rx = &ss->rx_big;
2362 	int cnt, err, i;
2363 
2364 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2365 	if (m == NULL) {
2366 		rx->alloc_fail++;
2367 		err = ENOBUFS;
2368 		goto done;
2369 	}
2370 	m->m_len = rx->mlen;
2371 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2372 				      seg, &cnt, BUS_DMA_NOWAIT);
2373 	if (err != 0) {
2374 		m_free(m);
2375 		goto done;
2376 	}
2377 	rx->info[idx].m = m;
2378 	rx->shadow[idx].addr_low =
2379 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2380 	rx->shadow[idx].addr_high =
2381 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2382 
2383 done:
2384 	for (i = 0; i < rx->nbufs; i++) {
2385 		if ((idx & 7) == 7) {
2386 			mxge_submit_8rx(&rx->lanai[idx - 7],
2387 					&rx->shadow[idx - 7]);
2388 		}
2389 		idx++;
2390 	}
2391 	return err;
2392 }
2393 
2394 #ifdef INET6
2395 
2396 static uint16_t
2397 mxge_csum_generic(uint16_t *raw, int len)
2398 {
2399 	uint32_t csum;
2400 
2401 	csum = 0;
2402 	while (len > 0) {
2403 		csum += *raw;
2404 		raw++;
2405 		len -= 2;
2406 	}
2407 	csum = (csum >> 16) + (csum & 0xffff);
2408 	csum = (csum >> 16) + (csum & 0xffff);
2409 	return (uint16_t)csum;
2410 }
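/*
 * Example of the double fold above (illustrative): a raw 32-bit sum of
 * 0x3fffe folds once to 0x3 + 0xfffe = 0x10001, and again to
 * 0x1 + 0x0001 = 0x2; two folds are needed before the result is
 * guaranteed to fit in 16 bits.
 */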
2411 
2412 static inline uint16_t
2413 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2414 {
2415 	uint32_t partial;
2416 	int nxt, cksum_offset;
2417 	struct ip6_hdr *ip6 = p;
2418 	uint16_t c;
2419 
2420 	nxt = ip6->ip6_nxt;
2421 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2422 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2423 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2424 					   IPPROTO_IPV6, &nxt);
2425 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2426 			return (1);
2427 	}
2428 
2429 	/*
2430 	 * IPv6 headers do not contain a checksum, and hence
2431 	 * do not checksum to zero, so they don't "fall out"
2432 	 * of the partial checksum calculation like IPv4
2433 	 * headers do.  We need to fix the partial checksum by
2434 	 * subtracting the checksum of the IPv6 header.
2435 	 */
2436 
2437 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2438 				    ETHER_HDR_LEN);
2439 	csum += ~partial;
2440 	csum += (csum < ~partial);
2441 	csum = (csum >> 16) + (csum & 0xFFFF);
2442 	csum = (csum >> 16) + (csum & 0xFFFF);
2443 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2444 			     csum);
2445 	c ^= 0xffff;
2446 	return (c);
2447 }
2448 #endif /* INET6 */
2449 /*
2450  *  Myri10GE hardware checksums are not valid if the sender
2451  *  padded the frame with non-zero padding.  This is because
2452  *  the firmware just does a simple 16-bit 1s complement
2453  *  checksum across the entire frame, excluding the first 14
2454  *  bytes.  It is best to simply check the checksum and
2455  *  tell the stack about it only if the checksum is good.
2456  */
2457 
2458 static inline uint16_t
2459 mxge_rx_csum(struct mbuf *m, int csum)
2460 {
2461 	struct ether_header *eh;
2462 #ifdef INET
2463 	struct ip *ip;
2464 #endif
2465 #if defined(INET) || defined(INET6)
2466 	int cap = if_getcapenable(m->m_pkthdr.rcvif);
2467 #endif
2468 	uint16_t c, etype;
2469 
2470 	eh = mtod(m, struct ether_header *);
2471 	etype = ntohs(eh->ether_type);
2472 	switch (etype) {
2473 #ifdef INET
2474 	case ETHERTYPE_IP:
2475 		if ((cap & IFCAP_RXCSUM) == 0)
2476 			return (1);
2477 		ip = (struct ip *)(eh + 1);
2478 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2479 			return (1);
2480 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2481 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2482 				    (ip->ip_hl << 2) + ip->ip_p));
2483 		c ^= 0xffff;
2484 		break;
2485 #endif
2486 #ifdef INET6
2487 	case ETHERTYPE_IPV6:
2488 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2489 			return (1);
2490 		c = mxge_rx_csum6((eh + 1), m, csum);
2491 		break;
2492 #endif
2493 	default:
2494 		c = 1;
2495 	}
2496 	return (c);
2497 }
2498 
2499 static void
2500 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2501 {
2502 	struct ether_vlan_header *evl;
2503 	uint32_t partial;
2504 
2505 	evl = mtod(m, struct ether_vlan_header *);
2506 
2507 	/*
2508 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2509 	 * after what the firmware thought was the end of the ethernet
2510 	 * header.
2511 	 */
2512 
2513 	/* put checksum into host byte order */
2514 	*csum = ntohs(*csum);
2515 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2516 	(*csum) += ~partial;
2517 	(*csum) += ((*csum) < ~partial);
2518 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2519 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2520 
2521 	/* restore checksum to network byte order;
2522 	   later consumers expect this */
2523 	*csum = htons(*csum);
2524 
2525 	/* save the tag */
2526 #ifdef MXGE_NEW_VLAN_API
2527 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2528 #else
2529 	{
2530 		struct m_tag *mtag;
2531 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2532 				   M_NOWAIT);
2533 		if (mtag == NULL)
2534 			return;
2535 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2536 		m_tag_prepend(m, mtag);
2537 	}
2538 
2539 #endif
2540 	m->m_flags |= M_VLANTAG;
2541 
2542 	/*
2543 	 * Remove the 802.1q header by copying the Ethernet
2544 	 * addresses over it and adjusting the beginning of
2545 	 * the data in the mbuf.  The encapsulated Ethernet
2546 	 * type field is already in place.
2547 	 */
2548 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2549 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2550 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2551 }
2552 
2553 static inline void
2554 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2555 		 uint32_t csum, int lro)
2556 {
2557 	mxge_softc_t *sc;
2558 	if_t ifp;
2559 	struct mbuf *m;
2560 	struct ether_header *eh;
2561 	mxge_rx_ring_t *rx;
2562 	bus_dmamap_t old_map;
2563 	int idx;
2564 
2565 	sc = ss->sc;
2566 	ifp = sc->ifp;
2567 	rx = &ss->rx_big;
2568 	idx = rx->cnt & rx->mask;
2569 	rx->cnt += rx->nbufs;
2570 	/* save a pointer to the received mbuf */
2571 	m = rx->info[idx].m;
2572 	/* try to replace the received mbuf */
2573 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2574 		/* drop the frame -- the old mbuf is re-cycled */
2575 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2576 		return;
2577 	}
2578 
2579 	/* unmap the received buffer */
2580 	old_map = rx->info[idx].map;
2581 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2582 	bus_dmamap_unload(rx->dmat, old_map);
2583 
2584 	/* swap the bus_dmamap_t's */
2585 	rx->info[idx].map = rx->extra_map;
2586 	rx->extra_map = old_map;
2587 
2588 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2589 	 * aligned */
2590 	m->m_data += MXGEFW_PAD;
2591 
2592 	m->m_pkthdr.rcvif = ifp;
2593 	m->m_len = m->m_pkthdr.len = len;
2594 	ss->ipackets++;
2595 	eh = mtod(m, struct ether_header *);
2596 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2597 		mxge_vlan_tag_remove(m, &csum);
2598 	}
2599 	/* flowid only valid if RSS hashing is enabled */
2600 	if (sc->num_slices > 1) {
2601 		m->m_pkthdr.flowid = (ss - sc->ss);
2602 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2603 	}
2604 	/* if the checksum is valid, mark it in the mbuf header */
2605 	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2606 	    (0 == mxge_rx_csum(m, csum))) {
2607 		/* Tell the stack that the checksum is good */
2608 		m->m_pkthdr.csum_data = 0xffff;
2609 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2610 			CSUM_DATA_VALID;
2611 
2612 #if defined(INET) || defined (INET6)
2613 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2614 			return;
2615 #endif
2616 	}
2617 	/* pass the frame up the stack */
2618 	if_input(ifp, m);
2619 }
2620 
2621 static inline void
2622 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2623 		   uint32_t csum, int lro)
2624 {
2625 	mxge_softc_t *sc;
2626 	if_t ifp;
2627 	struct ether_header *eh;
2628 	struct mbuf *m;
2629 	mxge_rx_ring_t *rx;
2630 	bus_dmamap_t old_map;
2631 	int idx;
2632 
2633 	sc = ss->sc;
2634 	ifp = sc->ifp;
2635 	rx = &ss->rx_small;
2636 	idx = rx->cnt & rx->mask;
2637 	rx->cnt++;
2638 	/* save a pointer to the received mbuf */
2639 	m = rx->info[idx].m;
2640 	/* try to replace the received mbuf */
2641 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2642 		/* drop the frame -- the old mbuf is re-cycled */
2643 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2644 		return;
2645 	}
2646 
2647 	/* unmap the received buffer */
2648 	old_map = rx->info[idx].map;
2649 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2650 	bus_dmamap_unload(rx->dmat, old_map);
2651 
2652 	/* swap the bus_dmamap_t's */
2653 	rx->info[idx].map = rx->extra_map;
2654 	rx->extra_map = old_map;
2655 
2656 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2657 	 * aligned */
2658 	m->m_data += MXGEFW_PAD;
2659 
2660 	m->m_pkthdr.rcvif = ifp;
2661 	m->m_len = m->m_pkthdr.len = len;
2662 	ss->ipackets++;
2663 	eh = mtod(m, struct ether_header *);
2664 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2665 		mxge_vlan_tag_remove(m, &csum);
2666 	}
2667 	/* flowid only valid if RSS hashing is enabled */
2668 	if (sc->num_slices > 1) {
2669 		m->m_pkthdr.flowid = (ss - sc->ss);
2670 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2671 	}
2672 	/* if the checksum is valid, mark it in the mbuf header */
2673 	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2674 	    (0 == mxge_rx_csum(m, csum))) {
2675 		/* Tell the stack that the checksum is good */
2676 		m->m_pkthdr.csum_data = 0xffff;
2677 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2678 			CSUM_DATA_VALID;
2679 
2680 #if defined(INET) || defined (INET6)
2681 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2682 			return;
2683 #endif
2684 	}
2685 	/* pass the frame up the stack */
2686 	if_input(ifp, m);
2687 }
2688 
2689 static inline void
2690 mxge_clean_rx_done(struct mxge_slice_state *ss)
2691 {
2692 	mxge_rx_done_t *rx_done = &ss->rx_done;
2693 	int limit = 0;
2694 	uint16_t length;
2695 	uint16_t checksum;
2696 	int lro;
2697 
2698 	lro = if_getcapenable(ss->sc->ifp) & IFCAP_LRO;
2699 	while (rx_done->entry[rx_done->idx].length != 0) {
2700 		length = ntohs(rx_done->entry[rx_done->idx].length);
2701 		rx_done->entry[rx_done->idx].length = 0;
2702 		checksum = rx_done->entry[rx_done->idx].checksum;
2703 		if (length <= (MHLEN - MXGEFW_PAD))
2704 			mxge_rx_done_small(ss, length, checksum, lro);
2705 		else
2706 			mxge_rx_done_big(ss, length, checksum, lro);
2707 		rx_done->cnt++;
2708 		rx_done->idx = rx_done->cnt & rx_done->mask;
2709 
2710 		/* limit potential for livelock */
2711 		if (__predict_false(++limit > rx_done->mask / 2))
2712 			break;
2713 	}
2714 #if defined(INET)  || defined (INET6)
2715 	tcp_lro_flush_all(&ss->lc);
2716 #endif
2717 }
2718 
2719 static inline void
2720 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2721 {
2722 	if_t ifp __unused;
2723 	mxge_tx_ring_t *tx;
2724 	struct mbuf *m;
2725 	bus_dmamap_t map;
2726 	int idx;
2727 	int *flags;
2728 
2729 	tx = &ss->tx;
2730 	ifp = ss->sc->ifp;
2731 	while (tx->pkt_done != mcp_idx) {
2732 		idx = tx->done & tx->mask;
2733 		tx->done++;
2734 		m = tx->info[idx].m;
2735 		/* mbuf and DMA map only attached to the first
2736 		   segment per-mbuf */
2737 		if (m != NULL) {
2738 			ss->obytes += m->m_pkthdr.len;
2739 			if (m->m_flags & M_MCAST)
2740 				ss->omcasts++;
2741 			ss->opackets++;
2742 			tx->info[idx].m = NULL;
2743 			map = tx->info[idx].map;
2744 			bus_dmamap_unload(tx->dmat, map);
2745 			m_freem(m);
2746 		}
2747 		if (tx->info[idx].flag) {
2748 			tx->info[idx].flag = 0;
2749 			tx->pkt_done++;
2750 		}
2751 	}
2752 
2753 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2754 	   it's OK to send packets */
2755 	flags = &ss->if_drv_flags;
2756 	mtx_lock(&ss->tx.mtx);
2757 	if ((*flags) & IFF_DRV_OACTIVE &&
2758 	    tx->req - tx->done < (tx->mask + 1)/4) {
2759 		*(flags) &= ~IFF_DRV_OACTIVE;
2760 		ss->tx.wake++;
2761 		mxge_start_locked(ss);
2762 	}
2763 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2764 		/* let the NIC stop polling this queue, since there
2765 		 * are no more transmits pending */
2766 		*tx->send_stop = 1;
2767 		tx->queue_active = 0;
2768 		tx->deactivate++;
2769 		wmb();
2770 	}
2773 	mtx_unlock(&ss->tx.mtx);
2774 }
2775 
2776 static struct mxge_media_type mxge_xfp_media_types[] =
2777 {
2778 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2779 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2780 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2781 	{0,		(1 << 5),	"10GBASE-ER"},
2782 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2783 	{0,		(1 << 3),	"10GBASE-SW"},
2784 	{0,		(1 << 2),	"10GBASE-LW"},
2785 	{0,		(1 << 1),	"10GBASE-EW"},
2786 	{0,		(1 << 0),	"Reserved"}
2787 };
2788 static struct mxge_media_type mxge_sfp_media_types[] =
2789 {
2790 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2791 	{0,		(1 << 7),	"Reserved"},
2792 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2793 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2794 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2795 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2796 };
2797 
2798 static void
2799 mxge_media_set(mxge_softc_t *sc, int media_type)
2800 {
2801 
2802 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2803 		    0, NULL);
2804 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2805 	sc->current_media = media_type;
2806 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2807 }
2808 
2809 static void
2810 mxge_media_init(mxge_softc_t *sc)
2811 {
2812 	char *ptr;
2813 	int i;
2814 
2815 	ifmedia_removeall(&sc->media);
2816 	mxge_media_set(sc, IFM_AUTO);
2817 
2818 	/*
2819 	 * parse the product code to determine the interface type
2820 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2821 	 * after the 3rd dash in the driver's cached copy of the
2822 	 * EEPROM's product code string.
2823 	 */
2824 	ptr = sc->product_code_string;
2825 	if (ptr == NULL) {
2826 		device_printf(sc->dev, "Missing product code\n");
2827 		return;
2828 	}
2829 
2830 	for (i = 0; i < 3; i++, ptr++) {
2831 		ptr = strchr(ptr, '-');
2832 		if (ptr == NULL) {
2833 			device_printf(sc->dev,
2834 				      "only %d dashes in product code?!?\n", i);
2835 			return;
2836 		}
2837 	}
2838 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2839 		/* -C is CX4 */
2840 		sc->connector = MXGE_CX4;
2841 		mxge_media_set(sc, IFM_10G_CX4);
2842 	} else if (*ptr == 'Q') {
2843 		/* -Q is Quad Ribbon Fiber */
2844 		sc->connector = MXGE_QRF;
2845 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2846 		/* FreeBSD has no media type for Quad ribbon fiber */
2847 	} else if (*ptr == 'R') {
2848 		/* -R is XFP */
2849 		sc->connector = MXGE_XFP;
2850 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2851 		/* -S or -2S is SFP+ */
2852 		sc->connector = MXGE_SFP;
2853 	} else {
2854 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2855 	}
2856 }
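/*
 * Example (hypothetical product code): for "10G-PCIE-8B-S" the loop
 * above skips three dashes and leaves ptr at "S", so the connector is
 * classified as SFP+; a trailing "-C" would select CX4 instead.
 */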
2857 
2858 /*
2859  * Determine the media type for a NIC.  Some XFPs will identify
2860  * themselves only when their link is up, so this is initiated via a
2861  * link up interrupt.  However, this can potentially take up to
2862  * several milliseconds, so it is run via the watchdog routine, rather
2863  * than in the interrupt handler itself.
2864  */
2865 static void
2866 mxge_media_probe(mxge_softc_t *sc)
2867 {
2868 	mxge_cmd_t cmd;
2869 	char *cage_type;
2870 
2871 	struct mxge_media_type *mxge_media_types = NULL;
2872 	int i, err, ms, mxge_media_type_entries;
2873 	uint32_t byte;
2874 
2875 	sc->need_media_probe = 0;
2876 
2877 	if (sc->connector == MXGE_XFP) {
2878 		/* -R is XFP */
2879 		mxge_media_types = mxge_xfp_media_types;
2880 		mxge_media_type_entries =
2881 			nitems(mxge_xfp_media_types);
2882 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2883 		cage_type = "XFP";
2884 	} else if (sc->connector == MXGE_SFP) {
2885 		/* -S or -2S is SFP+ */
2886 		mxge_media_types = mxge_sfp_media_types;
2887 		mxge_media_type_entries =
2888 			nitems(mxge_sfp_media_types);
2889 		cage_type = "SFP+";
2890 		byte = 3;
2891 	} else {
2892 		/* nothing to do; media type cannot change */
2893 		return;
2894 	}
2895 
2896 	/*
2897 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2898 	 * we try to determine what is in the cage by using the
2899 	 * firmware's I2C commands to read the module's 10GbE compliance
2900 	 * register.  We read just one byte, which may take over
2901 	 * a millisecond.
2902 	 */
2903 
2904 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2905 	cmd.data1 = byte;
2906 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2907 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2908 		device_printf(sc->dev, "failed to read XFP\n");
2909 	}
2910 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2911 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2912 	}
2913 	if (err != MXGEFW_CMD_OK) {
2914 		return;
2915 	}
2916 
2917 	/* now we wait for the data to be cached */
2918 	cmd.data0 = byte;
2919 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2920 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2921 		DELAY(1000);
2922 		cmd.data0 = byte;
2923 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2924 	}
2925 	if (err != MXGEFW_CMD_OK) {
2926 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2927 			      cage_type, err, ms);
2928 		return;
2929 	}
2930 
2931 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2932 		if (mxge_verbose)
2933 			device_printf(sc->dev, "%s:%s\n", cage_type,
2934 				      mxge_media_types[0].name);
2935 		if (sc->current_media != mxge_media_types[0].flag) {
2936 			mxge_media_init(sc);
2937 			mxge_media_set(sc, mxge_media_types[0].flag);
2938 		}
2939 		return;
2940 	}
2941 	for (i = 1; i < mxge_media_type_entries; i++) {
2942 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2943 			if (mxge_verbose)
2944 				device_printf(sc->dev, "%s:%s\n",
2945 					      cage_type,
2946 					      mxge_media_types[i].name);
2947 
2948 			if (sc->current_media != mxge_media_types[i].flag) {
2949 				mxge_media_init(sc);
2950 				mxge_media_set(sc, mxge_media_types[i].flag);
2951 			}
2952 			return;
2953 		}
2954 	}
2955 	if (mxge_verbose)
2956 		device_printf(sc->dev, "%s media 0x%x unknown\n",
2957 			      cage_type, cmd.data0);
2958 
2959 	return;
2960 }
2961 
2962 static void
2963 mxge_intr(void *arg)
2964 {
2965 	struct mxge_slice_state *ss = arg;
2966 	mxge_softc_t *sc = ss->sc;
2967 	mcp_irq_data_t *stats = ss->fw_stats;
2968 	mxge_tx_ring_t *tx = &ss->tx;
2969 	mxge_rx_done_t *rx_done = &ss->rx_done;
2970 	uint32_t send_done_count;
2971 	uint8_t valid;
2972 
2973 	/* make sure the DMA has finished */
2974 	if (!stats->valid) {
2975 		return;
2976 	}
2977 	valid = stats->valid;
2978 
2979 	if (sc->legacy_irq) {
2980 		/* lower legacy IRQ  */
2981 		*sc->irq_deassert = 0;
2982 		if (!mxge_deassert_wait)
2983 			/* don't wait for conf. that irq is low */
2984 			stats->valid = 0;
2985 	} else {
2986 		stats->valid = 0;
2987 	}
2988 
2989 	/* loop while waiting for legacy irq deassertion */
2990 	do {
2991 		/* check for transmit completes and receives */
2992 		send_done_count = be32toh(stats->send_done_count);
2993 		while ((send_done_count != tx->pkt_done) ||
2994 		       (rx_done->entry[rx_done->idx].length != 0)) {
2995 			if (send_done_count != tx->pkt_done)
2996 				mxge_tx_done(ss, (int)send_done_count);
2997 			mxge_clean_rx_done(ss);
2998 			send_done_count = be32toh(stats->send_done_count);
2999 		}
3000 		if (sc->legacy_irq && mxge_deassert_wait)
3001 			wmb();
3002 	} while (*((volatile uint8_t *) &stats->valid));
3003 
3004 	/* fw link & error stats meaningful only on the first slice */
3005 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3006 		if (sc->link_state != stats->link_up) {
3007 			sc->link_state = stats->link_up;
3008 			if (sc->link_state) {
3009 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3010 				if (mxge_verbose)
3011 					device_printf(sc->dev, "link up\n");
3012 			} else {
3013 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3014 				if (mxge_verbose)
3015 					device_printf(sc->dev, "link down\n");
3016 			}
3017 			sc->need_media_probe = 1;
3018 		}
3019 		if (sc->rdma_tags_available !=
3020 		    be32toh(stats->rdma_tags_available)) {
3021 			sc->rdma_tags_available =
3022 				be32toh(stats->rdma_tags_available);
3023 			device_printf(sc->dev, "RDMA timed out! %d tags "
3024 				      "left\n", sc->rdma_tags_available);
3025 		}
3026 
3027 		if (stats->link_down) {
3028 			sc->down_cnt += stats->link_down;
3029 			sc->link_state = 0;
3030 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3031 		}
3032 	}
3033 
3034 	/* check to see if we have rx token to pass back */
3035 	if (valid & 0x1)
3036 		*ss->irq_claim = be32toh(3);
3037 	*(ss->irq_claim + 1) = be32toh(3);
3038 }
3039 
3040 static void
3041 mxge_init(void *arg)
3042 {
3043 	mxge_softc_t *sc = arg;
3044 	if_t ifp = sc->ifp;
3045 
3046 	mtx_lock(&sc->driver_mtx);
3047 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
3048 		(void) mxge_open(sc);
3049 	mtx_unlock(&sc->driver_mtx);
3050 }
3051 
3052 static void
3053 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3054 {
3055 	int i;
3056 
3057 #if defined(INET) || defined(INET6)
3058 	tcp_lro_free(&ss->lc);
3059 #endif
3060 	for (i = 0; i <= ss->rx_big.mask; i++) {
3061 		if (ss->rx_big.info[i].m == NULL)
3062 			continue;
3063 		bus_dmamap_unload(ss->rx_big.dmat,
3064 				  ss->rx_big.info[i].map);
3065 		m_freem(ss->rx_big.info[i].m);
3066 		ss->rx_big.info[i].m = NULL;
3067 	}
3068 
3069 	for (i = 0; i <= ss->rx_small.mask; i++) {
3070 		if (ss->rx_small.info[i].m == NULL)
3071 			continue;
3072 		bus_dmamap_unload(ss->rx_small.dmat,
3073 				  ss->rx_small.info[i].map);
3074 		m_freem(ss->rx_small.info[i].m);
3075 		ss->rx_small.info[i].m = NULL;
3076 	}
3077 
3078 	/* transmit ring used only on the first slice */
3079 	if (ss->tx.info == NULL)
3080 		return;
3081 
3082 	for (i = 0; i <= ss->tx.mask; i++) {
3083 		ss->tx.info[i].flag = 0;
3084 		if (ss->tx.info[i].m == NULL)
3085 			continue;
3086 		bus_dmamap_unload(ss->tx.dmat,
3087 				  ss->tx.info[i].map);
3088 		m_freem(ss->tx.info[i].m);
3089 		ss->tx.info[i].m = NULL;
3090 	}
3091 }
3092 
3093 static void
3094 mxge_free_mbufs(mxge_softc_t *sc)
3095 {
3096 	int slice;
3097 
3098 	for (slice = 0; slice < sc->num_slices; slice++)
3099 		mxge_free_slice_mbufs(&sc->ss[slice]);
3100 }
3101 
3102 static void
3103 mxge_free_slice_rings(struct mxge_slice_state *ss)
3104 {
3105 	int i;
3106 
3107 	if (ss->rx_done.entry != NULL)
3108 		mxge_dma_free(&ss->rx_done.dma);
3109 	ss->rx_done.entry = NULL;
3110 
3111 	if (ss->tx.req_bytes != NULL)
3112 		free(ss->tx.req_bytes, M_DEVBUF);
3113 	ss->tx.req_bytes = NULL;
3114 
3115 	if (ss->tx.seg_list != NULL)
3116 		free(ss->tx.seg_list, M_DEVBUF);
3117 	ss->tx.seg_list = NULL;
3118 
3119 	if (ss->rx_small.shadow != NULL)
3120 		free(ss->rx_small.shadow, M_DEVBUF);
3121 	ss->rx_small.shadow = NULL;
3122 
3123 	if (ss->rx_big.shadow != NULL)
3124 		free(ss->rx_big.shadow, M_DEVBUF);
3125 	ss->rx_big.shadow = NULL;
3126 
3127 	if (ss->tx.info != NULL) {
3128 		if (ss->tx.dmat != NULL) {
3129 			for (i = 0; i <= ss->tx.mask; i++) {
3130 				bus_dmamap_destroy(ss->tx.dmat,
3131 						   ss->tx.info[i].map);
3132 			}
3133 			bus_dma_tag_destroy(ss->tx.dmat);
3134 		}
3135 		free(ss->tx.info, M_DEVBUF);
3136 	}
3137 	ss->tx.info = NULL;
3138 
3139 	if (ss->rx_small.info != NULL) {
3140 		if (ss->rx_small.dmat != NULL) {
3141 			for (i = 0; i <= ss->rx_small.mask; i++) {
3142 				bus_dmamap_destroy(ss->rx_small.dmat,
3143 						   ss->rx_small.info[i].map);
3144 			}
3145 			bus_dmamap_destroy(ss->rx_small.dmat,
3146 					   ss->rx_small.extra_map);
3147 			bus_dma_tag_destroy(ss->rx_small.dmat);
3148 		}
3149 		free(ss->rx_small.info, M_DEVBUF);
3150 	}
3151 	ss->rx_small.info = NULL;
3152 
3153 	if (ss->rx_big.info != NULL) {
3154 		if (ss->rx_big.dmat != NULL) {
3155 			for (i = 0; i <= ss->rx_big.mask; i++) {
3156 				bus_dmamap_destroy(ss->rx_big.dmat,
3157 						   ss->rx_big.info[i].map);
3158 			}
3159 			bus_dmamap_destroy(ss->rx_big.dmat,
3160 					   ss->rx_big.extra_map);
3161 			bus_dma_tag_destroy(ss->rx_big.dmat);
3162 		}
3163 		free(ss->rx_big.info, M_DEVBUF);
3164 	}
3165 	ss->rx_big.info = NULL;
3166 }
3167 
3168 static void
3169 mxge_free_rings(mxge_softc_t *sc)
3170 {
3171 	int slice;
3172 
3173 	for (slice = 0; slice < sc->num_slices; slice++)
3174 		mxge_free_slice_rings(&sc->ss[slice]);
3175 }
3176 
3177 static int
3178 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3179 		       int tx_ring_entries)
3180 {
3181 	mxge_softc_t *sc = ss->sc;
3182 	size_t bytes;
3183 	int err, i;
3184 
3185 	/* allocate per-slice receive resources */
3186 
3187 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3188 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3189 
3190 	/* allocate the rx shadow rings */
3191 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3192 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3193 
3194 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3195 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3196 
3197 	/* allocate the rx host info rings */
3198 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3199 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3200 
3201 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3202 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3203 
3204 	/* allocate the rx busdma resources */
3205 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3206 				 1,			/* alignment */
3207 				 4096,			/* boundary */
3208 				 BUS_SPACE_MAXADDR,	/* low */
3209 				 BUS_SPACE_MAXADDR,	/* high */
3210 				 NULL, NULL,		/* filter */
3211 				 MHLEN,			/* maxsize */
3212 				 1,			/* num segs */
3213 				 MHLEN,			/* maxsegsize */
3214 				 BUS_DMA_ALLOCNOW,	/* flags */
3215 				 NULL, NULL,		/* lock */
3216 				 &ss->rx_small.dmat);	/* tag */
3217 	if (err != 0) {
3218 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3219 			      err);
3220 		return err;
3221 	}
3222 
3223 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3224 				 1,			/* alignment */
3225 				 0,			/* boundary */
3226 				 BUS_SPACE_MAXADDR,	/* low */
3227 				 BUS_SPACE_MAXADDR,	/* high */
3228 				 NULL, NULL,		/* filter */
3229 				 3*4096,		/* maxsize */
3230 				 1,			/* num segs */
3231 				 MJUM9BYTES,		/* maxsegsize*/
3232 				 BUS_DMA_ALLOCNOW,	/* flags */
3233 				 NULL, NULL,		/* lock */
3234 				 &ss->rx_big.dmat);	/* tag */
3235 	if (err != 0) {
3236 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3237 			      err);
3238 		return err;
3239 	}
3240 	for (i = 0; i <= ss->rx_small.mask; i++) {
3241 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3242 					&ss->rx_small.info[i].map);
3243 		if (err != 0) {
3244 			device_printf(sc->dev, "Err %d rx_small dmamap\n",
3245 				      err);
3246 			return err;
3247 		}
3248 	}
3249 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3250 				&ss->rx_small.extra_map);
3251 	if (err != 0) {
3252 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3253 			      err);
3254 		return err;
3255 	}
3256 
3257 	for (i = 0; i <= ss->rx_big.mask; i++) {
3258 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3259 					&ss->rx_big.info[i].map);
3260 		if (err != 0) {
3261 			device_printf(sc->dev, "Err %d rx_big dmamap\n",
3262 				      err);
3263 			return err;
3264 		}
3265 	}
3266 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3267 				&ss->rx_big.extra_map);
3268 	if (err != 0) {
3269 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3270 			      err);
3271 		return err;
3272 	}
3273 
3274 	/* now allocate TX resources */
3275 
3276 	ss->tx.mask = tx_ring_entries - 1;
3277 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3278 
3279 	/* allocate the tx request copy block */
3280 	bytes = 8 +
3281 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3282 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3283 	/* ensure req_list entries are aligned to 8 bytes */
3284 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3285 		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
3286 
3287 	/* allocate the tx busdma segment list */
3288 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3289 	ss->tx.seg_list = (bus_dma_segment_t *)
3290 		malloc(bytes, M_DEVBUF, M_WAITOK);
3291 
3292 	/* allocate the tx host info ring */
3293 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3294 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3295 
3296 	/* allocate the tx busdma resources */
3297 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3298 				 1,			/* alignment */
3299 				 sc->tx_boundary,	/* boundary */
3300 				 BUS_SPACE_MAXADDR,	/* low */
3301 				 BUS_SPACE_MAXADDR,	/* high */
3302 				 NULL, NULL,		/* filter */
3303 				 65536 + 256,		/* maxsize */
3304 				 ss->tx.max_desc - 2,	/* num segs */
3305 				 sc->tx_boundary,	/* maxsegsz */
3306 				 BUS_DMA_ALLOCNOW,	/* flags */
3307 				 NULL, NULL,		/* lock */
3308 				 &ss->tx.dmat);		/* tag */
3309 
3310 	if (err != 0) {
3311 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3312 			      err);
3313 		return err;
3314 	}
3315 
3316 	/* now use these tags to setup dmamaps for each slot
3317 	   in the ring */
3318 	for (i = 0; i <= ss->tx.mask; i++) {
3319 		err = bus_dmamap_create(ss->tx.dmat, 0,
3320 					&ss->tx.info[i].map);
3321 		if (err != 0) {
3322 			device_printf(sc->dev, "Err %d tx dmamap\n",
3323 				      err);
3324 			return err;
3325 		}
3326 	}
3327 	return 0;
3329 }
3330 
3331 static int
3332 mxge_alloc_rings(mxge_softc_t *sc)
3333 {
3334 	mxge_cmd_t cmd;
3335 	int tx_ring_size;
3336 	int tx_ring_entries, rx_ring_entries;
3337 	int err, slice;
3338 
3339 	/* get ring sizes */
3340 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3341 	tx_ring_size = cmd.data0;
3342 	if (err != 0) {
3343 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3344 		goto abort;
3345 	}
3346 
3347 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3348 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3349 	if_setsendqlen(sc->ifp, tx_ring_entries - 1);
3350 	if_setsendqready(sc->ifp);
3351 
3352 	for (slice = 0; slice < sc->num_slices; slice++) {
3353 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3354 					     rx_ring_entries,
3355 					     tx_ring_entries);
3356 		if (err != 0)
3357 			goto abort;
3358 	}
3359 	return 0;
3360 
3361 abort:
3362 	mxge_free_rings(sc);
3363 	return err;
3365 }
3366 
3367 static void
3368 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3369 {
3370 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3371 
3372 	if (bufsize < MCLBYTES) {
3373 		/* easy, everything fits in a single buffer */
3374 		*big_buf_size = MCLBYTES;
3375 		*cl_size = MCLBYTES;
3376 		*nbufs = 1;
3377 		return;
3378 	}
3379 
3380 	if (bufsize < MJUMPAGESIZE) {
3381 		/* still easy, everything still fits in a single buffer */
3382 		*big_buf_size = MJUMPAGESIZE;
3383 		*cl_size = MJUMPAGESIZE;
3384 		*nbufs = 1;
3385 		return;
3386 	}
3387 	*cl_size = MJUM9BYTES;
3388 	*big_buf_size = MJUM9BYTES;
3389 	*nbufs = 1;
3390 }
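/*
 * Example usage (illustrative, assuming 4KB pages): for a 9000-byte
 * MTU,
 *
 *	int big, cl, nbufs;
 *	mxge_choose_params(9000, &big, &cl, &nbufs);
 *
 * computes bufsize = 9000 + 14 + 4 + 2 = 9020, which exceeds both
 * MCLBYTES and MJUMPAGESIZE, so big == cl == MJUM9BYTES and
 * nbufs == 1.
 */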
3391 
3392 static int
3393 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3394 {
3395 	mxge_softc_t *sc;
3396 	mxge_cmd_t cmd;
3397 	bus_dmamap_t map;
3398 	int err, i, slice;
3399 
3400 	sc = ss->sc;
3401 	slice = ss - sc->ss;
3402 
3403 #if defined(INET) || defined(INET6)
3404 	(void)tcp_lro_init(&ss->lc);
3405 #endif
3406 	ss->lc.ifp = sc->ifp;
3407 
3408 	/* get the lanai pointers to the send and receive rings */
3409 
3410 	err = 0;
3411 
3412 	cmd.data0 = slice;
3413 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3414 	ss->tx.lanai =
3415 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3416 	ss->tx.send_go = (volatile uint32_t *)
3417 		(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3418 	ss->tx.send_stop = (volatile uint32_t *)
3419 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3420 
3421 	cmd.data0 = slice;
3422 	err |= mxge_send_cmd(sc,
3423 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3424 	ss->rx_small.lanai =
3425 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3426 	cmd.data0 = slice;
3427 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3428 	ss->rx_big.lanai =
3429 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3430 
3431 	if (err != 0) {
3432 		device_printf(sc->dev,
3433 			      "failed to get ring sizes or locations\n");
3434 		return EIO;
3435 	}
3436 
3437 	/* stock receive rings */
3438 	for (i = 0; i <= ss->rx_small.mask; i++) {
3439 		map = ss->rx_small.info[i].map;
3440 		err = mxge_get_buf_small(ss, map, i);
3441 		if (err) {
3442 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3443 				      i, ss->rx_small.mask + 1);
3444 			return ENOMEM;
3445 		}
3446 	}
3447 	for (i = 0; i <= ss->rx_big.mask; i++) {
3448 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3449 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3450 	}
3451 	ss->rx_big.nbufs = nbufs;
3452 	ss->rx_big.cl_size = cl_size;
3453 	ss->rx_big.mlen = if_getmtu(ss->sc->ifp) + ETHER_HDR_LEN +
3454 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3455 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3456 		map = ss->rx_big.info[i].map;
3457 		err = mxge_get_buf_big(ss, map, i);
3458 		if (err) {
3459 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3460 				      i, ss->rx_big.mask + 1);
3461 			return ENOMEM;
3462 		}
3463 	}
3464 	return 0;
3465 }
3466 
3467 static int
3468 mxge_open(mxge_softc_t *sc)
3469 {
3470 	mxge_cmd_t cmd;
3471 	int err, big_bytes, nbufs, slice, cl_size, i;
3472 	bus_addr_t bus;
3473 	volatile uint8_t *itable;
3474 	struct mxge_slice_state *ss;
3475 
3476 	/* Copy the MAC address in case it was overridden */
3477 	bcopy(if_getlladdr(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3478 
3479 	err = mxge_reset(sc, 1);
3480 	if (err != 0) {
3481 		device_printf(sc->dev, "failed to reset\n");
3482 		return EIO;
3483 	}
3484 
3485 	if (sc->num_slices > 1) {
3486 		/* setup the indirection table */
3487 		cmd.data0 = sc->num_slices;
3488 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3489 				    &cmd);
3490 
3491 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3492 				     &cmd);
3493 		if (err != 0) {
3494 			device_printf(sc->dev,
3495 				      "failed to setup rss tables\n");
3496 			return err;
3497 		}
3498 
3499 		/* just enable an identity mapping */
3500 		itable = sc->sram + cmd.data0;
3501 		for (i = 0; i < sc->num_slices; i++)
3502 			itable[i] = (uint8_t)i;
3503 
3504 		cmd.data0 = 1;
3505 		cmd.data1 = mxge_rss_hash_type;
3506 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3507 		if (err != 0) {
3508 			device_printf(sc->dev, "failed to enable slices\n");
3509 			return err;
3510 		}
3511 	}
3512 
3513 	mxge_choose_params(if_getmtu(sc->ifp), &big_bytes, &cl_size, &nbufs);
3514 
3515 	cmd.data0 = nbufs;
3516 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3517 			    &cmd);
3518 	/* error is only meaningful if we're trying to set
3519 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3520 	if (err && nbufs > 1) {
3521 		device_printf(sc->dev,
3522 			      "Failed to set always-use-n to %d\n",
3523 			      nbufs);
3524 		return EIO;
3525 	}
3526 	/* Give the firmware the mtu and the big and small buffer
3527 	   sizes.  The firmware wants the big buf size to be a power
3528 	   of two. Luckily, FreeBSD's clusters are powers of two */
3529 	cmd.data0 = if_getmtu(sc->ifp) + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3530 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3531 	cmd.data0 = MHLEN - MXGEFW_PAD;
3532 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3533 			     &cmd);
3534 	cmd.data0 = big_bytes;
3535 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3536 
3537 	if (err != 0) {
3538 		device_printf(sc->dev, "failed to setup params\n");
3539 		goto abort;
3540 	}
3541 
3542 	/* Now give the firmware the pointer to the stats block */
3543 	for (slice = 0; slice < sc->num_slices; slice++) {
3544 		ss = &sc->ss[slice];
3545 		cmd.data0 =
3546 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3547 		cmd.data1 =
3548 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3549 		cmd.data2 = sizeof(struct mcp_irq_data);
3550 		cmd.data2 |= (slice << 16);
3551 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3552 	}
3553 
3554 	if (err != 0) {
3555 		bus = sc->ss->fw_stats_dma.bus_addr;
3556 		bus += offsetof(struct mcp_irq_data, send_done_count);
3557 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3558 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3559 		err = mxge_send_cmd(sc,
3560 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3561 				    &cmd);
3562 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3563 		sc->fw_multicast_support = 0;
3564 	} else {
3565 		sc->fw_multicast_support = 1;
3566 	}
3567 
3568 	if (err != 0) {
3569 		device_printf(sc->dev, "failed to setup params\n");
3570 		goto abort;
3571 	}
3572 
3573 	for (slice = 0; slice < sc->num_slices; slice++) {
3574 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3575 		if (err != 0) {
3576 			device_printf(sc->dev, "couldn't open slice %d\n",
3577 				      slice);
3578 			goto abort;
3579 		}
3580 	}
3581 
3582 	/* Finally, start the firmware running */
3583 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3584 	if (err) {
3585 		device_printf(sc->dev, "Couldn't bring up link\n");
3586 		goto abort;
3587 	}
3588 	for (slice = 0; slice < sc->num_slices; slice++) {
3589 		ss = &sc->ss[slice];
3590 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3591 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3592 	}
3593 	if_setdrvflagbits(sc->ifp, IFF_DRV_RUNNING, 0);
3594 	if_setdrvflagbits(sc->ifp, 0, IFF_DRV_OACTIVE);
3595 
3596 	return 0;
3597 
3598 abort:
3599 	mxge_free_mbufs(sc);
3600 
3601 	return err;
3602 }
3603 
3604 static int
3605 mxge_close(mxge_softc_t *sc, int down)
3606 {
3607 	mxge_cmd_t cmd;
3608 	int err, old_down_cnt;
3609 	struct mxge_slice_state *ss;
3610 	int slice;
3611 
3612 	for (slice = 0; slice < sc->num_slices; slice++) {
3613 		ss = &sc->ss[slice];
3614 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3615 	}
3616 	if_setdrvflagbits(sc->ifp, 0, IFF_DRV_RUNNING);
3617 	if (!down) {
3618 		old_down_cnt = sc->down_cnt;
3619 		wmb();
3620 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3621 		if (err) {
3622 			device_printf(sc->dev,
3623 				      "Couldn't bring down link\n");
3624 		}
3625 		if (old_down_cnt == sc->down_cnt) {
3626 			/* wait for down irq */
3627 			DELAY(10 * sc->intr_coal_delay);
3628 		}
3629 		wmb();
3630 		if (old_down_cnt == sc->down_cnt) {
3631 			device_printf(sc->dev, "never got down irq\n");
3632 		}
3633 	}
3634 	mxge_free_mbufs(sc);
3635 
3636 	return 0;
3637 }
3638 
3639 static void
3640 mxge_setup_cfg_space(mxge_softc_t *sc)
3641 {
3642 	device_t dev = sc->dev;
3643 	int reg;
3644 	uint16_t lnk, pectl;
3645 
3646 	/* find the PCIe link width and set max read request to 4KB */
3647 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3648 		lnk = pci_read_config(dev, reg + 0x12, 2);
3649 		sc->link_width = (lnk >> 4) & 0x3f;
3650 
3651 		if (sc->pectl == 0) {
3652 			pectl = pci_read_config(dev, reg + 0x8, 2);
3653 			pectl = (pectl & ~0x7000) | (5 << 12);
3654 			pci_write_config(dev, reg + 0x8, pectl, 2);
3655 			sc->pectl = pectl;
3656 		} else {
3657 			/* restore saved pectl after watchdog reset */
3658 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3659 		}
3660 	}
3661 
3662 	/* Enable DMA and Memory space access */
3663 	pci_enable_busmaster(dev);
3664 }
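/*
 * Register math used above: reg + 0x12 is the PCIe Link Status
 * register, whose bits 9:4 hold the negotiated link width, and
 * reg + 0x8 is the Device Control register, whose bits 14:12 encode
 * the maximum read request size; the value 5 written there selects
 * 128 << 5 = 4096 bytes.
 */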
3665 
3666 static uint32_t
3667 mxge_read_reboot(mxge_softc_t *sc)
3668 {
3669 	device_t dev = sc->dev;
3670 	uint32_t vs;
3671 
3672 	/* find the vendor specific offset */
3673 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3674 		device_printf(sc->dev,
3675 			      "could not find vendor specific offset\n");
3676 		return (uint32_t)-1;
3677 	}
3678 	/* enable read32 mode */
3679 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3680 	/* tell NIC which register to read */
3681 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3682 	return (pci_read_config(dev, vs + 0x14, 4));
3683 }
3684 
3685 static void
3686 mxge_watchdog_reset(mxge_softc_t *sc)
3687 {
3688 	struct pci_devinfo *dinfo;
3689 	struct mxge_slice_state *ss;
3690 	int err, running, s, num_tx_slices = 1;
3691 	uint32_t reboot;
3692 	uint16_t cmd;
3693 
3694 	err = ENXIO;
3695 
3696 	device_printf(sc->dev, "Watchdog reset!\n");
3697 
3698 	/*
3699 	 * check to see if the NIC rebooted.  If it did, then all of
3700 	 * PCI config space has been reset, and things like the
3701 	 * busmaster bit will be zero.  If this is the case, then we
3702 	 * must restore PCI config space before the NIC can be used
3703 	 * again
3704 	 */
3705 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3706 	if (cmd == 0xffff) {
3707 		/*
3708 		 * maybe the watchdog caught the NIC rebooting; wait
3709 		 * up to 100ms for it to finish.  If it does not come
3710 		 * back, then give up
3711 		 */
3712 		DELAY(1000*100);
3713 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3714 		if (cmd == 0xffff) {
3715 			device_printf(sc->dev, "NIC disappeared!\n");
3716 		}
3717 	}
3718 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3719 		/* print the reboot status */
3720 		reboot = mxge_read_reboot(sc);
3721 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3722 			      reboot);
3723 		running = if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING;
3724 		if (running) {
3725 			/*
3726 			 * quiesce NIC so that TX routines will not try to
3727 			 * xmit after restoration of BAR
3728 			 */
3729 
3730 			/* Mark the link as down */
3731 			if (sc->link_state) {
3732 				sc->link_state = 0;
3733 				if_link_state_change(sc->ifp,
3734 						     LINK_STATE_DOWN);
3735 			}
3736 
3737 			num_tx_slices = sc->num_slices;
3738 
3739 			/* grab all TX locks to ensure no tx  */
3740 			for (s = 0; s < num_tx_slices; s++) {
3741 				ss = &sc->ss[s];
3742 				mtx_lock(&ss->tx.mtx);
3743 			}
3744 			mxge_close(sc, 1);
3745 		}
3746 		/* restore PCI configuration space */
3747 		dinfo = device_get_ivars(sc->dev);
3748 		pci_cfg_restore(sc->dev, dinfo);
3749 
3750 		/* and redo any changes we made to our config space */
3751 		mxge_setup_cfg_space(sc);
3752 
3753 		/* reload f/w */
3754 		err = mxge_load_firmware(sc, 0);
3755 		if (err) {
3756 			device_printf(sc->dev,
3757 				      "Unable to re-load f/w\n");
3758 		}
3759 		if (running) {
3760 			if (!err)
3761 				err = mxge_open(sc);
3762 			/* release all TX locks */
3763 			for (s = 0; s < num_tx_slices; s++) {
3764 				ss = &sc->ss[s];
3765 				mxge_start_locked(ss);
3766 				mtx_unlock(&ss->tx.mtx);
3767 			}
3768 		}
3769 		sc->watchdog_resets++;
3770 	} else {
3771 		device_printf(sc->dev,
3772 			      "NIC did not reboot, not resetting\n");
3773 		err = 0;
3774 	}
3775 	if (err) {
3776 		device_printf(sc->dev, "watchdog reset failed\n");
3777 	} else {
3778 		if (sc->dying == 2)
3779 			sc->dying = 0;
3780 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3781 	}
3782 }
3783 
3784 static void
3785 mxge_watchdog_task(void *arg, int pending)
3786 {
3787 	mxge_softc_t *sc = arg;
3788 
3789 	mtx_lock(&sc->driver_mtx);
3790 	mxge_watchdog_reset(sc);
3791 	mtx_unlock(&sc->driver_mtx);
3792 }
3793 
3794 static void
3795 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3796 {
3797 	tx = &sc->ss[slice].tx;
3798 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3799 	device_printf(sc->dev,
3800 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3801 		      tx->req, tx->done, tx->queue_active);
3802 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3803 			      tx->activate, tx->deactivate);
3804 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3805 		      tx->pkt_done,
3806 		      be32toh(sc->ss->fw_stats->send_done_count));
3807 }
3808 
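/*
 * Detect a wedged transmit ring: a slice is suspect when its tx ring
 * made no progress since the previous watchdog pass (done is still
 * equal to watchdog_done) even though requests were outstanding.  If
 * the NIC's dropped_pause counter also did not move, the hardware is
 * assumed hung and a reset is queued; if it did move, the link partner
 * is flow-controlling our transmits and we only complain.
 */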
3809 static int
3810 mxge_watchdog(mxge_softc_t *sc)
3811 {
3812 	mxge_tx_ring_t *tx;
3813 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3814 	int i, err = 0;
3815 
3816 	/* see if we have outstanding transmits, which
3817 	   have been pending for more than mxge_ticks */
3818 	for (i = 0; (i < sc->num_slices) && (err == 0); i++) {
3819 		tx = &sc->ss[i].tx;
3820 		if (tx->req != tx->done &&
3821 		    tx->watchdog_req != tx->watchdog_done &&
3822 		    tx->done == tx->watchdog_done) {
3823 			/* check for pause blocking before resetting */
3824 			if (tx->watchdog_rx_pause == rx_pause) {
3825 				mxge_warn_stuck(sc, tx, i);
3826 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3827 				return (ENXIO);
3828 			} else
3830 				device_printf(sc->dev, "Flow control blocking "
3831 					      "xmits, check link partner\n");
3832 		}
3833 
3834 		tx->watchdog_req = tx->req;
3835 		tx->watchdog_done = tx->done;
3836 		tx->watchdog_rx_pause = rx_pause;
3837 	}
3838 
3839 	if (sc->need_media_probe)
3840 		mxge_media_probe(sc);
3841 	return (err);
3842 }
3843 
3844 static uint64_t
3845 mxge_get_counter(if_t ifp, ift_counter cnt)
3846 {
3847 	struct mxge_softc *sc;
3848 	uint64_t rv;
3849 
3850 	sc = if_getsoftc(ifp);
3851 	rv = 0;
3852 
3853 	switch (cnt) {
3854 	case IFCOUNTER_IPACKETS:
3855 		for (int s = 0; s < sc->num_slices; s++)
3856 			rv += sc->ss[s].ipackets;
3857 		return (rv);
3858 	case IFCOUNTER_OPACKETS:
3859 		for (int s = 0; s < sc->num_slices; s++)
3860 			rv += sc->ss[s].opackets;
3861 		return (rv);
3862 	case IFCOUNTER_OERRORS:
3863 		for (int s = 0; s < sc->num_slices; s++)
3864 			rv += sc->ss[s].oerrors;
3865 		return (rv);
3866 	case IFCOUNTER_OBYTES:
3867 		for (int s = 0; s < sc->num_slices; s++)
3868 			rv += sc->ss[s].obytes;
3869 		return (rv);
3870 	case IFCOUNTER_OMCASTS:
3871 		for (int s = 0; s < sc->num_slices; s++)
3872 			rv += sc->ss[s].omcasts;
3873 		return (rv);
3874 	case IFCOUNTER_OQDROPS:
3875 		for (int s = 0; s < sc->num_slices; s++)
3876 			rv += sc->ss[s].tx.br->br_drops;
3877 		return (rv);
3878 	default:
3879 		return (if_get_counter_default(ifp, cnt));
3880 	}
3881 }
3882 
3883 static void
3884 mxge_tick(void *arg)
3885 {
3886 	mxge_softc_t *sc = arg;
3887 	u_long pkts = 0;
3888 	int err = 0;
3889 	int running, ticks;
3890 	uint16_t cmd;
3891 
3892 	ticks = mxge_ticks;
3893 	running = if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING;
3894 	if (running) {
3895 		if (!sc->watchdog_countdown) {
3896 			err = mxge_watchdog(sc);
3897 			sc->watchdog_countdown = 4;
3898 		}
3899 		sc->watchdog_countdown--;
3900 	}
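	/*
	 * pkts is initialized to 0 and never updated, so the idle check
	 * below always runs: it catches a NIC that suffered a hardware
	 * fault while quiet, and stretches the callout to 4 * mxge_ticks.
	 */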
3901 	if (pkts == 0) {
3902 		/* ensure NIC did not suffer h/w fault while idle */
3903 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3904 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3905 			sc->dying = 2;
3906 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3907 			err = ENXIO;
3908 		}
3909 		/* look less often if NIC is idle */
3910 		ticks *= 4;
3911 	}
3912 
3913 	if (err == 0)
3914 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3915 
3916 }
3917 
3918 static int
3919 mxge_media_change(if_t ifp)
3920 {
3921 	return EINVAL;
3922 }
3923 
3924 static int
3925 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3926 {
3927 	if_t ifp = sc->ifp;
3928 	int real_mtu, old_mtu;
3929 	int err = 0;
3930 
3931 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3932 	if (real_mtu > sc->max_mtu || real_mtu < 60)
3933 		return EINVAL;
3934 	mtx_lock(&sc->driver_mtx);
3935 	old_mtu = if_getmtu(ifp);
3936 	if_setmtu(ifp, mtu);
3937 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3938 		mxge_close(sc, 0);
3939 		err = mxge_open(sc);
3940 		if (err != 0) {
3941 			if_setmtu(ifp, old_mtu);
3942 			mxge_close(sc, 0);
3943 			(void) mxge_open(sc);
3944 		}
3945 	}
3946 	mtx_unlock(&sc->driver_mtx);
3947 	return err;
3948 }
3949 
3950 static void
3951 mxge_media_status(if_t ifp, struct ifmediareq *ifmr)
3952 {
3953 	mxge_softc_t *sc = if_getsoftc(ifp);
3954 
3955 	if (sc == NULL)
3956 		return;
3957 	ifmr->ifm_status = IFM_AVALID;
3958 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3959 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3960 	ifmr->ifm_active |= sc->current_media;
3961 }
3962 
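/*
 * Read part of an SFP/XFP module EEPROM over I2C.  Following SFF-8472,
 * device address 0xA0 is the identification EEPROM and 0xA2 the
 * diagnostics page.  One byte is fetched per iteration: I2C_READ asks
 * the firmware to cache the byte (data0 = 0 requests a single byte
 * rather than all 256), and I2C_BYTE then polls for the cached value,
 * retrying for up to ~50ms while the firmware reports EBUSY.
 */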
3963 static int
3964 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
3965 {
3966 	mxge_cmd_t cmd;
3967 	uint32_t i2c_args;
3968 	int i, ms, err;
3969 
3970 	if (i2c->dev_addr != 0xA0 &&
3971 	    i2c->dev_addr != 0xA2)
3972 		return (EINVAL);
3973 	if (i2c->len > sizeof(i2c->data))
3974 		return (EINVAL);
3975 
3976 	for (i = 0; i < i2c->len; i++) {
3977 		i2c_args = i2c->dev_addr << 0x8;
3978 		i2c_args |= i2c->offset + i;
3979 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3980 		cmd.data1 = i2c_args;
3981 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3982 
3983 		if (err != MXGEFW_CMD_OK)
3984 			return (EIO);
3985 		/* now we wait for the data to be cached */
3986 		cmd.data0 = i2c_args & 0xff;
3987 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3988 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3989 			cmd.data0 = i2c_args & 0xff;
3990 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3991 			if (err == EBUSY)
3992 				DELAY(1000);
3993 		}
3994 		if (err != MXGEFW_CMD_OK)
3995 			return (EIO);
3996 		i2c->data[i] = cmd.data0;
3997 	}
3998 	return (0);
3999 }
4000 
4001 static int
4002 mxge_ioctl(if_t ifp, u_long command, caddr_t data)
4003 {
4004 	mxge_softc_t *sc = if_getsoftc(ifp);
4005 	struct ifreq *ifr = (struct ifreq *)data;
4006 	struct ifi2creq i2c;
4007 	int err, mask;
4008 
4009 	err = 0;
4010 	switch (command) {
4011 	case SIOCSIFMTU:
4012 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4013 		break;
4014 
4015 	case SIOCSIFFLAGS:
4016 		mtx_lock(&sc->driver_mtx);
4017 		if (sc->dying) {
4018 			mtx_unlock(&sc->driver_mtx);
4019 			return EINVAL;
4020 		}
4021 		if (if_getflags(ifp) & IFF_UP) {
4022 			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) {
4023 				err = mxge_open(sc);
4024 			} else {
4025 				/* take care of promisc and allmulti
4026 				   flag changes */
4027 				mxge_change_promisc(sc,
4028 						    if_getflags(ifp) & IFF_PROMISC);
4029 				mxge_set_multicast_list(sc);
4030 			}
4031 		} else {
4032 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4033 				mxge_close(sc, 0);
4034 			}
4035 		}
4036 		mtx_unlock(&sc->driver_mtx);
4037 		break;
4038 
4039 	case SIOCADDMULTI:
4040 	case SIOCDELMULTI:
4041 		mtx_lock(&sc->driver_mtx);
4042 		if (sc->dying) {
4043 			mtx_unlock(&sc->driver_mtx);
4044 			return (EINVAL);
4045 		}
4046 		mxge_set_multicast_list(sc);
4047 		mtx_unlock(&sc->driver_mtx);
4048 		break;
4049 
4050 	case SIOCSIFCAP:
4051 		mtx_lock(&sc->driver_mtx);
4052 		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
4053 		if (mask & IFCAP_TXCSUM) {
4054 			if (IFCAP_TXCSUM & if_getcapenable(ifp)) {
4055 				mask &= ~IFCAP_TSO4;
4056 				if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM|IFCAP_TSO4));
4057 				if_sethwassistbits(ifp, 0, (CSUM_TCP | CSUM_UDP));
4058 			} else {
4059 				if_setcapenablebit(ifp, IFCAP_TXCSUM, 0);
4060 				if_sethwassistbits(ifp, (CSUM_TCP | CSUM_UDP), 0);
4061 			}
4062 		}
4063 		if (mask & IFCAP_RXCSUM) {
4064 			if (IFCAP_RXCSUM & if_getcapenable(ifp)) {
4065 				if_setcapenablebit(ifp, 0, IFCAP_RXCSUM);
4066 			} else {
4067 				if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);
4068 			}
4069 		}
4070 		if (mask & IFCAP_TSO4) {
4071 			if (IFCAP_TSO4 & if_getcapenable(ifp)) {
4072 				if_setcapenablebit(ifp, 0, IFCAP_TSO4);
4073 			} else if (IFCAP_TXCSUM & if_getcapenable(ifp)) {
4074 				if_setcapenablebit(ifp, IFCAP_TSO4, 0);
4075 				if_sethwassistbits(ifp, CSUM_TSO, 0);
4076 			} else {
4077 				printf("mxge requires tx checksum offload"
4078 				       " be enabled to use TSO\n");
4079 				err = EINVAL;
4080 			}
4081 		}
4082 #if IFCAP_TSO6
4083 		if (mask & IFCAP_TXCSUM_IPV6) {
4084 			if (IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp)) {
4085 				mask &= ~IFCAP_TSO6;
4086 				if_setcapenablebit(ifp, 0,
4087 				    IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
4088 				if_sethwassistbits(ifp, 0,
4089 				    CSUM_TCP_IPV6 | CSUM_UDP_IPV6);
4090 			} else {
4091 				if_setcapenablebit(ifp, IFCAP_TXCSUM_IPV6, 0);
4092 				if_sethwassistbits(ifp,
4093 				    CSUM_TCP_IPV6 | CSUM_UDP_IPV6, 0);
4094 			}
4095 		}
4096 		if (mask & IFCAP_RXCSUM_IPV6) {
4097 			if (IFCAP_RXCSUM_IPV6 & if_getcapenable(ifp)) {
4098 				if_setcapenablebit(ifp, 0, IFCAP_RXCSUM_IPV6);
4099 			} else {
4100 				if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0);
4101 			}
4102 		}
4103 		if (mask & IFCAP_TSO6) {
4104 			if (IFCAP_TSO6 & if_getcapenable(ifp)) {
4105 				if_setcapenablebit(ifp, 0, IFCAP_TSO6);
4106 			} else if (IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp)) {
4107 				if_setcapenablebit(ifp, IFCAP_TSO6, 0);
4108 				if_sethwassistbits(ifp, CSUM_TSO, 0);
4109 			} else {
4110 				printf("mxge requires tx checksum offload"
4111 				       " be enabled to use TSO\n");
4112 				err = EINVAL;
4113 			}
4114 		}
4115 #endif /* IFCAP_TSO6 */
4116 
4117 		if (mask & IFCAP_LRO)
4118 			if_togglecapenable(ifp, IFCAP_LRO);
4119 		if (mask & IFCAP_VLAN_HWTAGGING)
4120 			if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING);
4121 		if (mask & IFCAP_VLAN_HWTSO)
4122 			if_togglecapenable(ifp, IFCAP_VLAN_HWTSO);
4123 
4124 		if (!(if_getcapabilities(ifp) & IFCAP_VLAN_HWTSO) ||
4125 		    !(if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING))
4126 			if_setcapenablebit(ifp, 0, IFCAP_VLAN_HWTSO);
4127 
4128 		mtx_unlock(&sc->driver_mtx);
4129 		VLAN_CAPABILITIES(ifp);
4130 
4131 		break;
4132 
4133 	case SIOCGIFMEDIA:
4134 		mtx_lock(&sc->driver_mtx);
4135 		if (sc->dying) {
4136 			mtx_unlock(&sc->driver_mtx);
4137 			return (EINVAL);
4138 		}
4139 		mxge_media_probe(sc);
4140 		mtx_unlock(&sc->driver_mtx);
4141 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4142 				    &sc->media, command);
4143 		break;
4144 
4145 	case SIOCGI2C:
4146 		if (sc->connector != MXGE_XFP &&
4147 		    sc->connector != MXGE_SFP) {
4148 			err = ENXIO;
4149 			break;
4150 		}
4151 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4152 		if (err != 0)
4153 			break;
4154 		mtx_lock(&sc->driver_mtx);
4155 		if (sc->dying) {
4156 			mtx_unlock(&sc->driver_mtx);
4157 			return (EINVAL);
4158 		}
4159 		err = mxge_fetch_i2c(sc, &i2c);
4160 		mtx_unlock(&sc->driver_mtx);
4161 		if (err == 0)
4162 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4163 			    sizeof(i2c));
4164 		break;
4165 	default:
4166 		err = ether_ioctl(ifp, command, data);
4167 		break;
4168 	}
4169 	return err;
4170 }
4171 
4172 static void
4173 mxge_fetch_tunables(mxge_softc_t *sc)
4174 {
4175 
4176 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4177 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4178 			  &mxge_flow_control);
4179 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4180 			  &mxge_intr_coal_delay);
4181 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4182 			  &mxge_nvidia_ecrc_enable);
4183 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4184 			  &mxge_force_firmware);
4185 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4186 			  &mxge_deassert_wait);
4187 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4188 			  &mxge_verbose);
4189 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4190 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
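	/* accept either spelling of the RSS hash type tunable */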
4191 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4192 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4193 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4194 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4195 
4196 	if (bootverbose)
4197 		mxge_verbose = 1;
4198 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4199 		mxge_intr_coal_delay = 30;
4200 	if (mxge_ticks == 0)
4201 		mxge_ticks = hz / 2;
4202 	sc->pause = mxge_flow_control;
4203 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4204 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4205 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4206 	}
4207 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4208 	    mxge_initial_mtu < ETHER_MIN_LEN)
4209 		mxge_initial_mtu = ETHERMTU_JUMBO;
4210 
4211 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4212 		mxge_throttle = MXGE_MAX_THROTTLE;
4213 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4214 		mxge_throttle = MXGE_MIN_THROTTLE;
4215 	sc->throttle = mxge_throttle;
4216 }
4217 
4218 static void
4219 mxge_free_slices(mxge_softc_t *sc)
4220 {
4221 	struct mxge_slice_state *ss;
4222 	int i;
4223 
4224 	if (sc->ss == NULL)
4225 		return;
4226 
4227 	for (i = 0; i < sc->num_slices; i++) {
4228 		ss = &sc->ss[i];
4229 		if (ss->fw_stats != NULL) {
4230 			mxge_dma_free(&ss->fw_stats_dma);
4231 			ss->fw_stats = NULL;
4232 			if (ss->tx.br != NULL) {
4233 				drbr_free(ss->tx.br, M_DEVBUF);
4234 				ss->tx.br = NULL;
4235 			}
4236 			mtx_destroy(&ss->tx.mtx);
4237 		}
4238 		if (ss->rx_done.entry != NULL) {
4239 			mxge_dma_free(&ss->rx_done.dma);
4240 			ss->rx_done.entry = NULL;
4241 		}
4242 	}
4243 	free(sc->ss, M_DEVBUF);
4244 	sc->ss = NULL;
4245 }
4246 
4247 static int
4248 mxge_alloc_slices(mxge_softc_t *sc)
4249 {
4250 	mxge_cmd_t cmd;
4251 	struct mxge_slice_state *ss;
4252 	size_t bytes;
4253 	int err, i, max_intr_slots;
4254 
4255 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4256 	if (err != 0) {
4257 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4258 		return err;
4259 	}
4260 	sc->rx_ring_size = cmd.data0;
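	/*
	 * The firmware reports the receive ring size in bytes of
	 * mcp_dma_addr_t entries.  The interrupt (completion) queue is
	 * sized for twice that many slots, presumably because each slice
	 * feeds both a small and a big receive ring.
	 */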
4261 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4262 
4263 	bytes = sizeof (*sc->ss) * sc->num_slices;
4264 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4265 	if (sc->ss == NULL)
4266 		return (ENOMEM);
4267 	for (i = 0; i < sc->num_slices; i++) {
4268 		ss = &sc->ss[i];
4269 
4270 		ss->sc = sc;
4271 
4272 		/* allocate per-slice rx interrupt queues */
4273 
4274 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4275 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4276 		if (err != 0)
4277 			goto abort;
4278 		ss->rx_done.entry = ss->rx_done.dma.addr;
4279 		bzero(ss->rx_done.entry, bytes);
4280 
4281 		/*
4282 		 * allocate the per-slice firmware stats; stats
4283 		 * (including tx) are used only on the first
4284 		 * slice for now
4285 		 */
4286 
4287 		bytes = sizeof (*ss->fw_stats);
4288 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4289 				     sizeof (*ss->fw_stats), 64);
4290 		if (err != 0)
4291 			goto abort;
4292 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4293 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4294 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4295 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4296 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4297 					   &ss->tx.mtx);
4298 	}
4299 
4300 	return (0);
4301 
4302 abort:
4303 	mxge_free_slices(sc);
4304 	return (ENOMEM);
4305 }
4306 
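/*
 * Decide how many slices (receive queues, each with its own interrupt)
 * to use.  Multiple slices require the RSS firmware variant, at least
 * two MSI-X vectors, and an SMP machine; the result is clamped to the
 * CPU count (or the hw.mxge.max_slices tunable) and rounded down to a
 * power of two, presumably so the firmware's RSS hash can index a
 * slice with a simple mask.
 */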
4307 static void
4308 mxge_slice_probe(mxge_softc_t *sc)
4309 {
4310 	mxge_cmd_t cmd;
4311 	char *old_fw;
4312 	int msix_cnt, status, max_intr_slots;
4313 
4314 	sc->num_slices = 1;
4315 	/*
4316 	 *  don't enable multiple slices if the tunable disables them,
4317 	 *  or if this is not an SMP system
4318 	 */
4319 
4320 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4321 		return;
4322 
4323 	/* see how many MSI-X interrupts are available */
4324 	msix_cnt = pci_msix_count(sc->dev);
4325 	if (msix_cnt < 2)
4326 		return;
4327 
4328 	/* now load the slice-aware firmware to see what it supports */
4329 	old_fw = sc->fw_name;
4330 	if (old_fw == mxge_fw_aligned)
4331 		sc->fw_name = mxge_fw_rss_aligned;
4332 	else
4333 		sc->fw_name = mxge_fw_rss_unaligned;
4334 	status = mxge_load_firmware(sc, 0);
4335 	if (status != 0) {
4336 		device_printf(sc->dev, "Falling back to a single slice\n");
4337 		return;
4338 	}
4339 
4340 	/* try to send a reset command to the card to see if it
4341 	   is alive */
4342 	memset(&cmd, 0, sizeof (cmd));
4343 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4344 	if (status != 0) {
4345 		device_printf(sc->dev, "failed reset\n");
4346 		goto abort_with_fw;
4347 	}
4348 
4349 	/* get rx ring size */
4350 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4351 	if (status != 0) {
4352 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4353 		goto abort_with_fw;
4354 	}
4355 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4356 
4357 	/* tell it the size of the interrupt queues */
4358 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4359 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4360 	if (status != 0) {
4361 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4362 		goto abort_with_fw;
4363 	}
4364 
4365 	/* ask for the maximum number of slices it supports */
4366 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4367 	if (status != 0) {
4368 		device_printf(sc->dev,
4369 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4370 		goto abort_with_fw;
4371 	}
4372 	sc->num_slices = cmd.data0;
4373 	if (sc->num_slices > msix_cnt)
4374 		sc->num_slices = msix_cnt;
4375 
4376 	if (mxge_max_slices == -1) {
4377 		/* cap to number of CPUs in system */
4378 		if (sc->num_slices > mp_ncpus)
4379 			sc->num_slices = mp_ncpus;
4380 	} else {
4381 		if (sc->num_slices > mxge_max_slices)
4382 			sc->num_slices = mxge_max_slices;
4383 	}
4384 	/* round down to a power of two (n & (n - 1) is nonzero until it is) */
4385 	while (sc->num_slices & (sc->num_slices - 1))
4386 		sc->num_slices--;
4387 
4388 	if (mxge_verbose)
4389 		device_printf(sc->dev, "using %d slices\n",
4390 			      sc->num_slices);
4391 
4392 	return;
4393 
4394 abort_with_fw:
4395 	sc->fw_name = old_fw;
4396 	(void) mxge_load_firmware(sc, 0);
4397 }
4398 
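/*
 * Set up one MSI-X vector per slice.  The MSI-X table lives in BAR(2),
 * so that BAR must remain allocated for as long as the vectors are in
 * use; bus IRQ resource IDs for MSI-X messages start at 1.
 */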
4399 static int
4400 mxge_add_msix_irqs(mxge_softc_t *sc)
4401 {
4402 	size_t bytes;
4403 	int count, err, i, rid;
4404 
4405 	rid = PCIR_BAR(2);
4406 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4407 						    &rid, RF_ACTIVE);
4408 
4409 	if (sc->msix_table_res == NULL) {
4410 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4411 		return ENXIO;
4412 	}
4413 
4414 	count = sc->num_slices;
4415 	err = pci_alloc_msix(sc->dev, &count);
4416 	if (err != 0) {
4417 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4418 			      "err = %d\n", sc->num_slices, err);
4419 		goto abort_with_msix_table;
4420 	}
4421 	if (count < sc->num_slices) {
4422 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4423 			      sc->num_slices, count);
4424 		device_printf(sc->dev,
4425 			      "Try setting hw.mxge.max_slices to %d\n",
4426 			      count);
4427 		err = ENOSPC;
4428 		goto abort_with_msix;
4429 	}
4430 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4431 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4432 	if (sc->msix_irq_res == NULL) {
4433 		err = ENOMEM;
4434 		goto abort_with_msix;
4435 	}
4436 
4437 	for (i = 0; i < sc->num_slices; i++) {
4438 		rid = i + 1;
4439 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4440 							  SYS_RES_IRQ,
4441 							  &rid, RF_ACTIVE);
4442 		if (sc->msix_irq_res[i] == NULL) {
4443 			device_printf(sc->dev, "couldn't allocate IRQ res"
4444 				      " for message %d\n", i);
4445 			err = ENXIO;
4446 			goto abort_with_res;
4447 		}
4448 	}
4449 
4450 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4451 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4452 
4453 	for (i = 0; i < sc->num_slices; i++) {
4454 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4455 				     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4456 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4457 		if (err != 0) {
4458 			device_printf(sc->dev, "couldn't setup intr for "
4459 				      "message %d\n", i);
4460 			goto abort_with_intr;
4461 		}
4462 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4463 				  sc->msix_ih[i], "s%d", i);
4464 	}
4465 
4466 	if (mxge_verbose) {
4467 		device_printf(sc->dev, "using %d msix IRQs:",
4468 			      sc->num_slices);
4469 		for (i = 0; i < sc->num_slices; i++)
4470 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4471 		printf("\n");
4472 	}
4473 	return (0);
4474 
4475 abort_with_intr:
4476 	for (i = 0; i < sc->num_slices; i++) {
4477 		if (sc->msix_ih[i] != NULL) {
4478 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4479 					  sc->msix_ih[i]);
4480 			sc->msix_ih[i] = NULL;
4481 		}
4482 	}
4483 	free(sc->msix_ih, M_DEVBUF);
4484 
4485 abort_with_res:
4486 	for (i = 0; i < sc->num_slices; i++) {
4487 		rid = i + 1;
4488 		if (sc->msix_irq_res[i] != NULL)
4489 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4490 					     sc->msix_irq_res[i]);
4491 		sc->msix_irq_res[i] = NULL;
4492 	}
4493 	free(sc->msix_irq_res, M_DEVBUF);
4494 
4495 abort_with_msix:
4496 	pci_release_msi(sc->dev);
4497 
4498 abort_with_msix_table:
4499 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4500 			     sc->msix_table_res);
4501 
4502 	return err;
4503 }
4504 
4505 static int
4506 mxge_add_single_irq(mxge_softc_t *sc)
4507 {
4508 	int count, err, rid;
4509 
4510 	count = pci_msi_count(sc->dev);
4511 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4512 		rid = 1;
4513 	} else {
4514 		rid = 0;
4515 		sc->legacy_irq = 1;
4516 	}
4517 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4518 					     RF_SHAREABLE | RF_ACTIVE);
4519 	if (sc->irq_res == NULL) {
4520 		device_printf(sc->dev, "could not alloc interrupt\n");
4521 		return ENXIO;
4522 	}
4523 	if (mxge_verbose)
4524 		device_printf(sc->dev, "using %s irq %jd\n",
4525 			      sc->legacy_irq ? "INTx" : "MSI",
4526 			      rman_get_start(sc->irq_res));
4527 	err = bus_setup_intr(sc->dev, sc->irq_res,
4528 			     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4529 			     mxge_intr, &sc->ss[0], &sc->ih);
4530 	if (err != 0) {
4531 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4532 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4533 		if (!sc->legacy_irq)
4534 			pci_release_msi(sc->dev);
4535 	}
4536 	return err;
4537 }
4538 
4539 static void
4540 mxge_rem_msix_irqs(mxge_softc_t *sc)
4541 {
4542 	int i, rid;
4543 
4544 	for (i = 0; i < sc->num_slices; i++) {
4545 		if (sc->msix_ih[i] != NULL) {
4546 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4547 					  sc->msix_ih[i]);
4548 			sc->msix_ih[i] = NULL;
4549 		}
4550 	}
4551 	free(sc->msix_ih, M_DEVBUF);
4552 
4553 	for (i = 0; i < sc->num_slices; i++) {
4554 		rid = i + 1;
4555 		if (sc->msix_irq_res[i] != NULL)
4556 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4557 					     sc->msix_irq_res[i]);
4558 		sc->msix_irq_res[i] = NULL;
4559 	}
4560 	free(sc->msix_irq_res, M_DEVBUF);
4561 
4562 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4563 			     sc->msix_table_res);
4564 
4565 	pci_release_msi(sc->dev);
4566 	return;
4567 }
4568 
4569 static void
4570 mxge_rem_single_irq(mxge_softc_t *sc)
4571 {
4572 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4573 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4574 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4575 	if (!sc->legacy_irq)
4576 		pci_release_msi(sc->dev);
4577 }
4578 
4579 static void
4580 mxge_rem_irq(mxge_softc_t *sc)
4581 {
4582 	if (sc->num_slices > 1)
4583 		mxge_rem_msix_irqs(sc);
4584 	else
4585 		mxge_rem_single_irq(sc);
4586 }
4587 
4588 static int
4589 mxge_add_irq(mxge_softc_t *sc)
4590 {
4591 	int err;
4592 
4593 	if (sc->num_slices > 1)
4594 		err = mxge_add_msix_irqs(sc);
4595 	else
4596 		err = mxge_add_single_irq(sc);
4597 
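	/*
	 * Intentionally disabled (if (0)) path that tears the MSI-X
	 * IRQs down and sets them up again, presumably left over from
	 * exercising the MSI-X allocation/teardown code.
	 */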
4598 	if (0 && err == 0 && sc->num_slices > 1) {
4599 		mxge_rem_msix_irqs(sc);
4600 		err = mxge_add_msix_irqs(sc);
4601 	}
4602 	return err;
4603 }
4604 
4605 static int
4606 mxge_attach(device_t dev)
4607 {
4608 	mxge_cmd_t cmd;
4609 	mxge_softc_t *sc = device_get_softc(dev);
4610 	if_t ifp;
4611 	int err, rid;
4612 
4613 	sc->dev = dev;
4614 	mxge_fetch_tunables(sc);
4615 
4616 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4617 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4618 				  taskqueue_thread_enqueue, &sc->tq);
4619 	if (sc->tq == NULL) {
4620 		err = ENOMEM;
4621 		goto abort_with_nothing;
4622 	}
4623 
4624 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4625 				 1,			/* alignment */
4626 				 0,			/* boundary */
4627 				 BUS_SPACE_MAXADDR,	/* low */
4628 				 BUS_SPACE_MAXADDR,	/* high */
4629 				 NULL, NULL,		/* filter */
4630 				 65536 + 256,		/* maxsize */
4631 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4632 				 65536,			/* maxsegsize */
4633 				 0,			/* flags */
4634 				 NULL, NULL,		/* lock */
4635 				 &sc->parent_dmat);	/* tag */
4636 
4637 	if (err != 0) {
4638 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4639 			      err);
4640 		goto abort_with_tq;
4641 	}
4642 
4643 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4644 	if (ifp == NULL) {
4645 		device_printf(dev, "can not if_alloc()\n");
4646 		err = ENOSPC;
4647 		goto abort_with_parent_dmat;
4648 	}
4649 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4650 
4651 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4652 		 device_get_nameunit(dev));
4653 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4654 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4655 		 "%s:drv", device_get_nameunit(dev));
4656 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4657 		 MTX_NETWORK_LOCK, MTX_DEF);
4658 
4659 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4660 
4661 	mxge_setup_cfg_space(sc);
4662 
4663 	/* Map the board into the kernel */
4664 	rid = PCIR_BARS;
4665 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4666 					     RF_ACTIVE);
4667 	if (sc->mem_res == NULL) {
4668 		device_printf(dev, "could not map memory\n");
4669 		err = ENXIO;
4670 		goto abort_with_lock;
4671 	}
4672 	sc->sram = rman_get_virtual(sc->mem_res);
4673 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4674 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4675 		device_printf(dev, "impossible memory region size %jd\n",
4676 			      rman_get_size(sc->mem_res));
4677 		err = ENXIO;
4678 		goto abort_with_mem_res;
4679 	}
4680 
4681 	/* make a NUL-terminated copy of the EEPROM strings section of
4682 	   the lanai SRAM */
4683 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4684 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4685 				rman_get_bushandle(sc->mem_res),
4686 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4687 				sc->eeprom_strings,
4688 				MXGE_EEPROM_STRINGS_SIZE - 2);
4689 	err = mxge_parse_strings(sc);
4690 	if (err != 0)
4691 		goto abort_with_mem_res;
4692 
4693 	/* Enable write combining for efficient use of PCIe bus */
4694 	mxge_enable_wc(sc);
4695 
4696 	/* Allocate the out of band dma memory */
4697 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4698 			     sizeof (mxge_cmd_t), 64);
4699 	if (err != 0)
4700 		goto abort_with_mem_res;
4701 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4702 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4703 	if (err != 0)
4704 		goto abort_with_cmd_dma;
4705 
4706 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4707 	if (err != 0)
4708 		goto abort_with_zeropad_dma;
4709 
4710 	/* select & load the firmware */
4711 	err = mxge_select_firmware(sc);
4712 	if (err != 0)
4713 		goto abort_with_dmabench;
4714 	sc->intr_coal_delay = mxge_intr_coal_delay;
4715 
4716 	mxge_slice_probe(sc);
4717 	err = mxge_alloc_slices(sc);
4718 	if (err != 0)
4719 		goto abort_with_dmabench;
4720 
4721 	err = mxge_reset(sc, 0);
4722 	if (err != 0)
4723 		goto abort_with_slices;
4724 
4725 	err = mxge_alloc_rings(sc);
4726 	if (err != 0) {
4727 		device_printf(sc->dev, "failed to allocate rings\n");
4728 		goto abort_with_slices;
4729 	}
4730 
4731 	err = mxge_add_irq(sc);
4732 	if (err != 0) {
4733 		device_printf(sc->dev, "failed to add irq\n");
4734 		goto abort_with_rings;
4735 	}
4736 
4737 	if_setbaudrate(ifp, IF_Gbps(10));
4738 	if_setcapabilities(ifp, IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4739 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4740 		IFCAP_RXCSUM_IPV6);
4741 #if defined(INET) || defined(INET6)
4742 	if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
4743 #endif
4744 
4745 #ifdef MXGE_NEW_VLAN_API
4746 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM, 0);
4747 
4748 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4749 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4750 	    sc->fw_ver_tiny >= 32)
4751 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTSO, 0);
4752 #endif
4753 	sc->max_mtu = mxge_max_mtu(sc);
4754 	if (sc->max_mtu >= 9000)
4755 		if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
4756 	else
4757 		device_printf(dev, "MTU limited to %d.  Install "
4758 			      "latest firmware for 9000 byte jumbo support\n",
4759 			      sc->max_mtu - ETHER_HDR_LEN);
4760 	if_sethwassist(ifp, CSUM_TCP | CSUM_UDP | CSUM_TSO);
4761 	if_sethwassistbits(ifp, CSUM_TCP_IPV6 | CSUM_UDP_IPV6, 0);
4762 	/* check to see if f/w supports TSO for IPv6 */
4763 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4764 		if (CSUM_TCP_IPV6)
4765 			if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
4766 		sc->max_tso6_hlen = min(cmd.data0,
4767 					sizeof (sc->ss[0].scratch));
4768 	}
4769 	if_setcapenable(ifp, if_getcapabilities(ifp));
4770 	if (sc->lro_cnt == 0)
4771 		if_setcapenablebit(ifp, 0, IFCAP_LRO);
4772 	if_setinitfn(ifp, mxge_init);
4773 	if_setsoftc(ifp, sc);
4774 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
4775 	if_setioctlfn(ifp, mxge_ioctl);
4776 	if_setstartfn(ifp, mxge_start);
4777 	if_setgetcounterfn(ifp, mxge_get_counter);
4778 	if_sethwtsomax(ifp, IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
4779 	if_sethwtsomaxsegcount(ifp, sc->ss[0].tx.max_desc);
4780 	if_sethwtsomaxsegsize(ifp, IP_MAXPACKET);
4781 	/* Initialise the ifmedia structure */
4782 	ifmedia_init(&sc->media, 0, mxge_media_change,
4783 		     mxge_media_status);
4784 	mxge_media_init(sc);
4785 	mxge_media_probe(sc);
4786 	sc->dying = 0;
4787 	ether_ifattach(ifp, sc->mac_addr);
4788 	/* ether_ifattach sets mtu to ETHERMTU */
4789 	if (mxge_initial_mtu != ETHERMTU)
4790 		mxge_change_mtu(sc, mxge_initial_mtu);
4791 
4792 	mxge_add_sysctls(sc);
4793 	if_settransmitfn(ifp, mxge_transmit);
4794 	if_setqflushfn(ifp, mxge_qflush);
4795 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4796 				device_get_nameunit(sc->dev));
4797 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4798 	return 0;
4799 
4800 abort_with_rings:
4801 	mxge_free_rings(sc);
4802 abort_with_slices:
4803 	mxge_free_slices(sc);
4804 abort_with_dmabench:
4805 	mxge_dma_free(&sc->dmabench_dma);
4806 abort_with_zeropad_dma:
4807 	mxge_dma_free(&sc->zeropad_dma);
4808 abort_with_cmd_dma:
4809 	mxge_dma_free(&sc->cmd_dma);
4810 abort_with_mem_res:
4811 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4812 abort_with_lock:
4813 	pci_disable_busmaster(dev);
4814 	mtx_destroy(&sc->cmd_mtx);
4815 	mtx_destroy(&sc->driver_mtx);
4816 	if_free(ifp);
4817 abort_with_parent_dmat:
4818 	bus_dma_tag_destroy(sc->parent_dmat);
4819 abort_with_tq:
4820 	if (sc->tq != NULL) {
4821 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4822 		taskqueue_free(sc->tq);
4823 		sc->tq = NULL;
4824 	}
4825 abort_with_nothing:
4826 	return err;
4827 }
4828 
4829 static int
4830 mxge_detach(device_t dev)
4831 {
4832 	mxge_softc_t *sc = device_get_softc(dev);
4833 
4834 	if (mxge_vlans_active(sc)) {
4835 		device_printf(sc->dev,
4836 			      "Detach vlans before removing module\n");
4837 		return EBUSY;
4838 	}
4839 	mtx_lock(&sc->driver_mtx);
4840 	sc->dying = 1;
4841 	if (if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING)
4842 		mxge_close(sc, 0);
4843 	mtx_unlock(&sc->driver_mtx);
4844 	ether_ifdetach(sc->ifp);
4845 	if (sc->tq != NULL) {
4846 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4847 		taskqueue_free(sc->tq);
4848 		sc->tq = NULL;
4849 	}
4850 	callout_drain(&sc->co_hdl);
4851 	ifmedia_removeall(&sc->media);
4852 	mxge_dummy_rdma(sc, 0);
4853 	mxge_rem_sysctls(sc);
4854 	mxge_rem_irq(sc);
4855 	mxge_free_rings(sc);
4856 	mxge_free_slices(sc);
4857 	mxge_dma_free(&sc->dmabench_dma);
4858 	mxge_dma_free(&sc->zeropad_dma);
4859 	mxge_dma_free(&sc->cmd_dma);
4860 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4861 	pci_disable_busmaster(dev);
4862 	mtx_destroy(&sc->cmd_mtx);
4863 	mtx_destroy(&sc->driver_mtx);
4864 	if_free(sc->ifp);
4865 	bus_dma_tag_destroy(sc->parent_dmat);
4866 	return 0;
4867 }
4868 
4869 static int
4870 mxge_shutdown(device_t dev)
4871 {
4872 	return 0;
4873 }
4874 
4875 /*
4876   This file uses Myri10GE driver indentation.
4877 
4878   Local Variables:
4879   c-file-style:"linux"
4880   tab-width:8
4881   End:
4882 */
4883