/******************************************************************************
SPDX-License-Identifier: BSD-2-Clause-FreeBSD

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <contrib/zlib/zlib.h>
#include <dev/zlib/zcalloc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#include <sys/buf_ring.h>

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
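/*
 * A minimal sketch of how these defaults are typically overridden
 * (mechanism assumed, not shown in this section): elsewhere in the
 * driver they are expected to be fetched as hw.mxge.* loader
 * tunables, e.g. via TUNABLE_INT_FETCH, so a line such as
 *
 *	hw.mxge.intr_coal_delay=100
 *
 * in /boot/loader.conf would change one of them at boot.
 */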

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
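/*
 * A minimal usage sketch (sizes and alignment assumed, not taken from
 * this section): a caller needing a 64-byte-aligned command response
 * block would allocate and later release it like so:
 *
 *	if (mxge_dma_alloc(sc, &sc->cmd_dma,
 *	    sizeof(mcp_cmd_response_t), 64) != 0)
 *		return (ENOMEM);
 *	...
 *	mxge_dma_free(&sc->cmd_dma);
 *
 * On success, dma->bus_addr holds the single-segment bus address that
 * mxge_dmamap_callback captured during bus_dmamap_load().
 */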

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
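/*
 * For example (values hypothetical), the string area might contain:
 *
 *	SN=458787\0MAC=00:60:dd:43:a1:b2\0PC=10G-PCIE-8A-C\0\0
 *
 * mxge_parse_strings() below walks the blob one NUL-terminated string
 * at a time; an empty string (double NUL) terminates the walk.
 */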

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev(va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev(va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev(va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
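	/*
	 * Worked example with made-up numbers: with len = 4096 and a
	 * result of cmd.data0 = 0x01000200, 0x0100 (256) transfers
	 * completed in 0x0200 (512) half-microsecond ticks, so the
	 * read bandwidth below computes to (256 * 4096 * 2) / 512 =
	 * 4096 MB/s; the factor of 2 converts bytes per 0.5us tick
	 * into bytes per microsecond, i.e. MB/s.
	 */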

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
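/*
 * In short, the selection made below reduces to:
 *
 *	completions known aligned -> fw_name = mxge_fw_aligned
 *	                             ("mxge_eth_z8e"),  tx_boundary = 4096
 *	otherwise                 -> fw_name = mxge_fw_unaligned
 *	                             ("mxge_ethp_z8e"), tx_boundary = 2048
 */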

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = zcalloc_nowait;
	zs.zfree = zcfree;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
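	/*
	 * That convention presumably comes from how the mxge firmware
	 * modules register themselves; a sketch of the registration
	 * (not part of this file, names assumed) would look like:
	 *
	 *	firmware_register("mxge_eth_z8e", z8e_data,
	 *	    z8e_compressed_len, z8e_uncompressed_len, NULL);
	 *
	 * where the fourth argument is the "version" read back here.
	 */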
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL)
		goto abort_with_zs;
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		(void)*sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
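	/*
	 * The expression above rounds the stack buffer up to the next
	 * 8-byte boundary.  For example, if buf_bytes were at address
	 * 0x1003, then 0x1003 + 7 = 0x100a and 0x100a & ~7 = 0x1008:
	 * the first 8-aligned address inside the buffer.  The same
	 * idiom is used in mxge_send_cmd() and mxge_load_firmware().
	 */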

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				      "Using firmware currently running on NIC.  For optimal\n");
			device_printf(sc->dev,
				      "performance consider loading optimized firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

struct mxge_add_maddr_ctx {
	mxge_softc_t *sc;
	int error;
};

static u_int
mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
{
	struct mxge_add_maddr_ctx *ctx = arg;
	mxge_cmd_t cmd;

	if (ctx->error != 0)
		return (0);
	bcopy(LLADDR(sdl), &cmd.data0, 4);
	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
	cmd.data0 = htonl(cmd.data0);
	cmd.data1 = htonl(cmd.data1);

	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);

	return (1);
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	struct mxge_add_maddr_ctx ctx;
	if_t ifp = sc->ifp;
	mxge_cmd_t cmd;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (if_getflags(ifp) & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */
	ctx.sc = sc;
	ctx.error = 0;
	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
	if (ctx.error != 0) {
		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
		    "error status: %d\n", ctx.error);
		/* abort, leaving multicast filtering off */
		return;
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, if_getflags(sc->ifp) & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

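/*
 * Read-only sysctl wrapper for the big-endian firmware counters.
 * This leans on a documented quirk of sysctl_handle_int(): when arg1
 * is NULL, the integer value is taken from arg2 instead.  The handler
 * byte-swaps the counter into arg2 and exports it that way, so
 * userland can read but never write back.
 */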
static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    sc, 0, mxge_change_intr_coal, "I",
	    "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    mxge_change_throttle, "I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "flow_control_enabled",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    mxge_change_flow_control, "I",
	    "enable/disable flow control (pause frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");
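	/*
	 * These land on the per-device sysctl tree, so (unit number
	 * assumed) they can be inspected and tuned from userland with
	 * sysctl(8), e.g.:
	 *
	 *	sysctl dev.mxge.0.intr_coal_delay
	 *	sysctl dev.mxge.0.flow_control_enabled=0
	 */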

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
	    "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
	    "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_link_error_or_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
	    "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_link_overflow",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
	    "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_multicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
	    "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_no_big_buffer",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
	    "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_no_small_buffer",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
	    "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_overrun",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
	    "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_unicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
	    "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
			       0, "number of bad csums preventing LRO");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints+=3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints+=3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	wmb();
}

static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->ip_hlen = pi->ip->ip_hl << 2;
		if (!tso)
			return 0;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		nxt = 0;
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1823 
1824 	/* TSO implies checksum offload on this hardware */
1825 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1826 		/*
1827 		 * If packet has full TCP csum, replace it with pseudo hdr
1828 		 * sum that the NIC expects, otherwise the NIC will emit
1829 		 * packets with bad TCP checksums.
1830 		 */
1831 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1832 		if (pi->ip6) {
1833 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1834 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1835 			sum = in6_cksum_pseudo(pi->ip6,
1836 			    m->m_pkthdr.len - cksum_offset,
1837 			    IPPROTO_TCP, 0);
1838 #endif
1839 		} else {
1840 #ifdef INET
1841 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1842 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1843 			    pi->ip->ip_dst.s_addr,
1844 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1845 				    cksum_offset)));
1846 #endif
1847 		}
1848 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1849 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1850 	}
1851 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1852 
1853 	/* for TSO, pseudo_hdr_offset holds mss.
1854 	 * The firmware figures out where to put
1855 	 * the checksum by parsing the header. */
1856 	pseudo_hdr_offset = htobe16(mss);
1857 
1858 	if (pi->ip6) {
1859 		/*
1860 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1861 		 * to store the TCP header len
1862 		 */
1863 		cksum_offset = (pi->tcp->th_off << 2);
1864 	}
1865 
1866 	tx = &ss->tx;
1867 	req = tx->req_list;
1868 	seg = tx->seg_list;
1869 	cnt = 0;
1870 	rdma_count = 0;
1871 	/* "rdma_count" is the number of RDMAs belonging to the
1872 	 * current packet BEFORE the current send request. For
1873 	 * non-TSO packets, this is equal to "count".
1874 	 * For TSO packets, rdma_count needs to be reset
1875 	 * to 0 after a segment cut.
1876 	 *
1877 	 * The rdma_count field of the send request is
1878 	 * the number of RDMAs of the packet starting at
1879 	 * that request. For TSO send requests with one or more cuts
1880 	 * in the middle, this is the number of RDMAs starting
1881 	 * after the last cut in the request. All previous
1882 	 * segments before the last cut implicitly have 1 RDMA.
1883 	 *
1884 	 * Since the number of RDMAs is not known beforehand,
1885 	 * it must be filled-in retroactively - after each
1886 	 * segmentation cut or at the end of the entire packet.
1887 	 */
1888 
1889 	while (busdma_seg_cnt) {
1890 		/* Break the busdma segment up into pieces*/
1891 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1892 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1893 		len = seg->ds_len;
1894 
1895 		while (len) {
1896 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1897 			seglen = len;
1898 			cum_len_next = cum_len + seglen;
1899 			(req-rdma_count)->rdma_count = rdma_count + 1;
1900 			if (__predict_true(cum_len >= 0)) {
1901 				/* payload */
1902 				chop = (cum_len_next > mss);
1903 				cum_len_next = cum_len_next % mss;
1904 				next_is_first = (cum_len_next == 0);
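				/*
				 * Branchless bookkeeping: "chop" means this
				 * descriptor crosses an mss boundary and
				 * must be cut; "next_is_first" means the cut
				 * lands exactly on the boundary.  OR-ing with
				 * the negation forces rdma_count to -1 on any
				 * cut, so the rdma_count++ below restarts the
				 * count for the next TSO segment.
				 */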
1905 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1906 				flags_next |= next_is_first *
1907 					MXGEFW_FLAGS_FIRST;
1908 				rdma_count |= -(chop | next_is_first);
1909 				rdma_count += chop & !next_is_first;
1910 			} else if (cum_len_next >= 0) {
1911 				/* header ends */
1912 				rdma_count = -1;
1913 				cum_len_next = 0;
1914 				seglen = -cum_len;
1915 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1916 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1917 					MXGEFW_FLAGS_FIRST |
1918 					(small * MXGEFW_FLAGS_SMALL);
1919 			}
1920 
1921 			req->addr_high = high_swapped;
1922 			req->addr_low = htobe32(low);
1923 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1924 			req->pad = 0;
1925 			req->rdma_count = 1;
1926 			req->length = htobe16(seglen);
1927 			req->cksum_offset = cksum_offset;
1928 			req->flags = flags | ((cum_len & 1) *
1929 					      MXGEFW_FLAGS_ALIGN_ODD);
1930 			low += seglen;
1931 			len -= seglen;
1932 			cum_len = cum_len_next;
1933 			flags = flags_next;
1934 			req++;
1935 			cnt++;
1936 			rdma_count++;
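			/* for IPv4, walk cksum_offset down as header bytes
			   are consumed; once it reaches zero the remaining
			   descriptors carry no checksum start offset */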
1937 			if (cksum_offset != 0 && !pi->ip6) {
1938 				if (__predict_false(cksum_offset > seglen))
1939 					cksum_offset -= seglen;
1940 				else
1941 					cksum_offset = 0;
1942 			}
1943 			if (__predict_false(cnt > tx->max_desc))
1944 				goto drop;
1945 		}
1946 		busdma_seg_cnt--;
1947 		seg++;
1948 	}
1949 	(req-rdma_count)->rdma_count = rdma_count;
1950 
1951 	do {
1952 		req--;
1953 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1954 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1955 
1956 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1957 	mxge_submit_req(tx, tx->req_list, cnt);
1958 
1959 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1960 		/* tell the NIC to start polling this slice */
1961 		*tx->send_go = 1;
1962 		tx->queue_active = 1;
1963 		tx->activate++;
1964 		wmb();
1965 	}
1966 
1967 	return;
1968 
1969 drop:
1970 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1971 	m_freem(m);
1972 	ss->oerrors++;
1973 	if (!once) {
1974 		printf("tx->max_desc exceeded via TSO!\n");
1975 		printf("mss = %d, %ld, %d!\n", mss,
1976 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1977 		once = 1;
1978 	}
1979 	return;
1980 
1981 }
1982 
1983 #endif /* IFCAP_TSO4 */
1984 
1985 #ifdef MXGE_NEW_VLAN_API
1986 /*
1987  * We reproduce the software vlan tag insertion from
1988  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1989  * vlan tag insertion. We need to advertise this in order to have the
1990  * vlan interface respect our csum offload flags.
1991  */
1992 static struct mbuf *
1993 mxge_vlan_tag_insert(struct mbuf *m)
1994 {
1995 	struct ether_vlan_header *evl;
1996 
1997 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
1998 	if (__predict_false(m == NULL))
1999 		return NULL;
2000 	if (m->m_len < sizeof(*evl)) {
2001 		m = m_pullup(m, sizeof(*evl));
2002 		if (__predict_false(m == NULL))
2003 			return NULL;
2004 	}
2005 	/*
2006 	 * Transform the Ethernet header into an Ethernet header
2007 	 * with 802.1Q encapsulation.
2008 	 */
2009 	evl = mtod(m, struct ether_vlan_header *);
2010 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2011 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2012 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2013 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2014 	m->m_flags &= ~M_VLANTAG;
2015 	return m;
2016 }
2017 #endif /* MXGE_NEW_VLAN_API */
2018 
2019 static void
2020 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2021 {
2022 	struct mxge_pkt_info pi = {0,0,0,0};
2023 	mxge_softc_t *sc;
2024 	mcp_kreq_ether_send_t *req;
2025 	bus_dma_segment_t *seg;
2026 	struct mbuf *m_tmp;
2027 	mxge_tx_ring_t *tx;
2028 	int cnt, cum_len, err, i, idx, odd_flag;
2029 	uint16_t pseudo_hdr_offset;
2030 	uint8_t flags, cksum_offset;
2031 
2032 	sc = ss->sc;
2033 	tx = &ss->tx;
2034 
2035 #ifdef MXGE_NEW_VLAN_API
2036 	if (m->m_flags & M_VLANTAG) {
2037 		m = mxge_vlan_tag_insert(m);
2038 		if (__predict_false(m == NULL))
2039 			goto drop_without_m;
2040 	}
2041 #endif
2042 	if (m->m_pkthdr.csum_flags &
2043 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2044 		if (mxge_parse_tx(ss, m, &pi))
2045 			goto drop;
2046 	}
2047 
2048 	/* (try to) map the frame for DMA */
2049 	idx = tx->req & tx->mask;
2050 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2051 				      m, tx->seg_list, &cnt,
2052 				      BUS_DMA_NOWAIT);
2053 	if (__predict_false(err == EFBIG)) {
2054 		/* Too many segments in the chain.  Try
2055 		   to defrag */
2056 		m_tmp = m_defrag(m, M_NOWAIT);
2057 		if (m_tmp == NULL) {
2058 			goto drop;
2059 		}
2060 		ss->tx.defrag++;
2061 		m = m_tmp;
2062 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2063 					      tx->info[idx].map,
2064 					      m, tx->seg_list, &cnt,
2065 					      BUS_DMA_NOWAIT);
2066 	}
2067 	if (__predict_false(err != 0)) {
2068 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2069 			      " packet len = %d\n", err, m->m_pkthdr.len);
2070 		goto drop;
2071 	}
2072 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2073 			BUS_DMASYNC_PREWRITE);
2074 	tx->info[idx].m = m;
2075 
2076 #if IFCAP_TSO4
2077 	/* TSO is different enough, we handle it in another routine */
2078 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2079 		mxge_encap_tso(ss, m, cnt, &pi);
2080 		return;
2081 	}
2082 #endif
2083 
2084 	req = tx->req_list;
2085 	cksum_offset = 0;
2086 	pseudo_hdr_offset = 0;
2087 	flags = MXGEFW_FLAGS_NO_TSO;
2088 
2089 	/* checksum offloading? */
2090 	if (m->m_pkthdr.csum_flags &
2091 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2092 		/* ensure ip header is in first mbuf, copy
2093 		   it to a scratch buffer if not */
2094 		cksum_offset = pi.ip_off + pi.ip_hlen;
2095 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2096 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
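		/* pseudo_hdr_offset now names the byte where the L4
		   checksum field lives (csum_data is its offset within
		   the TCP/UDP header); the NIC stores its computed
		   sum there */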
2097 		req->cksum_offset = cksum_offset;
2098 		flags |= MXGEFW_FLAGS_CKSUM;
2099 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2100 	} else {
2101 		odd_flag = 0;
2102 	}
2103 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2104 		flags |= MXGEFW_FLAGS_SMALL;
2105 
2106 	/* convert segments into a request list */
2107 	cum_len = 0;
2108 	seg = tx->seg_list;
2109 	req->flags = MXGEFW_FLAGS_FIRST;
2110 	for (i = 0; i < cnt; i++) {
2111 		req->addr_low =
2112 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2113 		req->addr_high =
2114 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2115 		req->length = htobe16(seg->ds_len);
2116 		req->cksum_offset = cksum_offset;
2117 		if (cksum_offset > seg->ds_len)
2118 			cksum_offset -= seg->ds_len;
2119 		else
2120 			cksum_offset = 0;
2121 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2122 		req->pad = 0; /* complete solid 16-byte block */
2123 		req->rdma_count = 1;
2124 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2125 		cum_len += seg->ds_len;
2126 		seg++;
2127 		req++;
2128 		req->flags = 0;
2129 	}
2130 	req--;
2131 	/* pad runts to 60 bytes; the NIC's 4-byte CRC completes the 64-byte minimum */
2132 	if (cum_len < 60) {
2133 		req++;
2134 		req->addr_low =
2135 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2136 		req->addr_high =
2137 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2138 		req->length = htobe16(60 - cum_len);
2139 		req->cksum_offset = 0;
2140 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2141 		req->pad = 0; /* complete solid 16-byte block */
2142 		req->rdma_count = 1;
2143 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2144 		cnt++;
2145 	}
2146 
2147 	tx->req_list[0].rdma_count = cnt;
2148 #if 0
2149 	/* print what the firmware will see */
2150 	for (i = 0; i < cnt; i++) {
2151 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2152 		    "cso:%d, flags:0x%x, rdma:%d\n",
2153 		    i, (int)ntohl(tx->req_list[i].addr_high),
2154 		    (int)ntohl(tx->req_list[i].addr_low),
2155 		    (int)ntohs(tx->req_list[i].length),
2156 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2157 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2158 		    tx->req_list[i].rdma_count);
2159 	}
2160 	printf("--------------\n");
2161 #endif
2162 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2163 	mxge_submit_req(tx, tx->req_list, cnt);
2164 
2165 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2166 		/* tell the NIC to start polling this slice */
2167 		*tx->send_go = 1;
2168 		tx->queue_active = 1;
2169 		tx->activate++;
2170 		wmb();
2171 	}
2172 
2173 	return;
2174 
2175 drop:
2176 	m_freem(m);
2177 drop_without_m:
2178 	ss->oerrors++;
2179 	return;
2180 }
2181 
2182 static void
2183 mxge_qflush(if_t ifp)
2184 {
2185 	mxge_softc_t *sc = if_getsoftc(ifp);
2186 	mxge_tx_ring_t *tx;
2187 	struct mbuf *m;
2188 	int slice;
2189 
2190 	for (slice = 0; slice < sc->num_slices; slice++) {
2191 		tx = &sc->ss[slice].tx;
2192 		mtx_lock(&tx->mtx);
2193 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2194 			m_freem(m);
2195 		mtx_unlock(&tx->mtx);
2196 	}
2197 	if_qflush(ifp);
2198 }
2199 
2200 static inline void
2201 mxge_start_locked(struct mxge_slice_state *ss)
2202 {
2203 	mxge_softc_t *sc;
2204 	struct mbuf *m;
2205 	if_t ifp;
2206 	mxge_tx_ring_t *tx;
2207 
2208 	sc = ss->sc;
2209 	ifp = sc->ifp;
2210 	tx = &ss->tx;
2211 
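	/* stop dequeuing when fewer than max_desc ring slots remain
	   free, so a worst-case packet can always be encapsulated */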
2212 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2213 		m = drbr_dequeue(ifp, tx->br);
2214 		if (m == NULL) {
2215 			return;
2216 		}
2217 		/* let BPF see it */
2218 		BPF_MTAP(ifp, m);
2219 
2220 		/* give it to the nic */
2221 		mxge_encap(ss, m);
2222 	}
2223 	/* ran out of transmit slots */
2224 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2225 	    && (!drbr_empty(ifp, tx->br))) {
2226 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2227 		tx->stall++;
2228 	}
2229 }
2230 
2231 static int
2232 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2233 {
2234 	mxge_softc_t *sc;
2235 	if_t ifp;
2236 	mxge_tx_ring_t *tx;
2237 	int err;
2238 
2239 	sc = ss->sc;
2240 	ifp = sc->ifp;
2241 	tx = &ss->tx;
2242 
2243 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2244 	    IFF_DRV_RUNNING) {
2245 		err = drbr_enqueue(ifp, tx->br, m);
2246 		return (err);
2247 	}
2248 
2249 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2250 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2251 		/* let BPF see it */
2252 		BPF_MTAP(ifp, m);
2253 		/* give it to the nic */
2254 		mxge_encap(ss, m);
2255 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2256 		return (err);
2257 	}
2258 	if (!drbr_empty(ifp, tx->br))
2259 		mxge_start_locked(ss);
2260 	return (0);
2261 }
2262 
2263 static int
2264 mxge_transmit(if_t ifp, struct mbuf *m)
2265 {
2266 	mxge_softc_t *sc = if_getsoftc(ifp);
2267 	struct mxge_slice_state *ss;
2268 	mxge_tx_ring_t *tx;
2269 	int err = 0;
2270 	int slice;
2271 
2272 	slice = m->m_pkthdr.flowid;
2273 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2274 
2275 	ss = &sc->ss[slice];
2276 	tx = &ss->tx;
2277 
2278 	if (mtx_trylock(&tx->mtx)) {
2279 		err = mxge_transmit_locked(ss, m);
2280 		mtx_unlock(&tx->mtx);
2281 	} else {
2282 		err = drbr_enqueue(ifp, tx->br, m);
2283 	}
2284 
2285 	return (err);
2286 }
2287 
2288 static void
2289 mxge_start(if_t ifp)
2290 {
2291 	mxge_softc_t *sc = if_getsoftc(ifp);
2292 	struct mxge_slice_state *ss;
2293 
2294 	/* only use the first slice for now */
2295 	ss = &sc->ss[0];
2296 	mtx_lock(&ss->tx.mtx);
2297 	mxge_start_locked(ss);
2298 	mtx_unlock(&ss->tx.mtx);
2299 }
2300 
2301 /*
2302  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2303  * at most 32 bytes at a time, so as to avoid involving the software
2304  * pio handler in the nic.   We re-write the first segment's low
2305  * DMA address to mark it valid only after we write the entire chunk
2306  * in a burst
2307  */
2308 static inline void
2309 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2310 		mcp_kreq_ether_recv_t *src)
2311 {
2312 	uint32_t low;
2313 
2314 	low = src->addr_low;
2315 	src->addr_low = 0xffffffff;
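	/* 0xffffffff is the "not yet valid" sentinel for addr_low (the
	   rings are stocked with it in mxge_slice_open()), so the NIC
	   ignores these 8 entries until the real address is restored
	   below */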
2316 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2317 	wmb();
2318 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2319 	wmb();
2320 	src->addr_low = low;
2321 	dst->addr_low = low;
2322 	wmb();
2323 }
2324 
2325 static int
2326 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2327 {
2328 	bus_dma_segment_t seg;
2329 	struct mbuf *m;
2330 	mxge_rx_ring_t *rx = &ss->rx_small;
2331 	int cnt, err;
2332 
2333 	m = m_gethdr(M_NOWAIT, MT_DATA);
2334 	if (m == NULL) {
2335 		rx->alloc_fail++;
2336 		err = ENOBUFS;
2337 		goto done;
2338 	}
2339 	m->m_len = MHLEN;
2340 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2341 				      &seg, &cnt, BUS_DMA_NOWAIT);
2342 	if (err != 0) {
2343 		m_free(m);
2344 		goto done;
2345 	}
2346 	rx->info[idx].m = m;
2347 	rx->shadow[idx].addr_low =
2348 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2349 	rx->shadow[idx].addr_high =
2350 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2351 
2352 done:
2353 	if ((idx & 7) == 7)
2354 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2355 	return err;
2356 }
2357 
2358 static int
2359 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2360 {
2361 	bus_dma_segment_t seg[3];
2362 	struct mbuf *m;
2363 	mxge_rx_ring_t *rx = &ss->rx_big;
2364 	int cnt, err, i;
2365 
2366 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2367 	if (m == NULL) {
2368 		rx->alloc_fail++;
2369 		err = ENOBUFS;
2370 		goto done;
2371 	}
2372 	m->m_len = rx->mlen;
2373 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2374 				      seg, &cnt, BUS_DMA_NOWAIT);
2375 	if (err != 0) {
2376 		m_free(m);
2377 		goto done;
2378 	}
2379 	rx->info[idx].m = m;
2380 	rx->shadow[idx].addr_low =
2381 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2382 	rx->shadow[idx].addr_high =
2383 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2384 
2385 done:
2386 	for (i = 0; i < rx->nbufs; i++) {
2387 		if ((idx & 7) == 7) {
2388 			mxge_submit_8rx(&rx->lanai[idx - 7],
2389 					&rx->shadow[idx - 7]);
2390 		}
2391 		idx++;
2392 	}
2393 	return err;
2394 }
2395 
2396 #ifdef INET6
2397 
2398 static uint16_t
2399 mxge_csum_generic(uint16_t *raw, int len)
2400 {
2401 	uint32_t csum;
2402 
2403 	csum = 0;
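	/* simple 16-bit one's-complement sum, folded twice; the caller
	   passes even lengths (IPv6 header chains are multiples of 8
	   bytes), so the 2-byte stride never over-reads */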
2404 	while (len > 0) {
2405 		csum += *raw;
2406 		raw++;
2407 		len -= 2;
2408 	}
2409 	csum = (csum >> 16) + (csum & 0xffff);
2410 	csum = (csum >> 16) + (csum & 0xffff);
2411 	return (uint16_t)csum;
2412 }
2413 
2414 static inline uint16_t
2415 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2416 {
2417 	uint32_t partial;
2418 	int nxt, cksum_offset;
2419 	struct ip6_hdr *ip6 = p;
2420 	uint16_t c;
2421 
2422 	nxt = ip6->ip6_nxt;
2423 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2424 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2425 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2426 					   IPPROTO_IPV6, &nxt);
2427 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2428 			return (1);
2429 	}
2430 
2431 	/*
2432 	 * IPv6 headers do not contain a checksum, and hence
2433 	 * do not checksum to zero, so they don't "fall out"
2434 	 * of the partial checksum calculation like IPv4
2435 	 * headers do.  We need to fix the partial checksum by
2436 	 * subtracting the checksum of the IPv6 header.
2437 	 */
2438 
2439 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2440 				    ETHER_HDR_LEN);
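	/*
	 * One's-complement subtraction: adding ~partial subtracts the
	 * header sum, and the comparison below re-adds the end-around
	 * carry whenever the 32-bit addition wraps.
	 */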
2441 	csum += ~partial;
2442 	csum += (csum < ~partial);
2443 	csum = (csum >> 16) + (csum & 0xFFFF);
2444 	csum = (csum >> 16) + (csum & 0xFFFF);
2445 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2446 			     csum);
2447 	c ^= 0xffff;
2448 	return (c);
2449 }
2450 #endif /* INET6 */
2451 /*
2452  *  Myri10GE hardware checksums are not valid if the sender
2453  *  padded the frame with non-zero padding.  This is because
2454  *  the firmware just does a simple 16-bit 1s complement
2455  *  checksum across the entire frame, excluding the first 14
2456  *  bytes.  It is best to simply check the checksum and
2457  *  tell the stack about it only if the checksum is good.
2458  */
2459 
2460 static inline uint16_t
2461 mxge_rx_csum(struct mbuf *m, int csum)
2462 {
2463 	struct ether_header *eh;
2464 #ifdef INET
2465 	struct ip *ip;
2466 #endif
2467 #if defined(INET) || defined(INET6)
2468 	int cap = if_getcapenable(m->m_pkthdr.rcvif);
2469 #endif
2470 	uint16_t c, etype;
2471 
2472 	eh = mtod(m, struct ether_header *);
2473 	etype = ntohs(eh->ether_type);
2474 	switch (etype) {
2475 #ifdef INET
2476 	case ETHERTYPE_IP:
2477 		if ((cap & IFCAP_RXCSUM) == 0)
2478 			return (1);
2479 		ip = (struct ip *)(eh + 1);
2480 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2481 			return (1);
2482 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2483 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2484 				    (ip->ip_hl << 2) + ip->ip_p));
2485 		c ^= 0xffff;
2486 		break;
2487 #endif
2488 #ifdef INET6
2489 	case ETHERTYPE_IPV6:
2490 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2491 			return (1);
2492 		c = mxge_rx_csum6((eh + 1), m, csum);
2493 		break;
2494 #endif
2495 	default:
2496 		c = 1;
2497 	}
2498 	return (c);
2499 }
2500 
2501 static void
2502 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2503 {
2504 	struct ether_vlan_header *evl;
2505 	uint32_t partial;
2506 
2507 	evl = mtod(m, struct ether_vlan_header *);
2508 
2509 	/*
2510 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2511 	 * after what the firmware thought was the end of the ethernet
2512 	 * header.
2513 	 */
2514 
2515 	/* put checksum into host byte order */
2516 	*csum = ntohs(*csum);
2517 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
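	/* same one's-complement subtract-with-end-around-carry as in
	   mxge_rx_csum6() above, here removing the 4 VLAN tag bytes,
	   followed by a fold back down to 16 bits */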
2518 	(*csum) += ~partial;
2519 	(*csum) += ((*csum) < ~partial);
2520 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2521 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2522 
2523 	/* restore checksum to network byte order;
2524 	   later consumers expect this */
2525 	*csum = htons(*csum);
2526 
2527 	/* save the tag */
2528 #ifdef MXGE_NEW_VLAN_API
2529 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2530 #else
2531 	{
2532 		struct m_tag *mtag;
2533 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2534 				   M_NOWAIT);
2535 		if (mtag == NULL)
2536 			return;
2537 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2538 		m_tag_prepend(m, mtag);
2539 	}
2540 
2541 #endif
2542 	m->m_flags |= M_VLANTAG;
2543 
2544 	/*
2545 	 * Remove the 802.1q header by copying the Ethernet
2546 	 * addresses over it and adjusting the beginning of
2547 	 * the data in the mbuf.  The encapsulated Ethernet
2548 	 * type field is already in place.
2549 	 */
2550 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2551 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2552 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2553 }
2554 
2555 static inline void
2556 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2557 		 uint32_t csum, int lro)
2558 {
2559 	mxge_softc_t *sc;
2560 	if_t ifp;
2561 	struct mbuf *m;
2562 	struct ether_header *eh;
2563 	mxge_rx_ring_t *rx;
2564 	bus_dmamap_t old_map;
2565 	int idx;
2566 
2567 	sc = ss->sc;
2568 	ifp = sc->ifp;
2569 	rx = &ss->rx_big;
2570 	idx = rx->cnt & rx->mask;
2571 	rx->cnt += rx->nbufs;
2572 	/* save a pointer to the received mbuf */
2573 	m = rx->info[idx].m;
2574 	/* try to replace the received mbuf */
2575 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2576 		/* drop the frame -- the old mbuf is re-cycled */
2577 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2578 		return;
2579 	}
2580 
2581 	/* unmap the received buffer */
2582 	old_map = rx->info[idx].map;
2583 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2584 	bus_dmamap_unload(rx->dmat, old_map);
2585 
2586 	/* swap the bus_dmamap_t's */
2587 	rx->info[idx].map = rx->extra_map;
2588 	rx->extra_map = old_map;
2589 
2590 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2591 	 * aligned */
2592 	m->m_data += MXGEFW_PAD;
2593 
2594 	m->m_pkthdr.rcvif = ifp;
2595 	m->m_len = m->m_pkthdr.len = len;
2596 	ss->ipackets++;
2597 	eh = mtod(m, struct ether_header *);
2598 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2599 		mxge_vlan_tag_remove(m, &csum);
2600 	}
2601 	/* flowid only valid if RSS hashing is enabled */
2602 	if (sc->num_slices > 1) {
2603 		m->m_pkthdr.flowid = (ss - sc->ss);
2604 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2605 	}
2606 	/* if the checksum is valid, mark it in the mbuf header */
2607 	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2608 	    (0 == mxge_rx_csum(m, csum))) {
2609 		/* Tell the stack that the checksum is good */
2610 		m->m_pkthdr.csum_data = 0xffff;
2611 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2612 			CSUM_DATA_VALID;
2613 
2614 #if defined(INET) || defined (INET6)
2615 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2616 			return;
2617 #endif
2618 	}
2619 	/* pass the frame up the stack */
2620 	if_input(ifp, m);
2621 }
2622 
2623 static inline void
2624 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2625 		   uint32_t csum, int lro)
2626 {
2627 	mxge_softc_t *sc;
2628 	if_t ifp;
2629 	struct ether_header *eh;
2630 	struct mbuf *m;
2631 	mxge_rx_ring_t *rx;
2632 	bus_dmamap_t old_map;
2633 	int idx;
2634 
2635 	sc = ss->sc;
2636 	ifp = sc->ifp;
2637 	rx = &ss->rx_small;
2638 	idx = rx->cnt & rx->mask;
2639 	rx->cnt++;
2640 	/* save a pointer to the received mbuf */
2641 	m = rx->info[idx].m;
2642 	/* try to replace the received mbuf */
2643 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2644 		/* drop the frame -- the old mbuf is re-cycled */
2645 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2646 		return;
2647 	}
2648 
2649 	/* unmap the received buffer */
2650 	old_map = rx->info[idx].map;
2651 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2652 	bus_dmamap_unload(rx->dmat, old_map);
2653 
2654 	/* swap the bus_dmamap_t's */
2655 	rx->info[idx].map = rx->extra_map;
2656 	rx->extra_map = old_map;
2657 
2658 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2659 	 * aligned */
2660 	m->m_data += MXGEFW_PAD;
2661 
2662 	m->m_pkthdr.rcvif = ifp;
2663 	m->m_len = m->m_pkthdr.len = len;
2664 	ss->ipackets++;
2665 	eh = mtod(m, struct ether_header *);
2666 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2667 		mxge_vlan_tag_remove(m, &csum);
2668 	}
2669 	/* flowid only valid if RSS hashing is enabled */
2670 	if (sc->num_slices > 1) {
2671 		m->m_pkthdr.flowid = (ss - sc->ss);
2672 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2673 	}
2674 	/* if the checksum is valid, mark it in the mbuf header */
2675 	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2676 	    (0 == mxge_rx_csum(m, csum))) {
2677 		/* Tell the stack that the checksum is good */
2678 		m->m_pkthdr.csum_data = 0xffff;
2679 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2680 			CSUM_DATA_VALID;
2681 
2682 #if defined(INET) || defined (INET6)
2683 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2684 			return;
2685 #endif
2686 	}
2687 	/* pass the frame up the stack */
2688 	if_input(ifp, m);
2689 }
2690 
2691 static inline void
2692 mxge_clean_rx_done(struct mxge_slice_state *ss)
2693 {
2694 	mxge_rx_done_t *rx_done = &ss->rx_done;
2695 	int limit = 0;
2696 	uint16_t length;
2697 	uint16_t checksum;
2698 	int lro;
2699 
2700 	lro = if_getcapenable(ss->sc->ifp) & IFCAP_LRO;
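	/* a non-zero length marks a completion the firmware has DMA'ed
	   up to the host; zeroing it hands the slot back for the
	   firmware's next lap around the ring */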
2701 	while (rx_done->entry[rx_done->idx].length != 0) {
2702 		length = ntohs(rx_done->entry[rx_done->idx].length);
2703 		rx_done->entry[rx_done->idx].length = 0;
2704 		checksum = rx_done->entry[rx_done->idx].checksum;
2705 		if (length <= (MHLEN - MXGEFW_PAD))
2706 			mxge_rx_done_small(ss, length, checksum, lro);
2707 		else
2708 			mxge_rx_done_big(ss, length, checksum, lro);
2709 		rx_done->cnt++;
2710 		rx_done->idx = rx_done->cnt & rx_done->mask;
2711 
2712 		/* limit potential for livelock */
2713 		if (__predict_false(++limit > rx_done->mask / 2))
2714 			break;
2715 	}
2716 #if defined(INET)  || defined (INET6)
2717 	tcp_lro_flush_all(&ss->lc);
2718 #endif
2719 }
2720 
2721 static inline void
2722 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2723 {
2724 	if_t ifp __unused;
2725 	mxge_tx_ring_t *tx;
2726 	struct mbuf *m;
2727 	bus_dmamap_t map;
2728 	int idx;
2729 	int *flags;
2730 
2731 	tx = &ss->tx;
2732 	ifp = ss->sc->ifp;
2733 	while (tx->pkt_done != mcp_idx) {
2734 		idx = tx->done & tx->mask;
2735 		tx->done++;
2736 		m = tx->info[idx].m;
2737 		/* mbuf and DMA map only attached to the first
2738 		   segment per-mbuf */
2739 		if (m != NULL) {
2740 			ss->obytes += m->m_pkthdr.len;
2741 			if (m->m_flags & M_MCAST)
2742 				ss->omcasts++;
2743 			ss->opackets++;
2744 			tx->info[idx].m = NULL;
2745 			map = tx->info[idx].map;
2746 			bus_dmamap_unload(tx->dmat, map);
2747 			m_freem(m);
2748 		}
2749 		if (tx->info[idx].flag) {
2750 			tx->info[idx].flag = 0;
2751 			tx->pkt_done++;
2752 		}
2753 	}
2754 
2755 	/* If we have space, clear IFF_DRV_OACTIVE to tell the stack
2756 	   that it's OK to send packets */
2757 	flags = &ss->if_drv_flags;
2758 	mtx_lock(&ss->tx.mtx);
2759 	if ((*flags) & IFF_DRV_OACTIVE &&
2760 	    tx->req - tx->done < (tx->mask + 1)/4) {
2761 		*(flags) &= ~IFF_DRV_OACTIVE;
2762 		ss->tx.wake++;
2763 		mxge_start_locked(ss);
2764 	}
2765 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2766 		/* let the NIC stop polling this queue, since there
2767 		 * are no more transmits pending */
2768 		if (tx->req == tx->done) {
2769 			*tx->send_stop = 1;
2770 			tx->queue_active = 0;
2771 			tx->deactivate++;
2772 			wmb();
2773 		}
2774 	}
2775 	mtx_unlock(&ss->tx.mtx);
2776 }
2777 
2778 static struct mxge_media_type mxge_xfp_media_types[] =
2779 {
2780 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2781 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2782 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2783 	{0,		(1 << 5),	"10GBASE-ER"},
2784 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2785 	{0,		(1 << 3),	"10GBASE-SW"},
2786 	{0,		(1 << 2),	"10GBASE-LW"},
2787 	{0,		(1 << 1),	"10GBASE-EW"},
2788 	{0,		(1 << 0),	"Reserved"}
2789 };
2790 static struct mxge_media_type mxge_sfp_media_types[] =
2791 {
2792 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2793 	{0,		(1 << 7),	"Reserved"},
2794 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2795 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2796 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2797 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2798 };
2799 
2800 static void
2801 mxge_media_set(mxge_softc_t *sc, int media_type)
2802 {
2803 
2804 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2805 		    0, NULL);
2806 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2807 	sc->current_media = media_type;
2808 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2809 }
2810 
2811 static void
2812 mxge_media_init(mxge_softc_t *sc)
2813 {
2814 	char *ptr;
2815 	int i;
2816 
2817 	ifmedia_removeall(&sc->media);
2818 	mxge_media_set(sc, IFM_AUTO);
2819 
2820 	/*
2821 	 * parse the product code to determine the interface type
2822 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2823 	 * after the 3rd dash in the driver's cached copy of the
2824 	 * EEPROM's product code string.
2825 	 */
2826 	ptr = sc->product_code_string;
2827 	if (ptr == NULL) {
2828 		device_printf(sc->dev, "Missing product code\n");
2829 		return;
2830 	}
2831 
2832 	for (i = 0; i < 3; i++, ptr++) {
2833 		ptr = strchr(ptr, '-');
2834 		if (ptr == NULL) {
2835 			device_printf(sc->dev,
2836 				      "only %d dashes in PC?!?\n", i);
2837 			return;
2838 		}
2839 	}
2840 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2841 		/* -C is CX4 */
2842 		sc->connector = MXGE_CX4;
2843 		mxge_media_set(sc, IFM_10G_CX4);
2844 	} else if (*ptr == 'Q') {
2845 		/* -Q is Quad Ribbon Fiber */
2846 		sc->connector = MXGE_QRF;
2847 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2848 		/* FreeBSD has no media type for Quad ribbon fiber */
2849 	} else if (*ptr == 'R') {
2850 		/* -R is XFP */
2851 		sc->connector = MXGE_XFP;
2852 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2853 		/* -S or -2S is SFP+ */
2854 		sc->connector = MXGE_SFP;
2855 	} else {
2856 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2857 	}
2858 }
2859 
2860 /*
2861  * Determine the media type for a NIC.  Some XFPs will identify
2862  * themselves only when their link is up, so this is initiated via a
2863  * link up interrupt.  However, this can potentially take up to
2864  * several milliseconds, so it is run via the watchdog routine, rather
2865  * than in the interrupt handler itself.
2866  */
2867 static void
2868 mxge_media_probe(mxge_softc_t *sc)
2869 {
2870 	mxge_cmd_t cmd;
2871 	char *cage_type;
2872 
2873 	struct mxge_media_type *mxge_media_types = NULL;
2874 	int i, err, ms, mxge_media_type_entries;
2875 	uint32_t byte;
2876 
2877 	sc->need_media_probe = 0;
2878 
2879 	if (sc->connector == MXGE_XFP) {
2880 		/* -R is XFP */
2881 		mxge_media_types = mxge_xfp_media_types;
2882 		mxge_media_type_entries =
2883 			nitems(mxge_xfp_media_types);
2884 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2885 		cage_type = "XFP";
2886 	} else 	if (sc->connector == MXGE_SFP) {
2887 		/* -S or -2S is SFP+ */
2888 		mxge_media_types = mxge_sfp_media_types;
2889 		mxge_media_type_entries =
2890 			nitems(mxge_sfp_media_types);
2891 		cage_type = "SFP+";
2892 		byte = 3;
2893 	} else {
2894 		/* nothing to do; media type cannot change */
2895 		return;
2896 	}
2897 
2898 	/*
2899 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2900 	 * now we try to determine what is in the cage by using the
2901 	 * firmware's I2C commands to read the 10GbE compliance
2902 	 * register.  We read just one byte, which may take over
2903 	 * a millisecond.
2904 	 */
2905 
2906 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2907 	cmd.data1 = byte;
2908 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2909 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2910 		device_printf(sc->dev, "failed to read XFP\n");
2911 	}
2912 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2913 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2914 	}
2915 	if (err != MXGEFW_CMD_OK) {
2916 		return;
2917 	}
2918 
2919 	/* now we wait for the data to be cached */
2920 	cmd.data0 = byte;
2921 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2922 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2923 		DELAY(1000);
2924 		cmd.data0 = byte;
2925 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2926 	}
2927 	if (err != MXGEFW_CMD_OK) {
2928 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2929 			      cage_type, err, ms);
2930 		return;
2931 	}
2932 
2933 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2934 		if (mxge_verbose)
2935 			device_printf(sc->dev, "%s:%s\n", cage_type,
2936 				      mxge_media_types[0].name);
2937 		if (sc->current_media != mxge_media_types[0].flag) {
2938 			mxge_media_init(sc);
2939 			mxge_media_set(sc, mxge_media_types[0].flag);
2940 		}
2941 		return;
2942 	}
2943 	for (i = 1; i < mxge_media_type_entries; i++) {
2944 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2945 			if (mxge_verbose)
2946 				device_printf(sc->dev, "%s:%s\n",
2947 					      cage_type,
2948 					      mxge_media_types[i].name);
2949 
2950 			if (sc->current_media != mxge_media_types[i].flag) {
2951 				mxge_media_init(sc);
2952 				mxge_media_set(sc, mxge_media_types[i].flag);
2953 			}
2954 			return;
2955 		}
2956 	}
2957 	if (mxge_verbose)
2958 		device_printf(sc->dev, "%s media 0x%x unknown\n",
2959 			      cage_type, cmd.data0);
2960 
2961 	return;
2962 }
2963 
2964 static void
2965 mxge_intr(void *arg)
2966 {
2967 	struct mxge_slice_state *ss = arg;
2968 	mxge_softc_t *sc = ss->sc;
2969 	mcp_irq_data_t *stats = ss->fw_stats;
2970 	mxge_tx_ring_t *tx = &ss->tx;
2971 	mxge_rx_done_t *rx_done = &ss->rx_done;
2972 	uint32_t send_done_count;
2973 	uint8_t valid;
2974 
2975 	/* make sure the DMA has finished */
2976 	if (!stats->valid) {
2977 		return;
2978 	}
2979 	valid = stats->valid;
2980 
2981 	if (sc->legacy_irq) {
2982 		/* lower legacy IRQ  */
2983 		*sc->irq_deassert = 0;
2984 		if (!mxge_deassert_wait)
2985 			/* don't wait for conf. that irq is low */
2986 			stats->valid = 0;
2987 	} else {
2988 		stats->valid = 0;
2989 	}
2990 
2991 	/* loop while waiting for legacy irq deassertion */
2992 	do {
2993 		/* check for transmit completes and receives */
2994 		send_done_count = be32toh(stats->send_done_count);
2995 		while ((send_done_count != tx->pkt_done) ||
2996 		       (rx_done->entry[rx_done->idx].length != 0)) {
2997 			if (send_done_count != tx->pkt_done)
2998 				mxge_tx_done(ss, (int)send_done_count);
2999 			mxge_clean_rx_done(ss);
3000 			send_done_count = be32toh(stats->send_done_count);
3001 		}
3002 		if (sc->legacy_irq && mxge_deassert_wait)
3003 			wmb();
3004 	} while (*((volatile uint8_t *) &stats->valid));
3005 
3006 	/* fw link & error stats meaningful only on the first slice */
3007 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3008 		if (sc->link_state != stats->link_up) {
3009 			sc->link_state = stats->link_up;
3010 			if (sc->link_state) {
3011 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3012 				if (mxge_verbose)
3013 					device_printf(sc->dev, "link up\n");
3014 			} else {
3015 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3016 				if (mxge_verbose)
3017 					device_printf(sc->dev, "link down\n");
3018 			}
3019 			sc->need_media_probe = 1;
3020 		}
3021 		if (sc->rdma_tags_available !=
3022 		    be32toh(stats->rdma_tags_available)) {
3023 			sc->rdma_tags_available =
3024 				be32toh(stats->rdma_tags_available);
3025 			device_printf(sc->dev, "RDMA timed out! %d tags "
3026 				      "left\n", sc->rdma_tags_available);
3027 		}
3028 
3029 		if (stats->link_down) {
3030 			sc->down_cnt += stats->link_down;
3031 			sc->link_state = 0;
3032 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3033 		}
3034 	}
3035 
3036 	/* check to see if we have rx token to pass back */
3037 	if (valid & 0x1)
3038 		*ss->irq_claim = be32toh(3);
3039 	*(ss->irq_claim + 1) = be32toh(3);
3040 }
3041 
3042 static void
3043 mxge_init(void *arg)
3044 {
3045 	mxge_softc_t *sc = arg;
3046 	if_t ifp = sc->ifp;
3047 
3048 	mtx_lock(&sc->driver_mtx);
3049 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
3050 		(void) mxge_open(sc);
3051 	mtx_unlock(&sc->driver_mtx);
3052 }
3053 
3054 static void
3055 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3056 {
3057 	int i;
3058 
3059 #if defined(INET) || defined(INET6)
3060 	tcp_lro_free(&ss->lc);
3061 #endif
3062 	for (i = 0; i <= ss->rx_big.mask; i++) {
3063 		if (ss->rx_big.info[i].m == NULL)
3064 			continue;
3065 		bus_dmamap_unload(ss->rx_big.dmat,
3066 				  ss->rx_big.info[i].map);
3067 		m_freem(ss->rx_big.info[i].m);
3068 		ss->rx_big.info[i].m = NULL;
3069 	}
3070 
3071 	for (i = 0; i <= ss->rx_small.mask; i++) {
3072 		if (ss->rx_small.info[i].m == NULL)
3073 			continue;
3074 		bus_dmamap_unload(ss->rx_small.dmat,
3075 				  ss->rx_small.info[i].map);
3076 		m_freem(ss->rx_small.info[i].m);
3077 		ss->rx_small.info[i].m = NULL;
3078 	}
3079 
3080 	/* transmit ring used only on the first slice */
3081 	if (ss->tx.info == NULL)
3082 		return;
3083 
3084 	for (i = 0; i <= ss->tx.mask; i++) {
3085 		ss->tx.info[i].flag = 0;
3086 		if (ss->tx.info[i].m == NULL)
3087 			continue;
3088 		bus_dmamap_unload(ss->tx.dmat,
3089 				  ss->tx.info[i].map);
3090 		m_freem(ss->tx.info[i].m);
3091 		ss->tx.info[i].m = NULL;
3092 	}
3093 }
3094 
3095 static void
3096 mxge_free_mbufs(mxge_softc_t *sc)
3097 {
3098 	int slice;
3099 
3100 	for (slice = 0; slice < sc->num_slices; slice++)
3101 		mxge_free_slice_mbufs(&sc->ss[slice]);
3102 }
3103 
3104 static void
3105 mxge_free_slice_rings(struct mxge_slice_state *ss)
3106 {
3107 	int i;
3108 
3109 	if (ss->rx_done.entry != NULL)
3110 		mxge_dma_free(&ss->rx_done.dma);
3111 	ss->rx_done.entry = NULL;
3112 
3113 	if (ss->tx.req_bytes != NULL)
3114 		free(ss->tx.req_bytes, M_DEVBUF);
3115 	ss->tx.req_bytes = NULL;
3116 
3117 	if (ss->tx.seg_list != NULL)
3118 		free(ss->tx.seg_list, M_DEVBUF);
3119 	ss->tx.seg_list = NULL;
3120 
3121 	if (ss->rx_small.shadow != NULL)
3122 		free(ss->rx_small.shadow, M_DEVBUF);
3123 	ss->rx_small.shadow = NULL;
3124 
3125 	if (ss->rx_big.shadow != NULL)
3126 		free(ss->rx_big.shadow, M_DEVBUF);
3127 	ss->rx_big.shadow = NULL;
3128 
3129 	if (ss->tx.info != NULL) {
3130 		if (ss->tx.dmat != NULL) {
3131 			for (i = 0; i <= ss->tx.mask; i++) {
3132 				bus_dmamap_destroy(ss->tx.dmat,
3133 						   ss->tx.info[i].map);
3134 			}
3135 			bus_dma_tag_destroy(ss->tx.dmat);
3136 		}
3137 		free(ss->tx.info, M_DEVBUF);
3138 	}
3139 	ss->tx.info = NULL;
3140 
3141 	if (ss->rx_small.info != NULL) {
3142 		if (ss->rx_small.dmat != NULL) {
3143 			for (i = 0; i <= ss->rx_small.mask; i++) {
3144 				bus_dmamap_destroy(ss->rx_small.dmat,
3145 						   ss->rx_small.info[i].map);
3146 			}
3147 			bus_dmamap_destroy(ss->rx_small.dmat,
3148 					   ss->rx_small.extra_map);
3149 			bus_dma_tag_destroy(ss->rx_small.dmat);
3150 		}
3151 		free(ss->rx_small.info, M_DEVBUF);
3152 	}
3153 	ss->rx_small.info = NULL;
3154 
3155 	if (ss->rx_big.info != NULL) {
3156 		if (ss->rx_big.dmat != NULL) {
3157 			for (i = 0; i <= ss->rx_big.mask; i++) {
3158 				bus_dmamap_destroy(ss->rx_big.dmat,
3159 						   ss->rx_big.info[i].map);
3160 			}
3161 			bus_dmamap_destroy(ss->rx_big.dmat,
3162 					   ss->rx_big.extra_map);
3163 			bus_dma_tag_destroy(ss->rx_big.dmat);
3164 		}
3165 		free(ss->rx_big.info, M_DEVBUF);
3166 	}
3167 	ss->rx_big.info = NULL;
3168 }
3169 
3170 static void
3171 mxge_free_rings(mxge_softc_t *sc)
3172 {
3173 	int slice;
3174 
3175 	for (slice = 0; slice < sc->num_slices; slice++)
3176 		mxge_free_slice_rings(&sc->ss[slice]);
3177 }
3178 
3179 static int
3180 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3181 		       int tx_ring_entries)
3182 {
3183 	mxge_softc_t *sc = ss->sc;
3184 	size_t bytes;
3185 	int err, i;
3186 
3187 	/* allocate per-slice receive resources */
3188 
3189 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3190 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
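	/* twice the rx ring entries, presumably so the completion ring
	   can absorb completions from both the small and big rings */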
3191 
3192 	/* allocate the rx shadow rings */
3193 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3194 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3195 
3196 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3197 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3198 
3199 	/* allocate the rx host info rings */
3200 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3201 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3202 
3203 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3204 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3205 
3206 	/* allocate the rx busdma resources */
3207 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3208 				 1,			/* alignment */
3209 				 4096,			/* boundary */
3210 				 BUS_SPACE_MAXADDR,	/* low */
3211 				 BUS_SPACE_MAXADDR,	/* high */
3212 				 NULL, NULL,		/* filter */
3213 				 MHLEN,			/* maxsize */
3214 				 1,			/* num segs */
3215 				 MHLEN,			/* maxsegsize */
3216 				 BUS_DMA_ALLOCNOW,	/* flags */
3217 				 NULL, NULL,		/* lock */
3218 				 &ss->rx_small.dmat);	/* tag */
3219 	if (err != 0) {
3220 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3221 			      err);
3222 		return err;
3223 	}
3224 
3225 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3226 				 1,			/* alignment */
3227 				 0,			/* boundary */
3228 				 BUS_SPACE_MAXADDR,	/* low */
3229 				 BUS_SPACE_MAXADDR,	/* high */
3230 				 NULL, NULL,		/* filter */
3231 				 3*4096,		/* maxsize */
3232 				 1,			/* num segs */
3233 				 MJUM9BYTES,		/* maxsegsize*/
3234 				 BUS_DMA_ALLOCNOW,	/* flags */
3235 				 NULL, NULL,		/* lock */
3236 				 &ss->rx_big.dmat);	/* tag */
3237 	if (err != 0) {
3238 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3239 			      err);
3240 		return err;
3241 	}
3242 	for (i = 0; i <= ss->rx_small.mask; i++) {
3243 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3244 					&ss->rx_small.info[i].map);
3245 		if (err != 0) {
3246 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3247 				      err);
3248 			return err;
3249 		}
3250 	}
3251 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3252 				&ss->rx_small.extra_map);
3253 	if (err != 0) {
3254 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3255 			      err);
3256 		return err;
3257 	}
3258 
3259 	for (i = 0; i <= ss->rx_big.mask; i++) {
3260 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3261 					&ss->rx_big.info[i].map);
3262 		if (err != 0) {
3263 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3264 				      err);
3265 			return err;
3266 		}
3267 	}
3268 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3269 				&ss->rx_big.extra_map);
3270 	if (err != 0) {
3271 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3272 			      err);
3273 		return err;
3274 	}
3275 
3276 	/* now allocate TX resources */
3277 
3278 	ss->tx.mask = tx_ring_entries - 1;
3279 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3280 
3281 	/* allocate the tx request copy block */
3282 	bytes = 8 +
3283 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3284 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3285 	/* ensure req_list entries are aligned to 8 bytes */
3286 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3287 		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
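	/* (p + 7) & ~7 rounds the pointer up to the next 8-byte
	   boundary; the 8 spare bytes allocated above keep the rounded
	   pointer inside the buffer */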
3288 
3289 	/* allocate the tx busdma segment list */
3290 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3291 	ss->tx.seg_list = (bus_dma_segment_t *)
3292 		malloc(bytes, M_DEVBUF, M_WAITOK);
3293 
3294 	/* allocate the tx host info ring */
3295 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3296 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3297 
3298 	/* allocate the tx busdma resources */
3299 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3300 				 1,			/* alignment */
3301 				 sc->tx_boundary,	/* boundary */
3302 				 BUS_SPACE_MAXADDR,	/* low */
3303 				 BUS_SPACE_MAXADDR,	/* high */
3304 				 NULL, NULL,		/* filter */
3305 				 65536 + 256,		/* maxsize */
3306 				 ss->tx.max_desc - 2,	/* num segs */
3307 				 sc->tx_boundary,	/* maxsegsz */
3308 				 BUS_DMA_ALLOCNOW,	/* flags */
3309 				 NULL, NULL,		/* lock */
3310 				 &ss->tx.dmat);		/* tag */
3311 
3312 	if (err != 0) {
3313 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3314 			      err);
3315 		return err;
3316 	}
3317 
3318 	/* now use these tags to setup dmamaps for each slot
3319 	   in the ring */
3320 	for (i = 0; i <= ss->tx.mask; i++) {
3321 		err = bus_dmamap_create(ss->tx.dmat, 0,
3322 					&ss->tx.info[i].map);
3323 		if (err != 0) {
3324 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3325 				      err);
3326 			return err;
3327 		}
3328 	}
3329 	return 0;
3330 
3331 }
3332 
3333 static int
3334 mxge_alloc_rings(mxge_softc_t *sc)
3335 {
3336 	mxge_cmd_t cmd;
3337 	int tx_ring_size;
3338 	int tx_ring_entries, rx_ring_entries;
3339 	int err, slice;
3340 
3341 	/* get ring sizes */
3342 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3343 	tx_ring_size = cmd.data0;
3344 	if (err != 0) {
3345 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3346 		goto abort;
3347 	}
3348 
3349 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3350 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3351 	if_setsendqlen(sc->ifp, tx_ring_entries - 1);
3352 	if_setsendqready(sc->ifp);
3353 
3354 	for (slice = 0; slice < sc->num_slices; slice++) {
3355 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3356 					     rx_ring_entries,
3357 					     tx_ring_entries);
3358 		if (err != 0)
3359 			goto abort;
3360 	}
3361 	return 0;
3362 
3363 abort:
3364 	mxge_free_rings(sc);
3365 	return err;
3366 
3367 }
3368 
3369 static void
3370 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3371 {
3372 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
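	/*
	 * e.g. with the default 1500-byte MTU this is
	 * 1500 + 14 + 4 + 2 = 1520 bytes, which fits in a single
	 * standard 2KB (MCLBYTES) cluster.
	 */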
3373 
3374 	if (bufsize < MCLBYTES) {
3375 		/* easy, everything fits in a single buffer */
3376 		*big_buf_size = MCLBYTES;
3377 		*cl_size = MCLBYTES;
3378 		*nbufs = 1;
3379 		return;
3380 	}
3381 
3382 	if (bufsize < MJUMPAGESIZE) {
3383 		/* still easy, everything still fits in a single buffer */
3384 		*big_buf_size = MJUMPAGESIZE;
3385 		*cl_size = MJUMPAGESIZE;
3386 		*nbufs = 1;
3387 		return;
3388 	}
3389 	*cl_size = MJUM9BYTES;
3390 	*big_buf_size = MJUM9BYTES;
3391 	*nbufs = 1;
3392 }
3393 
3394 static int
3395 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3396 {
3397 	mxge_softc_t *sc;
3398 	mxge_cmd_t cmd;
3399 	bus_dmamap_t map;
3400 	int err, i, slice;
3401 
3402 	sc = ss->sc;
3403 	slice = ss - sc->ss;
3404 
3405 #if defined(INET) || defined(INET6)
3406 	(void)tcp_lro_init(&ss->lc);
3407 #endif
3408 	ss->lc.ifp = sc->ifp;
3409 
3410 	/* get the lanai pointers to the send and receive rings */
3411 
3412 	err = 0;
3413 
3414 	cmd.data0 = slice;
3415 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3416 	ss->tx.lanai =
3417 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3418 	ss->tx.send_go = (volatile uint32_t *)
3419 		(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3420 	ss->tx.send_stop = (volatile uint32_t *)
3421 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3422 
3423 	cmd.data0 = slice;
3424 	err |= mxge_send_cmd(sc,
3425 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3426 	ss->rx_small.lanai =
3427 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3428 	cmd.data0 = slice;
3429 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3430 	ss->rx_big.lanai =
3431 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3432 
3433 	if (err != 0) {
3434 		device_printf(sc->dev,
3435 			      "failed to get ring sizes or locations\n");
3436 		return EIO;
3437 	}
3438 
3439 	/* stock receive rings */
3440 	for (i = 0; i <= ss->rx_small.mask; i++) {
3441 		map = ss->rx_small.info[i].map;
3442 		err = mxge_get_buf_small(ss, map, i);
3443 		if (err) {
3444 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3445 				      i, ss->rx_small.mask + 1);
3446 			return ENOMEM;
3447 		}
3448 	}
3449 	for (i = 0; i <= ss->rx_big.mask; i++) {
3450 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3451 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3452 	}
3453 	ss->rx_big.nbufs = nbufs;
3454 	ss->rx_big.cl_size = cl_size;
3455 	ss->rx_big.mlen = if_getmtu(ss->sc->ifp) + ETHER_HDR_LEN +
3456 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3457 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3458 		map = ss->rx_big.info[i].map;
3459 		err = mxge_get_buf_big(ss, map, i);
3460 		if (err) {
3461 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3462 				      i, ss->rx_big.mask + 1);
3463 			return ENOMEM;
3464 		}
3465 	}
3466 	return 0;
3467 }
3468 
3469 static int
3470 mxge_open(mxge_softc_t *sc)
3471 {
3472 	mxge_cmd_t cmd;
3473 	int err, big_bytes, nbufs, slice, cl_size, i;
3474 	bus_addr_t bus;
3475 	volatile uint8_t *itable;
3476 	struct mxge_slice_state *ss;
3477 
3478 	/* Copy the MAC address in case it was overridden */
3479 	bcopy(if_getlladdr(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3480 
3481 	err = mxge_reset(sc, 1);
3482 	if (err != 0) {
3483 		device_printf(sc->dev, "failed to reset\n");
3484 		return EIO;
3485 	}
3486 
3487 	if (sc->num_slices > 1) {
3488 		/* setup the indirection table */
3489 		cmd.data0 = sc->num_slices;
3490 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3491 				    &cmd);
3492 
3493 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3494 				     &cmd);
3495 		if (err != 0) {
3496 			device_printf(sc->dev,
3497 				      "failed to setup rss tables\n");
3498 			return err;
3499 		}
3500 
3501 		/* just enable an identity mapping */
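		/* e.g. with 4 slices the table reads {0, 1, 2, 3}, so
		   the RSS hash selects a slice directly */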
3502 		itable = sc->sram + cmd.data0;
3503 		for (i = 0; i < sc->num_slices; i++)
3504 			itable[i] = (uint8_t)i;
3505 
3506 		cmd.data0 = 1;
3507 		cmd.data1 = mxge_rss_hash_type;
3508 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3509 		if (err != 0) {
3510 			device_printf(sc->dev, "failed to enable slices\n");
3511 			return err;
3512 		}
3513 	}
3514 
3515 	mxge_choose_params(if_getmtu(sc->ifp), &big_bytes, &cl_size, &nbufs);
3516 
3517 	cmd.data0 = nbufs;
3518 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3519 			    &cmd);
3520 	/* error is only meaningful if we're trying to set
3521 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3522 	if (err && nbufs > 1) {
3523 		device_printf(sc->dev,
3524 			      "Failed to set alway-use-n to %d\n",
3525 			      nbufs);
3526 		return EIO;
3527 	}
3528 	/* Give the firmware the mtu and the big and small buffer
3529 	   sizes.  The firmware wants the big buf size to be a power
3530 	   of two. Luckily, FreeBSD's clusters are powers of two */
3531 	cmd.data0 = if_getmtu(sc->ifp) + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3532 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3533 	cmd.data0 = MHLEN - MXGEFW_PAD;
3534 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3535 			     &cmd);
3536 	cmd.data0 = big_bytes;
3537 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3538 
3539 	if (err != 0) {
3540 		device_printf(sc->dev, "failed to setup params\n");
3541 		goto abort;
3542 	}
3543 
3544 	/* Now give the firmware the pointer to the stats block */
3545 	for (slice = 0; slice < sc->num_slices; slice++) {
3546 		ss = &sc->ss[slice];
3547 		cmd.data0 =
3548 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3549 		cmd.data1 =
3550 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3551 		cmd.data2 = sizeof(struct mcp_irq_data);
3552 		cmd.data2 |= (slice << 16);
3553 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3554 	}
3555 
3556 	if (err != 0) {
3557 		bus = sc->ss->fw_stats_dma.bus_addr;
3558 		bus += offsetof(struct mcp_irq_data, send_done_count);
3559 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3560 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3561 		err = mxge_send_cmd(sc,
3562 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3563 				    &cmd);
3564 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3565 		sc->fw_multicast_support = 0;
3566 	} else {
3567 		sc->fw_multicast_support = 1;
3568 	}
3569 
3570 	if (err != 0) {
3571 		device_printf(sc->dev, "failed to setup params\n");
3572 		goto abort;
3573 	}
3574 
3575 	for (slice = 0; slice < sc->num_slices; slice++) {
3576 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3577 		if (err != 0) {
3578 			device_printf(sc->dev, "couldn't open slice %d\n",
3579 				      slice);
3580 			goto abort;
3581 		}
3582 	}
3583 
3584 	/* Finally, start the firmware running */
3585 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3586 	if (err) {
3587 		device_printf(sc->dev, "Couldn't bring up link\n");
3588 		goto abort;
3589 	}
3590 	for (slice = 0; slice < sc->num_slices; slice++) {
3591 		ss = &sc->ss[slice];
3592 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3593 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3594 	}
3595 	if_setdrvflagbits(sc->ifp, IFF_DRV_RUNNING, 0);
3596 	if_setdrvflagbits(sc->ifp, 0, IFF_DRV_OACTIVE);
3597 
3598 	return 0;
3599 
3600 abort:
3601 	mxge_free_mbufs(sc);
3602 
3603 	return err;
3604 }
3605 
3606 static int
3607 mxge_close(mxge_softc_t *sc, int down)
3608 {
3609 	mxge_cmd_t cmd;
3610 	int err, old_down_cnt;
3611 	struct mxge_slice_state *ss;
3612 	int slice;
3613 
3614 	for (slice = 0; slice < sc->num_slices; slice++) {
3615 		ss = &sc->ss[slice];
3616 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3617 	}
3618 	if_setdrvflagbits(sc->ifp, 0, IFF_DRV_RUNNING);
3619 	if (!down) {
3620 		old_down_cnt = sc->down_cnt;
3621 		wmb();
3622 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3623 		if (err) {
3624 			device_printf(sc->dev,
3625 				      "Couldn't bring down link\n");
3626 		}
3627 		if (old_down_cnt == sc->down_cnt) {
3628 			/* wait for down irq */
3629 			DELAY(10 * sc->intr_coal_delay);
3630 		}
3631 		wmb();
3632 		if (old_down_cnt == sc->down_cnt) {
3633 			device_printf(sc->dev, "never got down irq\n");
3634 		}
3635 	}
3636 	mxge_free_mbufs(sc);
3637 
3638 	return 0;
3639 }
3640 
3641 static void
3642 mxge_setup_cfg_space(mxge_softc_t *sc)
3643 {
3644 	device_t dev = sc->dev;
3645 	int reg;
3646 	uint16_t lnk, pectl;
3647 
3648 	/* find the PCIe link width and set max read request to 4KB */
3649 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
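		/*
		 * In the PCIe capability block, offset 0x12 is the Link
		 * Status register (negotiated width in bits 9:4) and
		 * offset 0x8 is Device Control, whose bits 14:12 encode
		 * the max read request size (5 == 4096 bytes).
		 */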
3650 		lnk = pci_read_config(dev, reg + 0x12, 2);
3651 		sc->link_width = (lnk >> 4) & 0x3f;
3652 
3653 		if (sc->pectl == 0) {
3654 			pectl = pci_read_config(dev, reg + 0x8, 2);
3655 			pectl = (pectl & ~0x7000) | (5 << 12);
3656 			pci_write_config(dev, reg + 0x8, pectl, 2);
3657 			sc->pectl = pectl;
3658 		} else {
3659 			/* restore saved pectl after watchdog reset */
3660 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3661 		}
3662 	}
3663 
3664 	/* Enable DMA and Memory space access */
3665 	pci_enable_busmaster(dev);
3666 }
3667 
3668 static uint32_t
3669 mxge_read_reboot(mxge_softc_t *sc)
3670 {
3671 	device_t dev = sc->dev;
3672 	uint32_t vs;
3673 
3674 	/* find the vendor specific offset */
3675 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3676 		device_printf(sc->dev,
3677 			      "could not find vendor specific offset\n");
3678 		return (uint32_t)-1;
3679 	}
3680 	/* enable read32 mode */
3681 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3682 	/* tell NIC which register to read */
3683 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3684 	return (pci_read_config(dev, vs + 0x14, 4));
3685 }
3686 
3687 static void
3688 mxge_watchdog_reset(mxge_softc_t *sc)
3689 {
3690 	struct pci_devinfo *dinfo;
3691 	struct mxge_slice_state *ss;
3692 	int err, running, s, num_tx_slices = 1;
3693 	uint32_t reboot;
3694 	uint16_t cmd;
3695 
3696 	err = ENXIO;
3697 
3698 	device_printf(sc->dev, "Watchdog reset!\n");
3699 
3700 	/*
3701 	 * check to see if the NIC rebooted.  If it did, then all of
3702 	 * PCI config space has been reset, and things like the
3703 	 * busmaster bit will be zero.  If this is the case, then we
3704 	 * must restore PCI config space before the NIC can be used
3705 	 * again
3706 	 */
3707 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3708 	if (cmd == 0xffff) {
3709 		/*
3710 		 * maybe the watchdog caught the NIC rebooting; wait
3711 		 * up to 100ms for it to finish.  If it does not come
3712 		 * back, then give up
3713 		 */
3714 		DELAY(1000*100);
3715 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3716 		if (cmd == 0xffff) {
3717 			device_printf(sc->dev, "NIC disappeared!\n");
3718 		}
3719 	}
3720 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3721 		/* print the reboot status */
3722 		reboot = mxge_read_reboot(sc);
3723 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3724 			      reboot);
3725 		running = if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING;
3726 		if (running) {
3727 			/*
3728 			 * quiesce NIC so that TX routines will not try to
3729 			 * xmit after restoration of BAR
3730 			 */
3731 
3732 			/* Mark the link as down */
3733 			if (sc->link_state) {
3734 				sc->link_state = 0;
3735 				if_link_state_change(sc->ifp,
3736 						     LINK_STATE_DOWN);
3737 			}
3738 
3739 			num_tx_slices = sc->num_slices;
3740 
3741 			/* grab all TX locks to ensure no transmits occur */
3742 			for (s = 0; s < num_tx_slices; s++) {
3743 				ss = &sc->ss[s];
3744 				mtx_lock(&ss->tx.mtx);
3745 			}
3746 			mxge_close(sc, 1);
3747 		}
3748 		/* restore PCI configuration space */
3749 		dinfo = device_get_ivars(sc->dev);
3750 		pci_cfg_restore(sc->dev, dinfo);
3751 
3752 		/* and redo any changes we made to our config space */
3753 		mxge_setup_cfg_space(sc);
3754 
3755 		/* reload f/w */
3756 		err = mxge_load_firmware(sc, 0);
3757 		if (err) {
3758 			device_printf(sc->dev,
3759 				      "Unable to re-load f/w\n");
3760 		}
3761 		if (running) {
3762 			if (!err)
3763 				err = mxge_open(sc);
3764 			/* release all TX locks */
3765 			for (s = 0; s < num_tx_slices; s++) {
3766 				ss = &sc->ss[s];
3767 				mxge_start_locked(ss);
3768 				mtx_unlock(&ss->tx.mtx);
3769 			}
3770 		}
3771 		sc->watchdog_resets++;
3772 	} else {
3773 		device_printf(sc->dev,
3774 			      "NIC did not reboot, not resetting\n");
3775 		err = 0;
3776 	}
3777 	if (err) {
3778 		device_printf(sc->dev, "watchdog reset failed\n");
3779 	} else {
3780 		if (sc->dying == 2)
3781 			sc->dying = 0;
3782 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3783 	}
3784 }
3785 
3786 static void
3787 mxge_watchdog_task(void *arg, int pending)
3788 {
3789 	mxge_softc_t *sc = arg;
3790 
3791 	mtx_lock(&sc->driver_mtx);
3792 	mxge_watchdog_reset(sc);
3793 	mtx_unlock(&sc->driver_mtx);
3794 }
3795 
3796 static void
3797 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3798 {
3799 	tx = &sc->ss[slice].tx;
3800 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3801 	device_printf(sc->dev,
3802 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3803 		      tx->req, tx->done, tx->queue_active);
3804 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3805 		      tx->activate, tx->deactivate);
3806 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3807 		      tx->pkt_done,
3808 		      be32toh(sc->ss->fw_stats->send_done_count));
3809 }
3810 
3811 static int
3812 mxge_watchdog(mxge_softc_t *sc)
3813 {
3814 	mxge_tx_ring_t *tx;
3815 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3816 	int i, err = 0;
3817 
3818 	/* see if we have outstanding transmits, which
3819 	   have been pending for more than mxge_ticks */
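	/*
	 * a slice is suspected stuck when it had pending work at the
	 * last tick (watchdog_req != watchdog_done), still has work
	 * queued (req != done), and the completion index has not moved
	 * since (done == watchdog_done); if the firmware's pause-frame
	 * counter also failed to advance, the stall cannot be blamed
	 * on flow control, so a reset is scheduled
	 */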
3820 	for (i = 0; (i < sc->num_slices) && (err == 0); i++) {
3821 		tx = &sc->ss[i].tx;
3822 		if (tx->req != tx->done &&
3823 		    tx->watchdog_req != tx->watchdog_done &&
3824 		    tx->done == tx->watchdog_done) {
3825 			/* check for pause blocking before resetting */
3826 			if (tx->watchdog_rx_pause == rx_pause) {
3827 				mxge_warn_stuck(sc, tx, i);
3828 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3829 				return (ENXIO);
3830 			}
3831 			else
3832 				device_printf(sc->dev, "Flow control blocking "
3833 					      "xmits, check link partner\n");
3834 		}
3835 
3836 		tx->watchdog_req = tx->req;
3837 		tx->watchdog_done = tx->done;
3838 		tx->watchdog_rx_pause = rx_pause;
3839 	}
3840 
3841 	if (sc->need_media_probe)
3842 		mxge_media_probe(sc);
3843 	return (err);
3844 }
3845 
3846 static uint64_t
3847 mxge_get_counter(if_t ifp, ift_counter cnt)
3848 {
3849 	struct mxge_softc *sc;
3850 	uint64_t rv;
3851 
3852 	sc = if_getsoftc(ifp);
3853 	rv = 0;
3854 
3855 	switch (cnt) {
3856 	case IFCOUNTER_IPACKETS:
3857 		for (int s = 0; s < sc->num_slices; s++)
3858 			rv += sc->ss[s].ipackets;
3859 		return (rv);
3860 	case IFCOUNTER_OPACKETS:
3861 		for (int s = 0; s < sc->num_slices; s++)
3862 			rv += sc->ss[s].opackets;
3863 		return (rv);
3864 	case IFCOUNTER_OERRORS:
3865 		for (int s = 0; s < sc->num_slices; s++)
3866 			rv += sc->ss[s].oerrors;
3867 		return (rv);
3868 	case IFCOUNTER_OBYTES:
3869 		for (int s = 0; s < sc->num_slices; s++)
3870 			rv += sc->ss[s].obytes;
3871 		return (rv);
3872 	case IFCOUNTER_OMCASTS:
3873 		for (int s = 0; s < sc->num_slices; s++)
3874 			rv += sc->ss[s].omcasts;
3875 		return (rv);
3876 	case IFCOUNTER_OQDROPS:
3877 		for (int s = 0; s < sc->num_slices; s++)
3878 			rv += sc->ss[s].tx.br->br_drops;
3879 		return (rv);
3880 	default:
3881 		return (if_get_counter_default(ifp, cnt));
3882 	}
3883 }
3884 
3885 static void
3886 mxge_tick(void *arg)
3887 {
3888 	mxge_softc_t *sc = arg;
3889 	u_long pkts = 0;
3890 	int err = 0;
3891 	int running, ticks;
3892 	uint16_t cmd;
3893 
3894 	ticks = mxge_ticks;
3895 	running = if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING;
3896 	if (running) {
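		/* run the transmit watchdog only on every 4th tick */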
3897 		if (!sc->watchdog_countdown) {
3898 			err = mxge_watchdog(sc);
3899 			sc->watchdog_countdown = 4;
3900 		}
3901 		sc->watchdog_countdown--;
3902 	}
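	/* XXX pkts is never updated here, so the idle check below always runs */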
3903 	if (pkts == 0) {
3904 		/* ensure NIC did not suffer h/w fault while idle */
3905 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3906 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3907 			sc->dying = 2;
3908 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3909 			err = ENXIO;
3910 		}
3911 		/* look less often if NIC is idle */
3912 		ticks *= 4;
3913 	}
3914 
3915 	if (err == 0)
3916 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3917 
3918 }
3919 
3920 static int
3921 mxge_media_change(if_t ifp)
3922 {
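	/* media is fixed by the installed transceiver; manual changes are not supported */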
3923 	return EINVAL;
3924 }
3925 
3926 static int
3927 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3928 {
3929 	if_t ifp = sc->ifp;
3930 	int real_mtu, old_mtu;
3931 	int err = 0;
3932 
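	/* the firmware's MTU limit covers the Ethernet header plus a VLAN tag */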
3933 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3934 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3935 		return EINVAL;
3936 	mtx_lock(&sc->driver_mtx);
3937 	old_mtu = if_getmtu(ifp);
3938 	if_setmtu(ifp, mtu);
3939 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3940 		mxge_close(sc, 0);
3941 		err = mxge_open(sc);
3942 		if (err != 0) {
3943 			if_setmtu(ifp, old_mtu);
3944 			mxge_close(sc, 0);
3945 			(void) mxge_open(sc);
3946 		}
3947 	}
3948 	mtx_unlock(&sc->driver_mtx);
3949 	return err;
3950 }
3951 
3952 static void
3953 mxge_media_status(if_t ifp, struct ifmediareq *ifmr)
3954 {
3955 	mxge_softc_t *sc = if_getsoftc(ifp);
3956 
3957 	if (sc == NULL)
3958 		return;
3959 	ifmr->ifm_status = IFM_AVALID;
3960 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3961 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3962 	ifmr->ifm_active |= sc->current_media;
3963 }
3964 
3965 static int
3966 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
3967 {
3968 	mxge_cmd_t cmd;
3969 	uint32_t i2c_args;
3970 	int i, ms, err;
3971 
3972 	if (i2c->dev_addr != 0xA0 &&
3973 	    i2c->dev_addr != 0xA2)
3974 		return (EINVAL);
3975 	if (i2c->len > sizeof(i2c->data))
3976 		return (EINVAL);
3977 
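	/*
	 * each byte takes two steps: MXGEFW_CMD_I2C_READ asks the
	 * firmware to cache one byte from the transceiver, then
	 * MXGEFW_CMD_I2C_BYTE polls for the cached byte, retrying on
	 * EBUSY for up to ~50ms before giving up
	 */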
3978 	for (i = 0; i < i2c->len; i++) {
3979 		i2c_args = i2c->dev_addr << 0x8;
3980 		i2c_args |= i2c->offset + i;
3981 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3982 		cmd.data1 = i2c_args;
3983 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3984 
3985 		if (err != MXGEFW_CMD_OK)
3986 			return (EIO);
3987 		/* now we wait for the data to be cached */
3988 		cmd.data0 = i2c_args & 0xff;
3989 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3990 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3991 			cmd.data0 = i2c_args & 0xff;
3992 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3993 			if (err == EBUSY)
3994 				DELAY(1000);
3995 		}
3996 		if (err != MXGEFW_CMD_OK)
3997 			return (EIO);
3998 		i2c->data[i] = cmd.data0;
3999 	}
4000 	return (0);
4001 }
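
/*
 * userland reaches mxge_fetch_i2c() through the SIOCGI2C case in
 * mxge_ioctl() below; e.g. "ifconfig -v mxge0" should dump the
 * transceiver EEPROM pages at addresses 0xA0/0xA2
 */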
4002 
4003 static int
4004 mxge_ioctl(if_t ifp, u_long command, caddr_t data)
4005 {
4006 	mxge_softc_t *sc = if_getsoftc(ifp);
4007 	struct ifreq *ifr = (struct ifreq *)data;
4008 	struct ifi2creq i2c;
4009 	int err, mask;
4010 
4011 	err = 0;
4012 	switch (command) {
4013 	case SIOCSIFMTU:
4014 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4015 		break;
4016 
4017 	case SIOCSIFFLAGS:
4018 		mtx_lock(&sc->driver_mtx);
4019 		if (sc->dying) {
4020 			mtx_unlock(&sc->driver_mtx);
4021 			return EINVAL;
4022 		}
4023 		if (if_getflags(ifp) & IFF_UP) {
4024 			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) {
4025 				err = mxge_open(sc);
4026 			} else {
4027 				/* take care of promisc and allmulti
4028 				   flag changes */
4029 				mxge_change_promisc(sc,
4030 						    if_getflags(ifp) & IFF_PROMISC);
4031 				mxge_set_multicast_list(sc);
4032 			}
4033 		} else {
4034 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4035 				mxge_close(sc, 0);
4036 			}
4037 		}
4038 		mtx_unlock(&sc->driver_mtx);
4039 		break;
4040 
4041 	case SIOCADDMULTI:
4042 	case SIOCDELMULTI:
4043 		mtx_lock(&sc->driver_mtx);
4044 		if (sc->dying) {
4045 			mtx_unlock(&sc->driver_mtx);
4046 			return (EINVAL);
4047 		}
4048 		mxge_set_multicast_list(sc);
4049 		mtx_unlock(&sc->driver_mtx);
4050 		break;
4051 
4052 	case SIOCSIFCAP:
4053 		mtx_lock(&sc->driver_mtx);
4054 		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
4055 		if (mask & IFCAP_TXCSUM) {
4056 			if (IFCAP_TXCSUM & if_getcapenable(ifp)) {
4057 				mask &= ~IFCAP_TSO4;
4058 				if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM|IFCAP_TSO4));
4059 				if_sethwassistbits(ifp, 0, (CSUM_TCP | CSUM_UDP));
4060 			} else {
4061 				if_setcapenablebit(ifp, IFCAP_TXCSUM, 0);
4062 				if_sethwassistbits(ifp, (CSUM_TCP | CSUM_UDP), 0);
4063 			}
4064 		}
4065 		if (mask & IFCAP_RXCSUM) {
4066 			if (IFCAP_RXCSUM & if_getcapenable(ifp)) {
4067 				if_setcapenablebit(ifp, 0, IFCAP_RXCSUM);
4068 			} else {
4069 				if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);
4070 			}
4071 		}
4072 		if (mask & IFCAP_TSO4) {
4073 			if (IFCAP_TSO4 & if_getcapenable(ifp)) {
4074 				if_setcapenablebit(ifp, 0, IFCAP_TSO4);
4075 			} else if (IFCAP_TXCSUM & if_getcapenable(ifp)) {
4076 				if_setcapenablebit(ifp, IFCAP_TSO4, 0);
4077 				if_sethwassistbits(ifp, CSUM_TSO, 0);
4078 			} else {
4079 				printf("mxge requires tx checksum offload"
4080 				       " be enabled to use TSO\n");
4081 				err = EINVAL;
4082 			}
4083 		}
4084 #if IFCAP_TSO6
4085 		if (mask & IFCAP_TXCSUM_IPV6) {
4086 			if (IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp)) {
4087 				mask &= ~IFCAP_TSO6;
4088 				if_setcapenablebit(ifp, 0,
4089 				    IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
4090 				if_sethwassistbits(ifp, 0,
4091 				    CSUM_TCP_IPV6 | CSUM_UDP_IPV6);
4092 			} else {
4093 				if_setcapenablebit(ifp, IFCAP_TXCSUM_IPV6, 0);
4094 				if_sethwassistbits(ifp,
4095 				    CSUM_TCP_IPV6 | CSUM_UDP_IPV6, 0);
4096 			}
4097 		}
4098 		if (mask & IFCAP_RXCSUM_IPV6) {
4099 			if (IFCAP_RXCSUM_IPV6 & if_getcapenable(ifp)) {
4100 				if_setcapenablebit(ifp, 0, IFCAP_RXCSUM_IPV6);
4101 			} else {
4102 				if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0);
4103 			}
4104 		}
4105 		if (mask & IFCAP_TSO6) {
4106 			if (IFCAP_TSO6 & if_getcapenable(ifp)) {
4107 				if_setcapenablebit(ifp, 0, IFCAP_TSO6);
4108 			} else if (IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp)) {
4109 				if_setcapenablebit(ifp, IFCAP_TSO6, 0);
4110 				if_sethwassistbits(ifp, CSUM_TSO, 0);
4111 			} else {
4112 				printf("mxge requires tx checksum offload"
4113 				       " be enabled to use TSO\n");
4114 				err = EINVAL;
4115 			}
4116 		}
4117 #endif /* IFCAP_TSO6 */
4118 
4119 		if (mask & IFCAP_LRO)
4120 			if_togglecapenable(ifp, IFCAP_LRO);
4121 		if (mask & IFCAP_VLAN_HWTAGGING)
4122 			if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING);
4123 		if (mask & IFCAP_VLAN_HWTSO)
4124 			if_togglecapenable(ifp, IFCAP_VLAN_HWTSO);
4125 
4126 		if (!(if_getcapabilities(ifp) & IFCAP_VLAN_HWTSO) ||
4127 		    !(if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING))
4128 			if_setcapenablebit(ifp, 0, IFCAP_VLAN_HWTSO);
4129 
4130 		mtx_unlock(&sc->driver_mtx);
4131 		VLAN_CAPABILITIES(ifp);
4132 
4133 		break;
4134 
4135 	case SIOCGIFMEDIA:
4136 		mtx_lock(&sc->driver_mtx);
4137 		if (sc->dying) {
4138 			mtx_unlock(&sc->driver_mtx);
4139 			return (EINVAL);
4140 		}
4141 		mxge_media_probe(sc);
4142 		mtx_unlock(&sc->driver_mtx);
4143 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4144 				    &sc->media, command);
4145 		break;
4146 
4147 	case SIOCGI2C:
4148 		if (sc->connector != MXGE_XFP &&
4149 		    sc->connector != MXGE_SFP) {
4150 			err = ENXIO;
4151 			break;
4152 		}
4153 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4154 		if (err != 0)
4155 			break;
4156 		mtx_lock(&sc->driver_mtx);
4157 		if (sc->dying) {
4158 			mtx_unlock(&sc->driver_mtx);
4159 			return (EINVAL);
4160 		}
4161 		err = mxge_fetch_i2c(sc, &i2c);
4162 		mtx_unlock(&sc->driver_mtx);
4163 		if (err == 0)
4164 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4165 			    sizeof(i2c));
4166 		break;
4167 	default:
4168 		err = ether_ioctl(ifp, command, data);
4169 		break;
4170 	}
4171 	return err;
4172 }
4173 
4174 static void
4175 mxge_fetch_tunables(mxge_softc_t *sc)
4176 {
4177 
4178 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4179 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4180 			  &mxge_flow_control);
4181 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4182 			  &mxge_intr_coal_delay);
4183 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4184 			  &mxge_nvidia_ecrc_enable);
4185 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4186 			  &mxge_force_firmware);
4187 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4188 			  &mxge_deassert_wait);
4189 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4190 			  &mxge_verbose);
4191 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4192 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4193 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4194 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4195 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4196 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4197 
4198 	if (bootverbose)
4199 		mxge_verbose = 1;
4200 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4201 		mxge_intr_coal_delay = 30;
4202 	if (mxge_ticks == 0)
4203 		mxge_ticks = hz / 2;
4204 	sc->pause = mxge_flow_control;
4205 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4206 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4207 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4208 	}
4209 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4210 	    mxge_initial_mtu < ETHER_MIN_LEN)
4211 		mxge_initial_mtu = ETHERMTU_JUMBO;
4212 
4213 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4214 		mxge_throttle = MXGE_MAX_THROTTLE;
4215 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4216 		mxge_throttle = MXGE_MIN_THROTTLE;
4217 	sc->throttle = mxge_throttle;
4218 }
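
/*
 * example: these tunables are fetched before attach completes, so they
 * can be set from /boot/loader.conf, e.g.:
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="25"
 */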
4219 
4220 static void
4221 mxge_free_slices(mxge_softc_t *sc)
4222 {
4223 	struct mxge_slice_state *ss;
4224 	int i;
4225 
4226 	if (sc->ss == NULL)
4227 		return;
4228 
4229 	for (i = 0; i < sc->num_slices; i++) {
4230 		ss = &sc->ss[i];
4231 		if (ss->fw_stats != NULL) {
4232 			mxge_dma_free(&ss->fw_stats_dma);
4233 			ss->fw_stats = NULL;
4234 			if (ss->tx.br != NULL) {
4235 				drbr_free(ss->tx.br, M_DEVBUF);
4236 				ss->tx.br = NULL;
4237 			}
4238 			mtx_destroy(&ss->tx.mtx);
4239 		}
4240 		if (ss->rx_done.entry != NULL) {
4241 			mxge_dma_free(&ss->rx_done.dma);
4242 			ss->rx_done.entry = NULL;
4243 		}
4244 	}
4245 	free(sc->ss, M_DEVBUF);
4246 	sc->ss = NULL;
4247 }
4248 
4249 static int
4250 mxge_alloc_slices(mxge_softc_t *sc)
4251 {
4252 	mxge_cmd_t cmd;
4253 	struct mxge_slice_state *ss;
4254 	size_t bytes;
4255 	int err, i, max_intr_slots;
4256 
4257 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4258 	if (err != 0) {
4259 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4260 		return err;
4261 	}
4262 	sc->rx_ring_size = cmd.data0;
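	/*
	 * rx_ring_size is in bytes; dividing by the size of a receive
	 * descriptor gives the number of rx slots, and the interrupt
	 * completion queue is sized at twice that count
	 */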
4263 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4264 
4265 	bytes = sizeof (*sc->ss) * sc->num_slices;
4266 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4267 	if (sc->ss == NULL)
4268 		return (ENOMEM);
4269 	for (i = 0; i < sc->num_slices; i++) {
4270 		ss = &sc->ss[i];
4271 
4272 		ss->sc = sc;
4273 
4274 		/* allocate per-slice rx interrupt queues */
4275 
4276 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4277 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4278 		if (err != 0)
4279 			goto abort;
4280 		ss->rx_done.entry = ss->rx_done.dma.addr;
4281 		bzero(ss->rx_done.entry, bytes);
4282 
4283 		/*
4284 		 * allocate the per-slice firmware stats; stats
4285 		 * (including tx) are used only on the first
4286 		 * slice for now
4287 		 */
4288 
4289 		bytes = sizeof (*ss->fw_stats);
4290 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4291 				     bytes, 64);
4292 		if (err != 0)
4293 			goto abort;
4294 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4295 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4296 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4297 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4298 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4299 					   &ss->tx.mtx);
4300 	}
4301 
4302 	return (0);
4303 
4304 abort:
4305 	mxge_free_slices(sc);
4306 	return (ENOMEM);
4307 }
4308 
4309 static void
4310 mxge_slice_probe(mxge_softc_t *sc)
4311 {
4312 	mxge_cmd_t cmd;
4313 	char *old_fw;
4314 	int msix_cnt, status, max_intr_slots;
4315 
4316 	sc->num_slices = 1;
4317 	/*
4318 	 * don't enable multiple slices if the tunable disables them,
4319 	 * or if this is not an SMP system
4320 	 */
4321 
4322 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4323 		return;
4324 
4325 	/* see how many MSI-X interrupts are available */
4326 	msix_cnt = pci_msix_count(sc->dev);
4327 	if (msix_cnt < 2)
4328 		return;
4329 
4330 	/* now load the slice-aware firmware and see what it supports */
4331 	old_fw = sc->fw_name;
4332 	if (old_fw == mxge_fw_aligned)
4333 		sc->fw_name = mxge_fw_rss_aligned;
4334 	else
4335 		sc->fw_name = mxge_fw_rss_unaligned;
4336 	status = mxge_load_firmware(sc, 0);
4337 	if (status != 0) {
4338 		device_printf(sc->dev, "Falling back to a single slice\n");
4339 		return;
4340 	}
4341 
4342 	/* try to send a reset command to the card to see if it
4343 	   is alive */
4344 	memset(&cmd, 0, sizeof (cmd));
4345 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4346 	if (status != 0) {
4347 		device_printf(sc->dev, "failed reset\n");
4348 		goto abort_with_fw;
4349 	}
4350 
4351 	/* get rx ring size */
4352 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4353 	if (status != 0) {
4354 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4355 		goto abort_with_fw;
4356 	}
4357 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4358 
4359 	/* tell it the size of the interrupt queues */
4360 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4361 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4362 	if (status != 0) {
4363 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4364 		goto abort_with_fw;
4365 	}
4366 
4367 	/* ask for the maximum number of slices it supports */
4368 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4369 	if (status != 0) {
4370 		device_printf(sc->dev,
4371 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4372 		goto abort_with_fw;
4373 	}
4374 	sc->num_slices = cmd.data0;
4375 	if (sc->num_slices > msix_cnt)
4376 		sc->num_slices = msix_cnt;
4377 
4378 	if (mxge_max_slices == -1) {
4379 		/* cap to number of CPUs in system */
4380 		if (sc->num_slices > mp_ncpus)
4381 			sc->num_slices = mp_ncpus;
4382 	} else {
4383 		if (sc->num_slices > mxge_max_slices)
4384 			sc->num_slices = mxge_max_slices;
4385 	}
4386 	/* make sure it is a power of two */
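	/* e.g. 6 -> 5 -> 4: n & (n - 1) is nonzero until n is a power of two */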
4387 	while (sc->num_slices & (sc->num_slices - 1))
4388 		sc->num_slices--;
4389 
4390 	if (mxge_verbose)
4391 		device_printf(sc->dev, "using %d slices\n",
4392 			      sc->num_slices);
4393 
4394 	return;
4395 
4396 abort_with_fw:
4397 	sc->fw_name = old_fw;
4398 	(void) mxge_load_firmware(sc, 0);
4399 }
4400 
4401 static int
4402 mxge_add_msix_irqs(mxge_softc_t *sc)
4403 {
4404 	size_t bytes;
4405 	int count, err, i, rid;
4406 
4407 	rid = PCIR_BAR(2);
4408 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4409 						    &rid, RF_ACTIVE);
4410 
4411 	if (sc->msix_table_res == NULL) {
4412 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4413 		return ENXIO;
4414 	}
4415 
4416 	count = sc->num_slices;
4417 	err = pci_alloc_msix(sc->dev, &count);
4418 	if (err != 0) {
4419 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4420 			      "err = %d\n", sc->num_slices, err);
4421 		goto abort_with_msix_table;
4422 	}
4423 	if (count < sc->num_slices) {
4424 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4425 			      sc->num_slices, count);
4426 		device_printf(sc->dev,
4427 			      "Try setting hw.mxge.max_slices to %d\n",
4428 			      count);
4429 		err = ENOSPC;
4430 		goto abort_with_msix;
4431 	}
4432 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4433 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4434 	if (sc->msix_irq_res == NULL) {
4435 		err = ENOMEM;
4436 		goto abort_with_msix;
4437 	}
4438 
4439 	for (i = 0; i < sc->num_slices; i++) {
4440 		rid = i + 1;
4441 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4442 							  SYS_RES_IRQ,
4443 							  &rid, RF_ACTIVE);
4444 		if (sc->msix_irq_res[i] == NULL) {
4445 			device_printf(sc->dev, "couldn't allocate IRQ res"
4446 				      " for message %d\n", i);
4447 			err = ENXIO;
4448 			goto abort_with_res;
4449 		}
4450 	}
4451 
4452 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4453 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4454 
4455 	for (i = 0; i < sc->num_slices; i++) {
4456 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4457 				     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4458 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4459 		if (err != 0) {
4460 			device_printf(sc->dev, "couldn't setup intr for "
4461 				      "message %d\n", i);
4462 			goto abort_with_intr;
4463 		}
4464 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4465 				  sc->msix_ih[i], "s%d", i);
4466 	}
4467 
4468 	if (mxge_verbose) {
4469 		device_printf(sc->dev, "using %d msix IRQs:",
4470 			      sc->num_slices);
4471 		for (i = 0; i < sc->num_slices; i++)
4472 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4473 		printf("\n");
4474 	}
4475 	return (0);
4476 
4477 abort_with_intr:
4478 	for (i = 0; i < sc->num_slices; i++) {
4479 		if (sc->msix_ih[i] != NULL) {
4480 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4481 					  sc->msix_ih[i]);
4482 			sc->msix_ih[i] = NULL;
4483 		}
4484 	}
4485 	free(sc->msix_ih, M_DEVBUF);
4486 
4487 abort_with_res:
4488 	for (i = 0; i < sc->num_slices; i++) {
4489 		rid = i + 1;
4490 		if (sc->msix_irq_res[i] != NULL)
4491 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4492 					     sc->msix_irq_res[i]);
4493 		sc->msix_irq_res[i] = NULL;
4494 	}
4495 	free(sc->msix_irq_res, M_DEVBUF);
4496 
4497 abort_with_msix:
4498 	pci_release_msi(sc->dev);
4499 
4500 abort_with_msix_table:
4501 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4502 			     sc->msix_table_res);
4503 
4504 	return err;
4505 }
4506 
4507 static int
4508 mxge_add_single_irq(mxge_softc_t *sc)
4509 {
4510 	int count, err, rid;
4511 
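	/* an MSI vector uses rid 1; the legacy INTx line is rid 0 */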
4512 	count = pci_msi_count(sc->dev);
4513 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4514 		rid = 1;
4515 	} else {
4516 		rid = 0;
4517 		sc->legacy_irq = 1;
4518 	}
4519 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4520 					     RF_SHAREABLE | RF_ACTIVE);
4521 	if (sc->irq_res == NULL) {
4522 		device_printf(sc->dev, "could not alloc interrupt\n");
4523 		return ENXIO;
4524 	}
4525 	if (mxge_verbose)
4526 		device_printf(sc->dev, "using %s irq %jd\n",
4527 			      sc->legacy_irq ? "INTx" : "MSI",
4528 			      rman_get_start(sc->irq_res));
4529 	err = bus_setup_intr(sc->dev, sc->irq_res,
4530 			     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4531 			     mxge_intr, &sc->ss[0], &sc->ih);
4532 	if (err != 0) {
4533 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4534 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4535 		if (!sc->legacy_irq)
4536 			pci_release_msi(sc->dev);
4537 	}
4538 	return err;
4539 }
4540 
4541 static void
4542 mxge_rem_msix_irqs(mxge_softc_t *sc)
4543 {
4544 	int i, rid;
4545 
4546 	for (i = 0; i < sc->num_slices; i++) {
4547 		if (sc->msix_ih[i] != NULL) {
4548 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4549 					  sc->msix_ih[i]);
4550 			sc->msix_ih[i] = NULL;
4551 		}
4552 	}
4553 	free(sc->msix_ih, M_DEVBUF);
4554 
4555 	for (i = 0; i < sc->num_slices; i++) {
4556 		rid = i + 1;
4557 		if (sc->msix_irq_res[i] != NULL)
4558 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4559 					     sc->msix_irq_res[i]);
4560 		sc->msix_irq_res[i] = NULL;
4561 	}
4562 	free(sc->msix_irq_res, M_DEVBUF);
4563 
4564 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4565 			     sc->msix_table_res);
4566 
4567 	pci_release_msi(sc->dev);
4568 	return;
4569 }
4570 
4571 static void
4572 mxge_rem_single_irq(mxge_softc_t *sc)
4573 {
4574 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4575 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4576 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4577 	if (!sc->legacy_irq)
4578 		pci_release_msi(sc->dev);
4579 }
4580 
4581 static void
4582 mxge_rem_irq(mxge_softc_t *sc)
4583 {
4584 	if (sc->num_slices > 1)
4585 		mxge_rem_msix_irqs(sc);
4586 	else
4587 		mxge_rem_single_irq(sc);
4588 }
4589 
4590 static int
4591 mxge_add_irq(mxge_softc_t *sc)
4592 {
4593 	int err;
4594 
4595 	if (sc->num_slices > 1)
4596 		err = mxge_add_msix_irqs(sc);
4597 	else
4598 		err = mxge_add_single_irq(sc);
4599 
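	/* XXX disabled ("0 &&"): debug path to tear down and re-add the MSI-X vectors */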
4600 	if (0 && err == 0 && sc->num_slices > 1) {
4601 		mxge_rem_msix_irqs(sc);
4602 		err = mxge_add_msix_irqs(sc);
4603 	}
4604 	return err;
4605 }
4606 
4607 static int
4608 mxge_attach(device_t dev)
4609 {
4610 	mxge_cmd_t cmd;
4611 	mxge_softc_t *sc = device_get_softc(dev);
4612 	if_t ifp;
4613 	int err, rid;
4614 
4615 	sc->dev = dev;
4616 	mxge_fetch_tunables(sc);
4617 
4618 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4619 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4620 				  taskqueue_thread_enqueue, &sc->tq);
4621 	if (sc->tq == NULL) {
4622 		err = ENOMEM;
4623 		goto abort_with_nothing;
4624 	}
4625 
4626 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4627 				 1,			/* alignment */
4628 				 0,			/* boundary */
4629 				 BUS_SPACE_MAXADDR,	/* low */
4630 				 BUS_SPACE_MAXADDR,	/* high */
4631 				 NULL, NULL,		/* filter */
4632 				 65536 + 256,		/* maxsize */
4633 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4634 				 65536,			/* maxsegsize */
4635 				 0,			/* flags */
4636 				 NULL, NULL,		/* lock */
4637 				 &sc->parent_dmat);	/* tag */
4638 
4639 	if (err != 0) {
4640 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4641 			      err);
4642 		goto abort_with_tq;
4643 	}
4644 
4645 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4646 	if (ifp == NULL) {
4647 		device_printf(dev, "can not if_alloc()\n");
4648 		err = ENOSPC;
4649 		goto abort_with_parent_dmat;
4650 	}
4651 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4652 
4653 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4654 		 device_get_nameunit(dev));
4655 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4656 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4657 		 "%s:drv", device_get_nameunit(dev));
4658 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4659 		 MTX_NETWORK_LOCK, MTX_DEF);
4660 
4661 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4662 
4663 	mxge_setup_cfg_space(sc);
4664 
4665 	/* Map the board into the kernel */
4666 	rid = PCIR_BARS;
4667 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4668 					     RF_ACTIVE);
4669 	if (sc->mem_res == NULL) {
4670 		device_printf(dev, "could not map memory\n");
4671 		err = ENXIO;
4672 		goto abort_with_lock;
4673 	}
4674 	sc->sram = rman_get_virtual(sc->mem_res);
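	/*
	 * usable SRAM: 2MB total, less what appear to be reserved
	 * regions at the top (two 48KB blocks and one 32KB block) and
	 * a 0x100 byte pad
	 */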
4675 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4676 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4677 		device_printf(dev, "impossible memory region size %jd\n",
4678 			      rman_get_size(sc->mem_res));
4679 		err = ENXIO;
4680 		goto abort_with_mem_res;
4681 	}
4682 
4683 	/* make NULL terminated copy of the EEPROM strings section of
4684 	   lanai SRAM */
4685 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4686 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4687 				rman_get_bushandle(sc->mem_res),
4688 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4689 				sc->eeprom_strings,
4690 				MXGE_EEPROM_STRINGS_SIZE - 2);
4691 	err = mxge_parse_strings(sc);
4692 	if (err != 0)
4693 		goto abort_with_mem_res;
4694 
4695 	/* Enable write combining for efficient use of PCIe bus */
4696 	mxge_enable_wc(sc);
4697 
4698 	/* Allocate the out of band dma memory */
4699 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4700 			     sizeof (mxge_cmd_t), 64);
4701 	if (err != 0)
4702 		goto abort_with_mem_res;
4703 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4704 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4705 	if (err != 0)
4706 		goto abort_with_cmd_dma;
4707 
4708 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4709 	if (err != 0)
4710 		goto abort_with_zeropad_dma;
4711 
4712 	/* select & load the firmware */
4713 	err = mxge_select_firmware(sc);
4714 	if (err != 0)
4715 		goto abort_with_dmabench;
4716 	sc->intr_coal_delay = mxge_intr_coal_delay;
4717 
4718 	mxge_slice_probe(sc);
4719 	err = mxge_alloc_slices(sc);
4720 	if (err != 0)
4721 		goto abort_with_dmabench;
4722 
4723 	err = mxge_reset(sc, 0);
4724 	if (err != 0)
4725 		goto abort_with_slices;
4726 
4727 	err = mxge_alloc_rings(sc);
4728 	if (err != 0) {
4729 		device_printf(sc->dev, "failed to allocate rings\n");
4730 		goto abort_with_slices;
4731 	}
4732 
4733 	err = mxge_add_irq(sc);
4734 	if (err != 0) {
4735 		device_printf(sc->dev, "failed to add irq\n");
4736 		goto abort_with_rings;
4737 	}
4738 
4739 	if_setbaudrate(ifp, IF_Gbps(10));
4740 	if_setcapabilities(ifp, IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4741 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4742 		IFCAP_RXCSUM_IPV6);
4743 #if defined(INET) || defined(INET6)
4744 	if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
4745 #endif
4746 
4747 #ifdef MXGE_NEW_VLAN_API
4748 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM, 0);
4749 
4750 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4751 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4752 	    sc->fw_ver_tiny >= 32)
4753 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTSO, 0);
4754 #endif
4755 	sc->max_mtu = mxge_max_mtu(sc);
4756 	if (sc->max_mtu >= 9000)
4757 		if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
4758 	else
4759 		device_printf(dev, "MTU limited to %d.  Install "
4760 			      "latest firmware for 9000 byte jumbo support\n",
4761 			      sc->max_mtu - ETHER_HDR_LEN);
4762 	if_sethwassist(ifp, CSUM_TCP | CSUM_UDP | CSUM_TSO);
4763 	if_sethwassistbits(ifp, CSUM_TCP_IPV6 | CSUM_UDP_IPV6, 0);
4764 	/* check to see if f/w supports TSO for IPv6 */
4765 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4766 		if (CSUM_TCP_IPV6)
4767 			if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
4768 		sc->max_tso6_hlen = min(cmd.data0,
4769 					sizeof (sc->ss[0].scratch));
4770 	}
4771 	if_setcapenable(ifp, if_getcapabilities(ifp));
4772 	if (sc->lro_cnt == 0)
4773 		if_setcapenablebit(ifp, 0, IFCAP_LRO);
4774 	if_setinitfn(ifp, mxge_init);
4775 	if_setsoftc(ifp, sc);
4776 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
4777 	if_setioctlfn(ifp, mxge_ioctl);
4778 	if_setstartfn(ifp, mxge_start);
4779 	if_setgetcounterfn(ifp, mxge_get_counter);
4780 	if_sethwtsomax(ifp, IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
4781 	if_sethwtsomaxsegcount(ifp, sc->ss[0].tx.max_desc);
4782 	if_sethwtsomaxsegsize(ifp, IP_MAXPACKET);
4783 	/* Initialise the ifmedia structure */
4784 	ifmedia_init(&sc->media, 0, mxge_media_change,
4785 		     mxge_media_status);
4786 	mxge_media_init(sc);
4787 	mxge_media_probe(sc);
4788 	sc->dying = 0;
4789 	ether_ifattach(ifp, sc->mac_addr);
4790 	/* ether_ifattach sets mtu to ETHERMTU */
4791 	if (mxge_initial_mtu != ETHERMTU)
4792 		mxge_change_mtu(sc, mxge_initial_mtu);
4793 
4794 	mxge_add_sysctls(sc);
4795 	if_settransmitfn(ifp, mxge_transmit);
4796 	if_setqflushfn(ifp, mxge_qflush);
4797 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4798 				device_get_nameunit(sc->dev));
4799 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4800 	return 0;
4801 
4802 abort_with_rings:
4803 	mxge_free_rings(sc);
4804 abort_with_slices:
4805 	mxge_free_slices(sc);
4806 abort_with_dmabench:
4807 	mxge_dma_free(&sc->dmabench_dma);
4808 abort_with_zeropad_dma:
4809 	mxge_dma_free(&sc->zeropad_dma);
4810 abort_with_cmd_dma:
4811 	mxge_dma_free(&sc->cmd_dma);
4812 abort_with_mem_res:
4813 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4814 abort_with_lock:
4815 	pci_disable_busmaster(dev);
4816 	mtx_destroy(&sc->cmd_mtx);
4817 	mtx_destroy(&sc->driver_mtx);
4818 	if_free(ifp);
4819 abort_with_parent_dmat:
4820 	bus_dma_tag_destroy(sc->parent_dmat);
4821 abort_with_tq:
4822 	if (sc->tq != NULL) {
4823 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4824 		taskqueue_free(sc->tq);
4825 		sc->tq = NULL;
4826 	}
4827 abort_with_nothing:
4828 	return err;
4829 }
4830 
4831 static int
4832 mxge_detach(device_t dev)
4833 {
4834 	mxge_softc_t *sc = device_get_softc(dev);
4835 
4836 	if (mxge_vlans_active(sc)) {
4837 		device_printf(sc->dev,
4838 			      "Detach vlans before removing module\n");
4839 		return EBUSY;
4840 	}
4841 	mtx_lock(&sc->driver_mtx);
4842 	sc->dying = 1;
4843 	if (if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING)
4844 		mxge_close(sc, 0);
4845 	mtx_unlock(&sc->driver_mtx);
4846 	ether_ifdetach(sc->ifp);
4847 	if (sc->tq != NULL) {
4848 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4849 		taskqueue_free(sc->tq);
4850 		sc->tq = NULL;
4851 	}
4852 	callout_drain(&sc->co_hdl);
4853 	ifmedia_removeall(&sc->media);
4854 	mxge_dummy_rdma(sc, 0);
4855 	mxge_rem_sysctls(sc);
4856 	mxge_rem_irq(sc);
4857 	mxge_free_rings(sc);
4858 	mxge_free_slices(sc);
4859 	mxge_dma_free(&sc->dmabench_dma);
4860 	mxge_dma_free(&sc->zeropad_dma);
4861 	mxge_dma_free(&sc->cmd_dma);
4862 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4863 	pci_disable_busmaster(dev);
4864 	mtx_destroy(&sc->cmd_mtx);
4865 	mtx_destroy(&sc->driver_mtx);
4866 	if_free(sc->ifp);
4867 	bus_dma_tag_destroy(sc->parent_dmat);
4868 	return 0;
4869 }
4870 
4871 static int
4872 mxge_shutdown(device_t dev)
4873 {
4874 	return 0;
4875 }
4876 
4877 /*
4878   This file uses Myri10GE driver indentation.
4879 
4880   Local Variables:
4881   c-file-style:"linux"
4882   tab-width:8
4883   End:
4884 */
4885