xref: /freebsd/sys/dev/mxge/if_mxge.c (revision b9128a37faafede823eb456aa65a11ac69997284)
1 /******************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause
3 
4 Copyright (c) 2006-2013, Myricom Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/linker.h>
34 #include <sys/firmware.h>
35 #include <sys/endian.h>
36 #include <sys/sockio.h>
37 #include <sys/mbuf.h>
38 #include <sys/malloc.h>
39 #include <sys/kdb.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/module.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/sx.h>
46 #include <sys/taskqueue.h>
47 #include <contrib/zlib/zlib.h>
48 #include <dev/zlib/zcalloc.h>
49 
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/if_arp.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56 
57 #include <net/bpf.h>
58 
59 #include <net/if_types.h>
60 #include <net/if_vlan_var.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/ip6.h>
66 #include <netinet/tcp.h>
67 #include <netinet/tcp_lro.h>
68 #include <netinet6/ip6_var.h>
69 
70 #include <machine/bus.h>
71 #include <machine/in_cksum.h>
72 #include <machine/resource.h>
73 #include <sys/bus.h>
74 #include <sys/rman.h>
75 #include <sys/smp.h>
76 
77 #include <dev/pci/pcireg.h>
78 #include <dev/pci/pcivar.h>
79 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
80 
81 #include <vm/vm.h>		/* for pmap_mapdev() */
82 #include <vm/pmap.h>
83 
84 #if defined(__i386) || defined(__amd64)
85 #include <machine/specialreg.h>
86 #endif
87 
88 #include <dev/mxge/mxge_mcp.h>
89 #include <dev/mxge/mcp_gen_header.h>
90 /*#define MXGE_FAKE_IFP*/
91 #include <dev/mxge/if_mxge_var.h>
92 #include <sys/buf_ring.h>
93 
94 #include "opt_inet.h"
95 #include "opt_inet6.h"
96 
97 /* tunable params */
98 static int mxge_nvidia_ecrc_enable = 1;
99 static int mxge_force_firmware = 0;
100 static int mxge_intr_coal_delay = 30;
101 static int mxge_deassert_wait = 1;
102 static int mxge_flow_control = 1;
103 static int mxge_verbose = 0;
104 static int mxge_ticks;
105 static int mxge_max_slices = 1;
106 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
107 static int mxge_always_promisc = 0;
108 static int mxge_initial_mtu = ETHERMTU_JUMBO;
109 static int mxge_throttle = 0;
110 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
111 static char *mxge_fw_aligned = "mxge_eth_z8e";
112 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
113 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
114 
115 static int mxge_probe(device_t dev);
116 static int mxge_attach(device_t dev);
117 static int mxge_detach(device_t dev);
118 static int mxge_shutdown(device_t dev);
119 static void mxge_intr(void *arg);
120 
121 static device_method_t mxge_methods[] =
122 {
123   /* Device interface */
124   DEVMETHOD(device_probe, mxge_probe),
125   DEVMETHOD(device_attach, mxge_attach),
126   DEVMETHOD(device_detach, mxge_detach),
127   DEVMETHOD(device_shutdown, mxge_shutdown),
128 
129   DEVMETHOD_END
130 };
131 
132 static driver_t mxge_driver =
133 {
134   "mxge",
135   mxge_methods,
136   sizeof(mxge_softc_t),
137 };
138 
139 /* Declare ourselves to be a child of the PCI bus.*/
140 DRIVER_MODULE(mxge, pci, mxge_driver, 0, 0);
141 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
142 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
143 
144 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
145 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
146 static int mxge_close(mxge_softc_t *sc, int down);
147 static int mxge_open(mxge_softc_t *sc);
148 static void mxge_tick(void *arg);
149 
150 static int
151 mxge_probe(device_t dev)
152 {
153 	int rev;
154 
155 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
156 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
157 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
158 		rev = pci_get_revid(dev);
159 		switch (rev) {
160 		case MXGE_PCI_REV_Z8E:
161 			device_set_desc(dev, "Myri10G-PCIE-8A");
162 			break;
163 		case MXGE_PCI_REV_Z8ES:
164 			device_set_desc(dev, "Myri10G-PCIE-8B");
165 			break;
166 		default:
167 			device_set_desc(dev, "Myri10G-PCIE-8??");
168 			device_printf(dev, "Unrecognized rev %d NIC\n",
169 				      rev);
170 			break;
171 		}
172 		return 0;
173 	}
174 	return ENXIO;
175 }
176 
177 static void
178 mxge_enable_wc(mxge_softc_t *sc)
179 {
180 #if defined(__i386) || defined(__amd64)
181 	vm_offset_t len;
182 	int err;
183 
184 	sc->wc = 1;
185 	len = rman_get_size(sc->mem_res);
186 	err = pmap_change_attr((vm_offset_t) sc->sram,
187 			       len, PAT_WRITE_COMBINING);
188 	if (err != 0) {
189 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
190 			      err);
191 		sc->wc = 0;
192 	}
193 #endif
194 }
195 
196 /* callback to get our DMA address */
197 static void
198 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
199 			 int error)
200 {
201 	if (error == 0) {
202 		*(bus_addr_t *) arg = segs->ds_addr;
203 	}
204 }
205 
206 static int
207 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
208 		   bus_size_t alignment)
209 {
210 	int err;
211 	device_t dev = sc->dev;
212 	bus_size_t boundary, maxsegsize;
213 
214 	if (bytes > 4096 && alignment == 4096) {
215 		boundary = 0;
216 		maxsegsize = bytes;
217 	} else {
218 		boundary = 4096;
219 		maxsegsize = 4096;
220 	}
221 
222 	/* allocate DMAable memory tags */
223 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
224 				 alignment,		/* alignment */
225 				 boundary,		/* boundary */
226 				 BUS_SPACE_MAXADDR,	/* low */
227 				 BUS_SPACE_MAXADDR,	/* high */
228 				 NULL, NULL,		/* filter */
229 				 bytes,			/* maxsize */
230 				 1,			/* num segs */
231 				 maxsegsize,		/* maxsegsize */
232 				 BUS_DMA_COHERENT,	/* flags */
233 				 NULL, NULL,		/* lock */
234 				 &dma->dmat);		/* tag */
235 	if (err != 0) {
236 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
237 		return err;
238 	}
239 
240 	/* allocate DMAable memory & map */
241 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
242 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
243 				| BUS_DMA_ZERO),  &dma->map);
244 	if (err != 0) {
245 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
246 		goto abort_with_dmat;
247 	}
248 
249 	/* load the memory */
250 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
251 			      mxge_dmamap_callback,
252 			      (void *)&dma->bus_addr, 0);
253 	if (err != 0) {
254 		device_printf(dev, "couldn't load map (err = %d)\n", err);
255 		goto abort_with_mem;
256 	}
257 	return 0;
258 
259 abort_with_mem:
260 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
261 abort_with_dmat:
262 	(void)bus_dma_tag_destroy(dma->dmat);
263 	return err;
264 }
265 
266 static void
267 mxge_dma_free(mxge_dma_t *dma)
268 {
269 	bus_dmamap_unload(dma->dmat, dma->map);
270 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
271 	(void)bus_dma_tag_destroy(dma->dmat);
272 }
273 
274 /*
275  * The eeprom strings on the lanaiX have the format
276  * SN=x\0
277  * MAC=x:x:x:x:x:x\0
278  * PC=text\0
279  */
280 
281 static int
282 mxge_parse_strings(mxge_softc_t *sc)
283 {
284 	char *ptr;
285 	int i, found_mac, found_sn2;
286 	char *endptr;
287 
288 	ptr = sc->eeprom_strings;
289 	found_mac = 0;
290 	found_sn2 = 0;
291 	while (*ptr != '\0') {
292 		if (strncmp(ptr, "MAC=", 4) == 0) {
293 			ptr += 4;
294 			for (i = 0;;) {
295 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
296 				if (endptr - ptr != 2)
297 					goto abort;
298 				ptr = endptr;
299 				if (++i == 6)
300 					break;
301 				if (*ptr++ != ':')
302 					goto abort;
303 			}
304 			found_mac = 1;
305 		} else if (strncmp(ptr, "PC=", 3) == 0) {
306 			ptr += 3;
307 			strlcpy(sc->product_code_string, ptr,
308 			    sizeof(sc->product_code_string));
309 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
310 			ptr += 3;
311 			strlcpy(sc->serial_number_string, ptr,
312 			    sizeof(sc->serial_number_string));
313 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
314 			/* SN2 takes precedence over SN */
315 			ptr += 4;
316 			found_sn2 = 1;
317 			strlcpy(sc->serial_number_string, ptr,
318 			    sizeof(sc->serial_number_string));
319 		}
320 		while (*ptr++ != '\0') {}
321 	}
322 
323 	if (found_mac)
324 		return 0;
325 
326  abort:
327 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
328 
329 	return ENXIO;
330 }
331 
332 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
333 static void
334 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
335 {
336 	uint32_t val;
337 	unsigned long base, off;
338 	char *va, *cfgptr;
339 	device_t pdev, mcp55;
340 	uint16_t vendor_id, device_id, word;
341 	uintptr_t bus, slot, func, ivend, idev;
342 	uint32_t *ptr32;
343 
344 	if (!mxge_nvidia_ecrc_enable)
345 		return;
346 
347 	pdev = device_get_parent(device_get_parent(sc->dev));
348 	if (pdev == NULL) {
349 		device_printf(sc->dev, "could not find parent?\n");
350 		return;
351 	}
352 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
353 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
354 
355 	if (vendor_id != 0x10de)
356 		return;
357 
358 	base = 0;
359 
360 	if (device_id == 0x005d) {
361 		/* ck804, base address is magic */
362 		base = 0xe0000000UL;
363 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
364 		/* mcp55, base address stored in chipset */
365 		mcp55 = pci_find_bsf(0, 0, 0);
366 		if (mcp55 &&
367 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
368 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
369 			word = pci_read_config(mcp55, 0x90, 2);
370 			base = ((unsigned long)word & 0x7ffeU) << 25;
371 		}
372 	}
373 	if (!base)
374 		return;
375 
376 	/* XXXX
377 	   Test below is commented because it is believed that doing
378 	   config read/write beyond 0xff will access the config space
379 	   for the next larger function.  Uncomment this and remove
380 	   the hacky pmap_mapdev() way of accessing config space when
381 	   FreeBSD grows support for extended pcie config space access
382 	*/
383 #if 0
384 	/* See if we can, by some miracle, access the extended
385 	   config space */
386 	val = pci_read_config(pdev, 0x178, 4);
387 	if (val != 0xffffffff) {
388 		val |= 0x40;
389 		pci_write_config(pdev, 0x178, val, 4);
390 		return;
391 	}
392 #endif
393 	/* Rather than using normal pci config space writes, we must
394 	 * map the Nvidia config space ourselves.  This is because on
395 	 * opteron/nvidia class machine the 0xe000000 mapping is
396 	 * handled by the nvidia chipset, that means the internal PCI
397 	 * device (the on-chip northbridge), or the amd-8131 bridge
398 	 * and things behind them are not visible by this method.
399 	 */
400 
401 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 		      PCI_IVAR_BUS, &bus);
403 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 		      PCI_IVAR_SLOT, &slot);
405 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
406 		      PCI_IVAR_FUNCTION, &func);
407 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
408 		      PCI_IVAR_VENDOR, &ivend);
409 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
410 		      PCI_IVAR_DEVICE, &idev);
411 
412 	off =  base
413 		+ 0x00100000UL * (unsigned long)bus
414 		+ 0x00001000UL * (unsigned long)(func
415 						 + 8 * slot);
416 
417 	/* map it into the kernel */
418 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
419 
420 	if (va == NULL) {
421 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
422 		return;
423 	}
424 	/* get a pointer to the config space mapped into the kernel */
425 	cfgptr = va + (off & PAGE_MASK);
426 
427 	/* make sure that we can really access it */
428 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
429 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
430 	if (! (vendor_id == ivend && device_id == idev)) {
431 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
432 			      vendor_id, device_id);
433 		pmap_unmapdev(va, PAGE_SIZE);
434 		return;
435 	}
436 
437 	ptr32 = (uint32_t*)(cfgptr + 0x178);
438 	val = *ptr32;
439 
440 	if (val == 0xffffffff) {
441 		device_printf(sc->dev, "extended mapping failed\n");
442 		pmap_unmapdev(va, PAGE_SIZE);
443 		return;
444 	}
445 	*ptr32 = val | 0x40;
446 	pmap_unmapdev(va, PAGE_SIZE);
447 	if (mxge_verbose)
448 		device_printf(sc->dev,
449 			      "Enabled ECRC on upstream Nvidia bridge "
450 			      "at %d:%d:%d\n",
451 			      (int)bus, (int)slot, (int)func);
452 	return;
453 }
454 #else
455 static void
456 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
457 {
458 	device_printf(sc->dev,
459 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
460 	return;
461 }
462 #endif
463 
464 static int
465 mxge_dma_test(mxge_softc_t *sc, int test_type)
466 {
467 	mxge_cmd_t cmd;
468 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
469 	int status;
470 	uint32_t len;
471 	char *test = " ";
472 
473 	/* Run a small DMA test.
474 	 * The magic multipliers to the length tell the firmware
475 	 * to do DMA read, write, or read+write tests.  The
476 	 * results are returned in cmd.data0.  The upper 16
477 	 * bits of the return is the number of transfers completed.
478 	 * The lower 16 bits is the time in 0.5us ticks that the
479 	 * transfers took to complete.
480 	 */
481 
482 	len = sc->tx_boundary;
483 
484 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
485 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
486 	cmd.data2 = len * 0x10000;
487 	status = mxge_send_cmd(sc, test_type, &cmd);
488 	if (status != 0) {
489 		test = "read";
490 		goto abort;
491 	}
492 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
493 		(cmd.data0 & 0xffff);
494 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
495 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
496 	cmd.data2 = len * 0x1;
497 	status = mxge_send_cmd(sc, test_type, &cmd);
498 	if (status != 0) {
499 		test = "write";
500 		goto abort;
501 	}
502 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
503 		(cmd.data0 & 0xffff);
504 
505 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
506 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
507 	cmd.data2 = len * 0x10001;
508 	status = mxge_send_cmd(sc, test_type, &cmd);
509 	if (status != 0) {
510 		test = "read/write";
511 		goto abort;
512 	}
513 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
514 		(cmd.data0 & 0xffff);
515 
516 abort:
517 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
518 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
519 			      test, status);
520 
521 	return status;
522 }
523 
524 /*
525  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
526  * when the PCI-E Completion packets are aligned on an 8-byte
527  * boundary.  Some PCI-E chip sets always align Completion packets; on
528  * the ones that do not, the alignment can be enforced by enabling
529  * ECRC generation (if supported).
530  *
531  * When PCI-E Completion packets are not aligned, it is actually more
532  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
533  *
534  * If the driver can neither enable ECRC nor verify that it has
535  * already been enabled, then it must use a firmware image which works
536  * around unaligned completion packets (ethp_z8e.dat), and it should
537  * also ensure that it never gives the device a Read-DMA which is
538  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
539  * enabled, then the driver should use the aligned (eth_z8e.dat)
540  * firmware image, and set tx_boundary to 4KB.
541  */
542 
543 static int
544 mxge_firmware_probe(mxge_softc_t *sc)
545 {
546 	device_t dev = sc->dev;
547 	int reg, status;
548 	uint16_t pectl;
549 
550 	sc->tx_boundary = 4096;
551 	/*
552 	 * Verify the max read request size was set to 4KB
553 	 * before trying the test with 4KB.
554 	 */
555 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
556 		pectl = pci_read_config(dev, reg + 0x8, 2);
557 		if ((pectl & (5 << 12)) != (5 << 12)) {
558 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
559 				      pectl);
560 			sc->tx_boundary = 2048;
561 		}
562 	}
563 
564 	/*
565 	 * load the optimized firmware (which assumes aligned PCIe
566 	 * completions) in order to see if it works on this host.
567 	 */
568 	sc->fw_name = mxge_fw_aligned;
569 	status = mxge_load_firmware(sc, 1);
570 	if (status != 0) {
571 		return status;
572 	}
573 
574 	/*
575 	 * Enable ECRC if possible
576 	 */
577 	mxge_enable_nvidia_ecrc(sc);
578 
579 	/*
580 	 * Run a DMA test which watches for unaligned completions and
581 	 * aborts on the first one seen.  Not required on Z8ES or newer.
582 	 */
583 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
584 		return 0;
585 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
586 	if (status == 0)
587 		return 0; /* keep the aligned firmware */
588 
589 	if (status != E2BIG)
590 		device_printf(dev, "DMA test failed: %d\n", status);
591 	if (status == ENOSYS)
592 		device_printf(dev, "Falling back to ethp! "
593 			      "Please install up to date fw\n");
594 	return status;
595 }
596 
597 static int
598 mxge_select_firmware(mxge_softc_t *sc)
599 {
600 	int aligned = 0;
601 	int force_firmware = mxge_force_firmware;
602 
603 	if (sc->throttle)
604 		force_firmware = sc->throttle;
605 
606 	if (force_firmware != 0) {
607 		if (force_firmware == 1)
608 			aligned = 1;
609 		else
610 			aligned = 0;
611 		if (mxge_verbose)
612 			device_printf(sc->dev,
613 				      "Assuming %s completions (forced)\n",
614 				      aligned ? "aligned" : "unaligned");
615 		goto abort;
616 	}
617 
618 	/* if the PCIe link width is 4 or less, we can use the aligned
619 	   firmware and skip any checks */
620 	if (sc->link_width != 0 && sc->link_width <= 4) {
621 		device_printf(sc->dev,
622 			      "PCIe x%d Link, expect reduced performance\n",
623 			      sc->link_width);
624 		aligned = 1;
625 		goto abort;
626 	}
627 
628 	if (0 == mxge_firmware_probe(sc))
629 		return 0;
630 
631 abort:
632 	if (aligned) {
633 		sc->fw_name = mxge_fw_aligned;
634 		sc->tx_boundary = 4096;
635 	} else {
636 		sc->fw_name = mxge_fw_unaligned;
637 		sc->tx_boundary = 2048;
638 	}
639 	return (mxge_load_firmware(sc, 0));
640 }
641 
642 static int
643 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
644 {
645 
646 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
647 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
648 			      be32toh(hdr->mcp_type));
649 		return EIO;
650 	}
651 
652 	/* save firmware version for sysctl */
653 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
654 	if (mxge_verbose)
655 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
656 
657 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
658 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
659 
660 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
661 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
662 		device_printf(sc->dev, "Found firmware version %s\n",
663 			      sc->fw_version);
664 		device_printf(sc->dev, "Driver needs %d.%d\n",
665 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
666 		return EINVAL;
667 	}
668 	return 0;
669 
670 }
671 
672 static int
673 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
674 {
675 	z_stream zs;
676 	char *inflate_buffer;
677 	const struct firmware *fw;
678 	const mcp_gen_header_t *hdr;
679 	unsigned hdr_offset;
680 	int status;
681 	unsigned int i;
682 	size_t fw_len;
683 
684 	fw = firmware_get(sc->fw_name);
685 	if (fw == NULL) {
686 		device_printf(sc->dev, "Could not find firmware image %s\n",
687 			      sc->fw_name);
688 		return ENOENT;
689 	}
690 
691 	/* setup zlib and decompress f/w */
692 	bzero(&zs, sizeof (zs));
693 	zs.zalloc = zcalloc_nowait;
694 	zs.zfree = zcfree;
695 	status = inflateInit(&zs);
696 	if (status != Z_OK) {
697 		status = EIO;
698 		goto abort_with_fw;
699 	}
700 
701 	/* the uncompressed size is stored as the firmware version,
702 	   which would otherwise go unused */
703 	fw_len = (size_t) fw->version;
704 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
705 	if (inflate_buffer == NULL)
706 		goto abort_with_zs;
707 	zs.avail_in = fw->datasize;
708 	zs.next_in = __DECONST(char *, fw->data);
709 	zs.avail_out = fw_len;
710 	zs.next_out = inflate_buffer;
711 	status = inflate(&zs, Z_FINISH);
712 	if (status != Z_STREAM_END) {
713 		device_printf(sc->dev, "zlib %d\n", status);
714 		status = EIO;
715 		goto abort_with_buffer;
716 	}
717 
718 	/* check id */
719 	hdr_offset = htobe32(*(const uint32_t *)
720 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
721 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
722 		device_printf(sc->dev, "Bad firmware file");
723 		status = EIO;
724 		goto abort_with_buffer;
725 	}
726 	hdr = (const void*)(inflate_buffer + hdr_offset);
727 
728 	status = mxge_validate_firmware(sc, hdr);
729 	if (status != 0)
730 		goto abort_with_buffer;
731 
732 	/* Copy the inflated firmware to NIC SRAM. */
733 	for (i = 0; i < fw_len; i += 256) {
734 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
735 			      inflate_buffer + i,
736 			      min(256U, (unsigned)(fw_len - i)));
737 		wmb();
738 		(void)*sc->sram;
739 		wmb();
740 	}
741 
742 	*limit = fw_len;
743 	status = 0;
744 abort_with_buffer:
745 	free(inflate_buffer, M_TEMP);
746 abort_with_zs:
747 	inflateEnd(&zs);
748 abort_with_fw:
749 	firmware_put(fw, FIRMWARE_UNLOAD);
750 	return status;
751 }
752 
753 /*
754  * Enable or disable periodic RDMAs from the host to make certain
755  * chipsets resend dropped PCIe messages
756  */
757 
758 static void
759 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
760 {
761 	char buf_bytes[72];
762 	volatile uint32_t *confirm;
763 	volatile char *submit;
764 	uint32_t *buf, dma_low, dma_high;
765 	int i;
766 
767 	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
768 
769 	/* clear confirmation addr */
770 	confirm = (volatile uint32_t *)sc->cmd;
771 	*confirm = 0;
772 	wmb();
773 
774 	/* send an rdma command to the PCIe engine, and wait for the
775 	   response in the confirmation address.  The firmware should
776 	   write a -1 there to indicate it is alive and well
777 	*/
778 
779 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
780 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
781 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
782 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
783 	buf[2] = htobe32(0xffffffff);		/* confirm data */
784 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
785 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
786 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
787 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
788 	buf[5] = htobe32(enable);			/* enable? */
789 
790 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
791 
792 	mxge_pio_copy(submit, buf, 64);
793 	wmb();
794 	DELAY(1000);
795 	wmb();
796 	i = 0;
797 	while (*confirm != 0xffffffff && i < 20) {
798 		DELAY(1000);
799 		i++;
800 	}
801 	if (*confirm != 0xffffffff) {
802 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
803 			      (enable ? "enable" : "disable"), confirm,
804 			      *confirm);
805 	}
806 	return;
807 }
808 
809 static int
810 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
811 {
812 	mcp_cmd_t *buf;
813 	char buf_bytes[sizeof(*buf) + 8];
814 	volatile mcp_cmd_response_t *response = sc->cmd;
815 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
816 	uint32_t dma_low, dma_high;
817 	int err, sleep_total = 0;
818 
819 	/* ensure buf is aligned to 8 bytes */
820 	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
821 
822 	buf->data0 = htobe32(data->data0);
823 	buf->data1 = htobe32(data->data1);
824 	buf->data2 = htobe32(data->data2);
825 	buf->cmd = htobe32(cmd);
826 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
827 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
828 
829 	buf->response_addr.low = htobe32(dma_low);
830 	buf->response_addr.high = htobe32(dma_high);
831 	mtx_lock(&sc->cmd_mtx);
832 	response->result = 0xffffffff;
833 	wmb();
834 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
835 
836 	/* wait up to 20ms */
837 	err = EAGAIN;
838 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
839 		bus_dmamap_sync(sc->cmd_dma.dmat,
840 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
841 		wmb();
842 		switch (be32toh(response->result)) {
843 		case 0:
844 			data->data0 = be32toh(response->data);
845 			err = 0;
846 			break;
847 		case 0xffffffff:
848 			DELAY(1000);
849 			break;
850 		case MXGEFW_CMD_UNKNOWN:
851 			err = ENOSYS;
852 			break;
853 		case MXGEFW_CMD_ERROR_UNALIGNED:
854 			err = E2BIG;
855 			break;
856 		case MXGEFW_CMD_ERROR_BUSY:
857 			err = EBUSY;
858 			break;
859 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
860 			err = ENXIO;
861 			break;
862 		default:
863 			device_printf(sc->dev,
864 				      "mxge: command %d "
865 				      "failed, result = %d\n",
866 				      cmd, be32toh(response->result));
867 			err = ENXIO;
868 			break;
869 		}
870 		if (err != EAGAIN)
871 			break;
872 	}
873 	if (err == EAGAIN)
874 		device_printf(sc->dev, "mxge: command %d timed out"
875 			      "result = %d\n",
876 			      cmd, be32toh(response->result));
877 	mtx_unlock(&sc->cmd_mtx);
878 	return err;
879 }
880 
881 static int
882 mxge_adopt_running_firmware(mxge_softc_t *sc)
883 {
884 	struct mcp_gen_header *hdr;
885 	const size_t bytes = sizeof (struct mcp_gen_header);
886 	size_t hdr_offset;
887 	int status;
888 
889 	/* find running firmware header */
890 	hdr_offset = htobe32(*(volatile uint32_t *)
891 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
892 
893 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
894 		device_printf(sc->dev,
895 			      "Running firmware has bad header offset (%d)\n",
896 			      (int)hdr_offset);
897 		return EIO;
898 	}
899 
900 	/* copy header of running firmware from SRAM to host memory to
901 	 * validate firmware */
902 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
903 	if (hdr == NULL) {
904 		device_printf(sc->dev, "could not malloc firmware hdr\n");
905 		return ENOMEM;
906 	}
907 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
908 				rman_get_bushandle(sc->mem_res),
909 				hdr_offset, (char *)hdr, bytes);
910 	status = mxge_validate_firmware(sc, hdr);
911 	free(hdr, M_DEVBUF);
912 
913 	/*
914 	 * check to see if adopted firmware has bug where adopting
915 	 * it will cause broadcasts to be filtered unless the NIC
916 	 * is kept in ALLMULTI mode
917 	 */
918 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
919 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
920 		sc->adopted_rx_filter_bug = 1;
921 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
922 			      "working around rx filter bug\n",
923 			      sc->fw_ver_major, sc->fw_ver_minor,
924 			      sc->fw_ver_tiny);
925 	}
926 
927 	return status;
928 }
929 
930 static int
931 mxge_load_firmware(mxge_softc_t *sc, int adopt)
932 {
933 	volatile uint32_t *confirm;
934 	volatile char *submit;
935 	char buf_bytes[72];
936 	uint32_t *buf, size, dma_low, dma_high;
937 	int status, i;
938 
939 	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
940 
941 	size = sc->sram_size;
942 	status = mxge_load_firmware_helper(sc, &size);
943 	if (status) {
944 		if (!adopt)
945 			return status;
946 		/* Try to use the currently running firmware, if
947 		   it is new enough */
948 		status = mxge_adopt_running_firmware(sc);
949 		if (status) {
950 			device_printf(sc->dev,
951 				      "failed to adopt running firmware\n");
952 			return status;
953 		}
954 		device_printf(sc->dev,
955 			      "Successfully adopted running firmware\n");
956 		if (sc->tx_boundary == 4096) {
957 			device_printf(sc->dev,
958 				"Using firmware currently running on NIC"
959 				 ".  For optimal\n");
960 			device_printf(sc->dev,
961 				 "performance consider loading optimized "
962 				 "firmware\n");
963 		}
964 		sc->fw_name = mxge_fw_unaligned;
965 		sc->tx_boundary = 2048;
966 		return 0;
967 	}
968 	/* clear confirmation addr */
969 	confirm = (volatile uint32_t *)sc->cmd;
970 	*confirm = 0;
971 	wmb();
972 	/* send a reload command to the bootstrap MCP, and wait for the
973 	   response in the confirmation address.  The firmware should
974 	   write a -1 there to indicate it is alive and well
975 	*/
976 
977 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
978 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
979 
980 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
981 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
982 	buf[2] = htobe32(0xffffffff);	/* confirm data */
983 
984 	/* FIX: All newest firmware should un-protect the bottom of
985 	   the sram before handoff. However, the very first interfaces
986 	   do not. Therefore the handoff copy must skip the first 8 bytes
987 	*/
988 					/* where the code starts*/
989 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
990 	buf[4] = htobe32(size - 8); 	/* length of code */
991 	buf[5] = htobe32(8);		/* where to copy to */
992 	buf[6] = htobe32(0);		/* where to jump to */
993 
994 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
995 	mxge_pio_copy(submit, buf, 64);
996 	wmb();
997 	DELAY(1000);
998 	wmb();
999 	i = 0;
1000 	while (*confirm != 0xffffffff && i < 20) {
1001 		DELAY(1000*10);
1002 		i++;
1003 		bus_dmamap_sync(sc->cmd_dma.dmat,
1004 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1005 	}
1006 	if (*confirm != 0xffffffff) {
1007 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1008 			confirm, *confirm);
1009 
1010 		return ENXIO;
1011 	}
1012 	return 0;
1013 }
1014 
1015 static int
1016 mxge_update_mac_address(mxge_softc_t *sc)
1017 {
1018 	mxge_cmd_t cmd;
1019 	uint8_t *addr = sc->mac_addr;
1020 	int status;
1021 
1022 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1023 		     | (addr[2] << 8) | addr[3]);
1024 
1025 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1026 
1027 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1028 	return status;
1029 }
1030 
1031 static int
1032 mxge_change_pause(mxge_softc_t *sc, int pause)
1033 {
1034 	mxge_cmd_t cmd;
1035 	int status;
1036 
1037 	if (pause)
1038 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1039 				       &cmd);
1040 	else
1041 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1042 				       &cmd);
1043 
1044 	if (status) {
1045 		device_printf(sc->dev, "Failed to set flow control mode\n");
1046 		return ENXIO;
1047 	}
1048 	sc->pause = pause;
1049 	return 0;
1050 }
1051 
1052 static void
1053 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1054 {
1055 	mxge_cmd_t cmd;
1056 	int status;
1057 
1058 	if (mxge_always_promisc)
1059 		promisc = 1;
1060 
1061 	if (promisc)
1062 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1063 				       &cmd);
1064 	else
1065 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1066 				       &cmd);
1067 
1068 	if (status) {
1069 		device_printf(sc->dev, "Failed to set promisc mode\n");
1070 	}
1071 }
1072 
1073 struct mxge_add_maddr_ctx {
1074 	mxge_softc_t *sc;
1075 	int error;
1076 };
1077 
1078 static u_int
1079 mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
1080 {
1081 	struct mxge_add_maddr_ctx *ctx = arg;
1082 	mxge_cmd_t cmd;
1083 
1084 	if (ctx->error != 0)
1085 		return (0);
1086 	bcopy(LLADDR(sdl), &cmd.data0, 4);
1087 	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
1088 	cmd.data0 = htonl(cmd.data0);
1089 	cmd.data1 = htonl(cmd.data1);
1090 
1091 	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1092 
1093 	return (1);
1094 }
1095 
1096 static void
1097 mxge_set_multicast_list(mxge_softc_t *sc)
1098 {
1099 	struct mxge_add_maddr_ctx ctx;
1100 	if_t ifp = sc->ifp;
1101 	mxge_cmd_t cmd;
1102 	int err;
1103 
1104 	/* This firmware is known to not support multicast */
1105 	if (!sc->fw_multicast_support)
1106 		return;
1107 
1108 	/* Disable multicast filtering while we play with the lists*/
1109 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1110 	if (err != 0) {
1111 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1112 		       " error status: %d\n", err);
1113 		return;
1114 	}
1115 
1116 	if (sc->adopted_rx_filter_bug)
1117 		return;
1118 
1119 	if (if_getflags(ifp) & IFF_ALLMULTI)
1120 		/* request to disable multicast filtering, so quit here */
1121 		return;
1122 
1123 	/* Flush all the filters */
1124 
1125 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1126 	if (err != 0) {
1127 		device_printf(sc->dev,
1128 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1129 			      ", error status: %d\n", err);
1130 		return;
1131 	}
1132 
1133 	/* Walk the multicast list, and add each address */
1134 	ctx.sc = sc;
1135 	ctx.error = 0;
1136 	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
1137 	if (ctx.error != 0) {
1138 		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1139 		    "error status:" "%d\t", ctx.error);
1140 		/* abort, leaving multicast filtering off */
1141 		return;
1142 	}
1143 
1144 	/* Enable multicast filtering */
1145 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1146 	if (err != 0) {
1147 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1148 		       ", error status: %d\n", err);
1149 	}
1150 }
1151 
1152 static int
1153 mxge_max_mtu(mxge_softc_t *sc)
1154 {
1155 	mxge_cmd_t cmd;
1156 	int status;
1157 
1158 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1159 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1160 
1161 	/* try to set nbufs to see if it we can
1162 	   use virtually contiguous jumbos */
1163 	cmd.data0 = 0;
1164 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1165 			       &cmd);
1166 	if (status == 0)
1167 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1168 
1169 	/* otherwise, we're limited to MJUMPAGESIZE */
1170 	return MJUMPAGESIZE - MXGEFW_PAD;
1171 }
1172 
1173 static int
1174 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1175 {
1176 	struct mxge_slice_state *ss;
1177 	mxge_rx_done_t *rx_done;
1178 	volatile uint32_t *irq_claim;
1179 	mxge_cmd_t cmd;
1180 	int slice, status;
1181 
1182 	/* try to send a reset command to the card to see if it
1183 	   is alive */
1184 	memset(&cmd, 0, sizeof (cmd));
1185 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1186 	if (status != 0) {
1187 		device_printf(sc->dev, "failed reset\n");
1188 		return ENXIO;
1189 	}
1190 
1191 	mxge_dummy_rdma(sc, 1);
1192 
1193 	/* set the intrq size */
1194 	cmd.data0 = sc->rx_ring_size;
1195 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1196 
1197 	/*
1198 	 * Even though we already know how many slices are supported
1199 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1200 	 * has magic side effects, and must be called after a reset.
1201 	 * It must be called prior to calling any RSS related cmds,
1202 	 * including assigning an interrupt queue for anything but
1203 	 * slice 0.  It must also be called *after*
1204 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1205 	 * the firmware to compute offsets.
1206 	 */
1207 
1208 	if (sc->num_slices > 1) {
1209 		/* ask the maximum number of slices it supports */
1210 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1211 					   &cmd);
1212 		if (status != 0) {
1213 			device_printf(sc->dev,
1214 				      "failed to get number of slices\n");
1215 			return status;
1216 		}
1217 		/*
1218 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1219 		 * to setting up the interrupt queue DMA
1220 		 */
1221 		cmd.data0 = sc->num_slices;
1222 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1223 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1224 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1225 					   &cmd);
1226 		if (status != 0) {
1227 			device_printf(sc->dev,
1228 				      "failed to set number of slices\n");
1229 			return status;
1230 		}
1231 	}
1232 
1233 	if (interrupts_setup) {
1234 		/* Now exchange information about interrupts  */
1235 		for (slice = 0; slice < sc->num_slices; slice++) {
1236 			rx_done = &sc->ss[slice].rx_done;
1237 			memset(rx_done->entry, 0, sc->rx_ring_size);
1238 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1239 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1240 			cmd.data2 = slice;
1241 			status |= mxge_send_cmd(sc,
1242 						MXGEFW_CMD_SET_INTRQ_DMA,
1243 						&cmd);
1244 		}
1245 	}
1246 
1247 	status |= mxge_send_cmd(sc,
1248 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1249 
1250 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1251 
1252 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1253 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1254 
1255 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1256 				&cmd);
1257 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1258 	if (status != 0) {
1259 		device_printf(sc->dev, "failed set interrupt parameters\n");
1260 		return status;
1261 	}
1262 
1263 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1264 
1265 	/* run a DMA benchmark */
1266 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1267 
1268 	for (slice = 0; slice < sc->num_slices; slice++) {
1269 		ss = &sc->ss[slice];
1270 
1271 		ss->irq_claim = irq_claim + (2 * slice);
1272 		/* reset mcp/driver shared state back to 0 */
1273 		ss->rx_done.idx = 0;
1274 		ss->rx_done.cnt = 0;
1275 		ss->tx.req = 0;
1276 		ss->tx.done = 0;
1277 		ss->tx.pkt_done = 0;
1278 		ss->tx.queue_active = 0;
1279 		ss->tx.activate = 0;
1280 		ss->tx.deactivate = 0;
1281 		ss->tx.wake = 0;
1282 		ss->tx.defrag = 0;
1283 		ss->tx.stall = 0;
1284 		ss->rx_big.cnt = 0;
1285 		ss->rx_small.cnt = 0;
1286 		ss->lc.lro_bad_csum = 0;
1287 		ss->lc.lro_queued = 0;
1288 		ss->lc.lro_flushed = 0;
1289 		if (ss->fw_stats != NULL) {
1290 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1291 		}
1292 	}
1293 	sc->rdma_tags_available = 15;
1294 	status = mxge_update_mac_address(sc);
1295 	mxge_change_promisc(sc, if_getflags(sc->ifp) & IFF_PROMISC);
1296 	mxge_change_pause(sc, sc->pause);
1297 	mxge_set_multicast_list(sc);
1298 	if (sc->throttle) {
1299 		cmd.data0 = sc->throttle;
1300 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1301 				  &cmd)) {
1302 			device_printf(sc->dev,
1303 				      "can't enable throttle\n");
1304 		}
1305 	}
1306 	return status;
1307 }
1308 
1309 static int
1310 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1311 {
1312 	mxge_cmd_t cmd;
1313 	mxge_softc_t *sc;
1314 	int err;
1315 	unsigned int throttle;
1316 
1317 	sc = arg1;
1318 	throttle = sc->throttle;
1319 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1320 	if (err != 0) {
1321 		return err;
1322 	}
1323 
1324 	if (throttle == sc->throttle)
1325 		return 0;
1326 
1327 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1328 		return EINVAL;
1329 
1330 	mtx_lock(&sc->driver_mtx);
1331 	cmd.data0 = throttle;
1332 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1333 	if (err == 0)
1334 		sc->throttle = throttle;
1335 	mtx_unlock(&sc->driver_mtx);
1336 	return err;
1337 }
1338 
1339 static int
1340 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1341 {
1342 	mxge_softc_t *sc;
1343 	unsigned int intr_coal_delay;
1344 	int err;
1345 
1346 	sc = arg1;
1347 	intr_coal_delay = sc->intr_coal_delay;
1348 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1349 	if (err != 0) {
1350 		return err;
1351 	}
1352 	if (intr_coal_delay == sc->intr_coal_delay)
1353 		return 0;
1354 
1355 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1356 		return EINVAL;
1357 
1358 	mtx_lock(&sc->driver_mtx);
1359 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1360 	sc->intr_coal_delay = intr_coal_delay;
1361 
1362 	mtx_unlock(&sc->driver_mtx);
1363 	return err;
1364 }
1365 
1366 static int
1367 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1368 {
1369 	mxge_softc_t *sc;
1370 	unsigned int enabled;
1371 	int err;
1372 
1373 	sc = arg1;
1374 	enabled = sc->pause;
1375 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1376 	if (err != 0) {
1377 		return err;
1378 	}
1379 	if (enabled == sc->pause)
1380 		return 0;
1381 
1382 	mtx_lock(&sc->driver_mtx);
1383 	err = mxge_change_pause(sc, enabled);
1384 	mtx_unlock(&sc->driver_mtx);
1385 	return err;
1386 }
1387 
1388 static int
1389 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1390 {
1391 	int err;
1392 
1393 	if (arg1 == NULL)
1394 		return EFAULT;
1395 	arg2 = be32toh(*(int *)arg1);
1396 	arg1 = NULL;
1397 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1398 
1399 	return err;
1400 }
1401 
1402 static void
1403 mxge_rem_sysctls(mxge_softc_t *sc)
1404 {
1405 	struct mxge_slice_state *ss;
1406 	int slice;
1407 
1408 	if (sc->slice_sysctl_tree == NULL)
1409 		return;
1410 
1411 	for (slice = 0; slice < sc->num_slices; slice++) {
1412 		ss = &sc->ss[slice];
1413 		if (ss == NULL || ss->sysctl_tree == NULL)
1414 			continue;
1415 		sysctl_ctx_free(&ss->sysctl_ctx);
1416 		ss->sysctl_tree = NULL;
1417 	}
1418 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1419 	sc->slice_sysctl_tree = NULL;
1420 }
1421 
1422 static void
1423 mxge_add_sysctls(mxge_softc_t *sc)
1424 {
1425 	struct sysctl_ctx_list *ctx;
1426 	struct sysctl_oid_list *children;
1427 	mcp_irq_data_t *fw;
1428 	struct mxge_slice_state *ss;
1429 	int slice;
1430 	char slice_num[8];
1431 
1432 	ctx = device_get_sysctl_ctx(sc->dev);
1433 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1434 	fw = sc->ss[0].fw_stats;
1435 
1436 	/* random information */
1437 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1438 		       "firmware_version",
1439 		       CTLFLAG_RD, sc->fw_version,
1440 		       0, "firmware version");
1441 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1442 		       "serial_number",
1443 		       CTLFLAG_RD, sc->serial_number_string,
1444 		       0, "serial number");
1445 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1446 		       "product_code",
1447 		       CTLFLAG_RD, sc->product_code_string,
1448 		       0, "product_code");
1449 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1450 		       "pcie_link_width",
1451 		       CTLFLAG_RD, &sc->link_width,
1452 		       0, "tx_boundary");
1453 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1454 		       "tx_boundary",
1455 		       CTLFLAG_RD, &sc->tx_boundary,
1456 		       0, "tx_boundary");
1457 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1458 		       "write_combine",
1459 		       CTLFLAG_RD, &sc->wc,
1460 		       0, "write combining PIO?");
1461 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1462 		       "read_dma_MBs",
1463 		       CTLFLAG_RD, &sc->read_dma,
1464 		       0, "DMA Read speed in MB/s");
1465 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1466 		       "write_dma_MBs",
1467 		       CTLFLAG_RD, &sc->write_dma,
1468 		       0, "DMA Write speed in MB/s");
1469 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1470 		       "read_write_dma_MBs",
1471 		       CTLFLAG_RD, &sc->read_write_dma,
1472 		       0, "DMA concurrent Read/Write speed in MB/s");
1473 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1474 		       "watchdog_resets",
1475 		       CTLFLAG_RD, &sc->watchdog_resets,
1476 		       0, "Number of times NIC was reset");
1477 
1478 	/* performance related tunables */
1479 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1480 	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1481 	    sc, 0, mxge_change_intr_coal, "I",
1482 	    "interrupt coalescing delay in usecs");
1483 
1484 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1485 	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1486 	    mxge_change_throttle, "I", "transmit throttling");
1487 
1488 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1489 	    "flow_control_enabled",
1490 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1491 	    mxge_change_flow_control, "I",
1492 	    "interrupt coalescing delay in usecs");
1493 
1494 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495 		       "deassert_wait",
1496 		       CTLFLAG_RW, &mxge_deassert_wait,
1497 		       0, "Wait for IRQ line to go low in ihandler");
1498 
1499 	/* stats block from firmware is in network byte order.
1500 	   Need to swap it */
1501 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1502 	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1503 	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
1504 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1505 	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1506 	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
1507 	    "rdma_tags_available");
1508 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1509 	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1510 	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
1511 	    "dropped_bad_crc32");
1512 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513 	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1514 	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
1515 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1516 	    "dropped_link_error_or_filtered",
1517 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1518 	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
1519 	    "dropped_link_error_or_filtered");
1520 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1521 	    "dropped_link_overflow",
1522 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1523 	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
1524 	    "dropped_link_overflow");
1525 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526 	    "dropped_multicast_filtered",
1527 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1528 	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
1529 	    "dropped_multicast_filtered");
1530 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1531 	    "dropped_no_big_buffer",
1532 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1533 	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
1534 	    "dropped_no_big_buffer");
1535 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1536 	    "dropped_no_small_buffer",
1537 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1538 	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
1539 	    "dropped_no_small_buffer");
1540 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1541 	    "dropped_overrun",
1542 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1543 	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
1544 	    "dropped_overrun");
1545 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1547 	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
1548 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1550 	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");
1551 
1552 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1553 	    "dropped_unicast_filtered",
1554 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1555 	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
1556 	    "dropped_unicast_filtered");
1557 
1558 	/* verbose printing? */
1559 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1560 		       "verbose",
1561 		       CTLFLAG_RW, &mxge_verbose,
1562 		       0, "verbose printing");
1563 
1564 	/* add counters exported for debugging from all slices */
1565 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1566 	sc->slice_sysctl_tree =
1567 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1568 		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1569 
1570 	for (slice = 0; slice < sc->num_slices; slice++) {
1571 		ss = &sc->ss[slice];
1572 		sysctl_ctx_init(&ss->sysctl_ctx);
1573 		ctx = &ss->sysctl_ctx;
1574 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1575 		sprintf(slice_num, "%d", slice);
1576 		ss->sysctl_tree =
1577 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1578 			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1579 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1580 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1581 			       "rx_small_cnt",
1582 			       CTLFLAG_RD, &ss->rx_small.cnt,
1583 			       0, "rx_small_cnt");
1584 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1585 			       "rx_big_cnt",
1586 			       CTLFLAG_RD, &ss->rx_big.cnt,
1587 			       0, "rx_small_cnt");
1588 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1589 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1590 			       0, "number of lro merge queues flushed");
1591 
1592 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1593 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1594 			       0, "number of bad csums preventing LRO");
1595 
1596 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1597 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1598 			       0, "number of frames appended to lro merge"
1599 			       "queues");
1600 
1601 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1602 			       "tx_req",
1603 			       CTLFLAG_RD, &ss->tx.req,
1604 			       0, "tx_req");
1605 
1606 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1607 			       "tx_done",
1608 			       CTLFLAG_RD, &ss->tx.done,
1609 			       0, "tx_done");
1610 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1611 			       "tx_pkt_done",
1612 			       CTLFLAG_RD, &ss->tx.pkt_done,
1613 			       0, "tx_done");
1614 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1615 			       "tx_stall",
1616 			       CTLFLAG_RD, &ss->tx.stall,
1617 			       0, "tx_stall");
1618 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1619 			       "tx_wake",
1620 			       CTLFLAG_RD, &ss->tx.wake,
1621 			       0, "tx_wake");
1622 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1623 			       "tx_defrag",
1624 			       CTLFLAG_RD, &ss->tx.defrag,
1625 			       0, "tx_defrag");
1626 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1627 			       "tx_queue_active",
1628 			       CTLFLAG_RD, &ss->tx.queue_active,
1629 			       0, "tx_queue_active");
1630 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1631 			       "tx_activate",
1632 			       CTLFLAG_RD, &ss->tx.activate,
1633 			       0, "tx_activate");
1634 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 			       "tx_deactivate",
1636 			       CTLFLAG_RD, &ss->tx.deactivate,
1637 			       0, "tx_deactivate");
1638 	}
1639 }
1640 
1641 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1642    backwards one at a time and handle ring wraps */
1643 
1644 static inline void
1645 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1646 			    mcp_kreq_ether_send_t *src, int cnt)
1647 {
1648 	int idx, starting_slot;
1649 	starting_slot = tx->req;
1650 	while (cnt > 1) {
1651 		cnt--;
1652 		idx = (starting_slot + cnt) & tx->mask;
1653 		mxge_pio_copy(&tx->lanai[idx],
1654 			      &src[cnt], sizeof(*src));
1655 		wmb();
1656 	}
1657 }
1658 
1659 /*
1660  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1661  * at most 32 bytes at a time, so as to avoid involving the software
1662  * pio handler in the nic.   We re-write the first segment's flags
1663  * to mark them valid only after writing the entire chain
1664  */
1665 
1666 static inline void
1667 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1668 		  int cnt)
1669 {
1670 	int idx, i;
1671 	uint32_t *src_ints;
1672 	volatile uint32_t *dst_ints;
1673 	mcp_kreq_ether_send_t *srcp;
1674 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1675 	uint8_t last_flags;
1676 
1677 	idx = tx->req & tx->mask;
1678 
1679 	last_flags = src->flags;
1680 	src->flags = 0;
1681 	wmb();
1682 	dst = dstp = &tx->lanai[idx];
1683 	srcp = src;
1684 
1685 	if ((idx + cnt) < tx->mask) {
1686 		for (i = 0; i < (cnt - 1); i += 2) {
1687 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1688 			wmb(); /* force write every 32 bytes */
1689 			srcp += 2;
1690 			dstp += 2;
1691 		}
1692 	} else {
1693 		/* submit all but the first request, and ensure
1694 		   that it is submitted below */
1695 		mxge_submit_req_backwards(tx, src, cnt);
1696 		i = 0;
1697 	}
1698 	if (i < cnt) {
1699 		/* submit the first request */
1700 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1701 		wmb(); /* barrier before setting valid flag */
1702 	}
1703 
1704 	/* re-write the last 32-bits with the valid flags */
1705 	src->flags = last_flags;
1706 	src_ints = (uint32_t *)src;
1707 	src_ints+=3;
1708 	dst_ints = (volatile uint32_t *)dst;
1709 	dst_ints+=3;
1710 	*dst_ints =  *src_ints;
1711 	tx->req += cnt;
1712 	wmb();
1713 }
1714 
1715 static int
1716 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1717     struct mxge_pkt_info *pi)
1718 {
1719 	struct ether_vlan_header *eh;
1720 	uint16_t etype;
1721 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1722 #if IFCAP_TSO6 && defined(INET6)
1723 	int nxt;
1724 #endif
1725 
1726 	eh = mtod(m, struct ether_vlan_header *);
1727 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1728 		etype = ntohs(eh->evl_proto);
1729 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1730 	} else {
1731 		etype = ntohs(eh->evl_encap_proto);
1732 		pi->ip_off = ETHER_HDR_LEN;
1733 	}
1734 
1735 	switch (etype) {
1736 	case ETHERTYPE_IP:
1737 		/*
1738 		 * ensure ip header is in first mbuf, copy it to a
1739 		 * scratch buffer if not
1740 		 */
1741 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1742 		pi->ip6 = NULL;
1743 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1744 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1745 			    ss->scratch);
1746 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1747 		}
1748 		pi->ip_hlen = pi->ip->ip_hl << 2;
1749 		if (!tso)
1750 			return 0;
1751 
1752 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1753 		    sizeof(struct tcphdr))) {
1754 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1755 			    sizeof(struct tcphdr), ss->scratch);
1756 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1757 		}
1758 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1759 		break;
1760 #if IFCAP_TSO6 && defined(INET6)
1761 	case ETHERTYPE_IPV6:
1762 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1763 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1764 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1765 			    ss->scratch);
1766 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1767 		}
1768 		nxt = 0;
1769 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1770 		pi->ip_hlen -= pi->ip_off;
1771 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1772 			return EINVAL;
1773 
1774 		if (!tso)
1775 			return 0;
1776 
1777 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1778 			return EINVAL;
1779 
1780 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1781 		    sizeof(struct tcphdr))) {
1782 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1783 			    sizeof(struct tcphdr), ss->scratch);
1784 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1785 		}
1786 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1787 		break;
1788 #endif
1789 	default:
1790 		return EINVAL;
1791 	}
1792 	return 0;
1793 }
1794 
1795 #if IFCAP_TSO4
1796 
1797 static void
1798 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1799 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1800 {
1801 	mxge_tx_ring_t *tx;
1802 	mcp_kreq_ether_send_t *req;
1803 	bus_dma_segment_t *seg;
1804 	uint32_t low, high_swapped;
1805 	int len, seglen, cum_len, cum_len_next;
1806 	int next_is_first, chop, cnt, rdma_count, small;
1807 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1808 	uint8_t flags, flags_next;
1809 	static int once;
1810 
1811 	mss = m->m_pkthdr.tso_segsz;
1812 
1813 	/* negative cum_len signifies to the
1814 	 * send loop that we are still in the
1815 	 * header portion of the TSO packet.
1816 	 */
1817 
1818 	cksum_offset = pi->ip_off + pi->ip_hlen;
1819 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1820 
1821 	/* TSO implies checksum offload on this hardware */
1822 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1823 		/*
1824 		 * If packet has full TCP csum, replace it with pseudo hdr
1825 		 * sum that the NIC expects, otherwise the NIC will emit
1826 		 * packets with bad TCP checksums.
1827 		 */
1828 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1829 		if (pi->ip6) {
1830 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1831 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1832 			sum = in6_cksum_pseudo(pi->ip6,
1833 			    m->m_pkthdr.len - cksum_offset,
1834 			    IPPROTO_TCP, 0);
1835 #endif
1836 		} else {
1837 #ifdef INET
1838 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1839 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1840 			    pi->ip->ip_dst.s_addr,
1841 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1842 				    cksum_offset)));
1843 #endif
1844 		}
1845 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1846 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1847 	}
1848 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1849 
1850 	/* for TSO, pseudo_hdr_offset holds mss.
1851 	 * The firmware figures out where to put
1852 	 * the checksum by parsing the header. */
1853 	pseudo_hdr_offset = htobe16(mss);
1854 
1855 	if (pi->ip6) {
1856 		/*
1857 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1858 		 * to store the TCP header len
1859 		 */
1860 		cksum_offset = (pi->tcp->th_off << 2);
1861 	}
1862 
1863 	tx = &ss->tx;
1864 	req = tx->req_list;
1865 	seg = tx->seg_list;
1866 	cnt = 0;
1867 	rdma_count = 0;
1868 	/* "rdma_count" is the number of RDMAs belonging to the
1869 	 * current packet BEFORE the current send request. For
1870 	 * non-TSO packets, this is equal to "count".
1871 	 * For TSO packets, rdma_count needs to be reset
1872 	 * to 0 after a segment cut.
1873 	 *
1874 	 * The rdma_count field of the send request is
1875 	 * the number of RDMAs of the packet starting at
1876 	 * that request. For TSO send requests with one ore more cuts
1877 	 * in the middle, this is the number of RDMAs starting
1878 	 * after the last cut in the request. All previous
1879 	 * segments before the last cut implicitly have 1 RDMA.
1880 	 *
1881 	 * Since the number of RDMAs is not known beforehand,
1882 	 * it must be filled-in retroactively - after each
1883 	 * segmentation cut or at the end of the entire packet.
1884 	 */
1885 
1886 	while (busdma_seg_cnt) {
1887 		/* Break the busdma segment up into pieces*/
1888 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1889 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1890 		len = seg->ds_len;
1891 
1892 		while (len) {
1893 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1894 			seglen = len;
1895 			cum_len_next = cum_len + seglen;
1896 			(req-rdma_count)->rdma_count = rdma_count + 1;
1897 			if (__predict_true(cum_len >= 0)) {
1898 				/* payload */
1899 				chop = (cum_len_next > mss);
1900 				cum_len_next = cum_len_next % mss;
1901 				next_is_first = (cum_len_next == 0);
1902 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1903 				flags_next |= next_is_first *
1904 					MXGEFW_FLAGS_FIRST;
1905 				rdma_count |= -(chop | next_is_first);
1906 				rdma_count += chop & !next_is_first;
1907 			} else if (cum_len_next >= 0) {
1908 				/* header ends */
1909 				rdma_count = -1;
1910 				cum_len_next = 0;
1911 				seglen = -cum_len;
1912 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1913 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1914 					MXGEFW_FLAGS_FIRST |
1915 					(small * MXGEFW_FLAGS_SMALL);
1916 			    }
1917 
1918 			req->addr_high = high_swapped;
1919 			req->addr_low = htobe32(low);
1920 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1921 			req->pad = 0;
1922 			req->rdma_count = 1;
1923 			req->length = htobe16(seglen);
1924 			req->cksum_offset = cksum_offset;
1925 			req->flags = flags | ((cum_len & 1) *
1926 					      MXGEFW_FLAGS_ALIGN_ODD);
1927 			low += seglen;
1928 			len -= seglen;
1929 			cum_len = cum_len_next;
1930 			flags = flags_next;
1931 			req++;
1932 			cnt++;
1933 			rdma_count++;
1934 			if (cksum_offset != 0 && !pi->ip6) {
1935 				if (__predict_false(cksum_offset > seglen))
1936 					cksum_offset -= seglen;
1937 				else
1938 					cksum_offset = 0;
1939 			}
1940 			if (__predict_false(cnt > tx->max_desc))
1941 				goto drop;
1942 		}
1943 		busdma_seg_cnt--;
1944 		seg++;
1945 	}
1946 	(req-rdma_count)->rdma_count = rdma_count;
1947 
1948 	do {
1949 		req--;
1950 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1951 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1952 
1953 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1954 	mxge_submit_req(tx, tx->req_list, cnt);
1955 
1956 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1957 		/* tell the NIC to start polling this slice */
1958 		*tx->send_go = 1;
1959 		tx->queue_active = 1;
1960 		tx->activate++;
1961 		wmb();
1962 	}
1963 
1964 	return;
1965 
1966 drop:
1967 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1968 	m_freem(m);
1969 	ss->oerrors++;
1970 	if (!once) {
1971 		printf("tx->max_desc exceeded via TSO!\n");
1972 		printf("mss = %d, %ld, %d!\n", mss,
1973 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1974 		once = 1;
1975 	}
1976 	return;
1977 
1978 }
1979 
1980 #endif /* IFCAP_TSO4 */
1981 
1982 #ifdef MXGE_NEW_VLAN_API
1983 /*
1984  * We reproduce the software vlan tag insertion from
1985  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1986  * vlan tag insertion. We need to advertise this in order to have the
1987  * vlan interface respect our csum offload flags.
1988  */
1989 static struct mbuf *
1990 mxge_vlan_tag_insert(struct mbuf *m)
1991 {
1992 	struct ether_vlan_header *evl;
1993 
1994 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
1995 	if (__predict_false(m == NULL))
1996 		return NULL;
1997 	if (m->m_len < sizeof(*evl)) {
1998 		m = m_pullup(m, sizeof(*evl));
1999 		if (__predict_false(m == NULL))
2000 			return NULL;
2001 	}
2002 	/*
2003 	 * Transform the Ethernet header into an Ethernet header
2004 	 * with 802.1Q encapsulation.
2005 	 */
2006 	evl = mtod(m, struct ether_vlan_header *);
2007 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2008 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2009 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2010 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2011 	m->m_flags &= ~M_VLANTAG;
2012 	return m;
2013 }
2014 #endif /* MXGE_NEW_VLAN_API */
2015 
/*
 * Hand one outgoing frame to the NIC on the given slice.
 *
 * The frame is (optionally) vlan-tagged in software, parsed for
 * checksum/TSO offload, mapped for DMA and converted into a list of
 * send requests which is then submitted with mxge_submit_req().  TSO
 * frames are diverted to mxge_encap_tso() once they are mapped.
 *
 * There is no return value: on any failure the mbuf is released (or has
 * already been consumed by the vlan insertion) and ss->oerrors is
 * incremented.
 *
 * NOTE(review): the callers visible in this file invoke this with
 * ss->tx.mtx held; confirm that this is a requirement.
 */
static void
mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
{
	struct mxge_pkt_info pi = {0,0,0,0};
	mxge_softc_t *sc;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	mxge_tx_ring_t *tx;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
	uint8_t flags, cksum_offset;

	sc = ss->sc;
	tx = &ss->tx;

#ifdef MXGE_NEW_VLAN_API
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		/* no mbuf left to free when the insertion fails */
		if (__predict_false(m == NULL))
			goto drop_without_m;
	}
#endif
	/* header offsets are only needed when some offload is requested */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		if (mxge_parse_tx(ss, m, &pi))
			goto drop;
	}

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		ss->tx.defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	/* the mbuf is remembered in the slot of its first descriptor */
	tx->info[idx].m = m;

#if IFCAP_TSO4
	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(ss, m, cnt, &pi);
		return;
	}
#endif

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		/*
		 * The checksum starts right after the IP header; both
		 * offsets come from the mxge_parse_tx() results in pi.
		 * NOTE(review): cksum_offset is a uint8_t, so this
		 * assumes ip_off + ip_hlen fits in 8 bits -- confirm.
		 */
		cksum_offset = pi.ip_off + pi.ip_hlen;
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		/* the loop below stores this again for the first request */
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* the offset is relative to each request; it drops to 0
		   once the segment holding the checksum start is passed */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		/* odd_flag is set only for segments starting at an odd
		   byte offset, and only when checksumming */
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		/* pre-clear the flags of the next request, since they
		   are OR-ed into above and in the runt padding below */
		req->flags = 0;
	}
	/* step back onto the last request that was filled in */
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		/* one extra request pointing at the shared zero-pad buffer */
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	/* the first request carries the total request count */
	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the slot of the frame's last descriptor; mxge_tx_done()
	   counts a packet as done when it reaches a flagged slot */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);

	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}

	return;

drop:
	m_freem(m);
drop_without_m:
	ss->oerrors++;
	return;
}
2178 
2179 static void
2180 mxge_qflush(if_t ifp)
2181 {
2182 	mxge_softc_t *sc = if_getsoftc(ifp);
2183 	mxge_tx_ring_t *tx;
2184 	struct mbuf *m;
2185 	int slice;
2186 
2187 	for (slice = 0; slice < sc->num_slices; slice++) {
2188 		tx = &sc->ss[slice].tx;
2189 		mtx_lock(&tx->mtx);
2190 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2191 			m_freem(m);
2192 		mtx_unlock(&tx->mtx);
2193 	}
2194 	if_qflush(ifp);
2195 }
2196 
2197 static inline void
2198 mxge_start_locked(struct mxge_slice_state *ss)
2199 {
2200 	mxge_softc_t *sc;
2201 	struct mbuf *m;
2202 	if_t ifp;
2203 	mxge_tx_ring_t *tx;
2204 
2205 	sc = ss->sc;
2206 	ifp = sc->ifp;
2207 	tx = &ss->tx;
2208 
2209 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2210 		m = drbr_dequeue(ifp, tx->br);
2211 		if (m == NULL) {
2212 			return;
2213 		}
2214 		/* let BPF see it */
2215 		BPF_MTAP(ifp, m);
2216 
2217 		/* give it to the nic */
2218 		mxge_encap(ss, m);
2219 	}
2220 	/* ran out of transmit slots */
2221 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2222 	    && (!drbr_empty(ifp, tx->br))) {
2223 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2224 		tx->stall++;
2225 	}
2226 }
2227 
2228 static int
2229 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2230 {
2231 	mxge_softc_t *sc;
2232 	if_t ifp;
2233 	mxge_tx_ring_t *tx;
2234 	int err;
2235 
2236 	sc = ss->sc;
2237 	ifp = sc->ifp;
2238 	tx = &ss->tx;
2239 
2240 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2241 	    IFF_DRV_RUNNING) {
2242 		err = drbr_enqueue(ifp, tx->br, m);
2243 		return (err);
2244 	}
2245 
2246 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2247 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2248 		/* let BPF see it */
2249 		BPF_MTAP(ifp, m);
2250 		/* give it to the nic */
2251 		mxge_encap(ss, m);
2252 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2253 		return (err);
2254 	}
2255 	if (!drbr_empty(ifp, tx->br))
2256 		mxge_start_locked(ss);
2257 	return (0);
2258 }
2259 
2260 static int
2261 mxge_transmit(if_t ifp, struct mbuf *m)
2262 {
2263 	mxge_softc_t *sc = if_getsoftc(ifp);
2264 	struct mxge_slice_state *ss;
2265 	mxge_tx_ring_t *tx;
2266 	int err = 0;
2267 	int slice;
2268 
2269 	slice = m->m_pkthdr.flowid;
2270 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2271 
2272 	ss = &sc->ss[slice];
2273 	tx = &ss->tx;
2274 
2275 	if (mtx_trylock(&tx->mtx)) {
2276 		err = mxge_transmit_locked(ss, m);
2277 		mtx_unlock(&tx->mtx);
2278 	} else {
2279 		err = drbr_enqueue(ifp, tx->br, m);
2280 	}
2281 
2282 	return (err);
2283 }
2284 
2285 static void
2286 mxge_start(if_t ifp)
2287 {
2288 	mxge_softc_t *sc = if_getsoftc(ifp);
2289 	struct mxge_slice_state *ss;
2290 
2291 	/* only use the first slice for now */
2292 	ss = &sc->ss[0];
2293 	mtx_lock(&ss->tx.mtx);
2294 	mxge_start_locked(ss);
2295 	mtx_unlock(&ss->tx.mtx);
2296 }
2297 
/*
 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 * in a burst
 *
 * dst is the first of 8 consecutive receive descriptors on the NIC,
 * src the first of the 8 matching host (shadow) descriptors.  On
 * return src is unchanged: its addr_low is only replaced temporarily.
 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* send the first descriptor with an all-ones low address, so it
	   does not look valid until the real value is written at the end */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* two bursts of four descriptors, each followed by a barrier */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the host copy, then publish the real address to the NIC */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2321 
/*
 * Allocate a fresh mbuf for slot idx of the small receive ring and map
 * it for DMA through the supplied map.
 *
 * On success the mbuf is stored in rx->info[idx].m and its bus address
 * in the shadow descriptor for idx.  On failure neither is touched, so
 * whatever was recorded for the slot before remains in place.
 *
 * Whatever the outcome, when idx is the last slot of a group of 8 the
 * whole group of shadow descriptors is pushed to the NIC.
 *
 * Returns 0 on success, ENOBUFS when no mbuf could be allocated, or
 * the error returned by bus_dmamap_load_mbuf_sg().
 */
static int
mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_small;
	int cnt, err;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	/* offer the mbuf's whole internal data area to the NIC */
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* descriptors are handed to the NIC eight at a time */
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
	return err;
}
2354 
/*
 * Allocate a fresh cluster mbuf (of rx->cl_size) for slot idx of the
 * big receive ring and map it for DMA through the supplied map.
 *
 * On success the mbuf is stored in rx->info[idx].m and the bus address
 * of its first DMA segment in the shadow descriptor for idx.  On
 * failure neither is touched, so whatever was recorded for the slot
 * before remains in place.
 *
 * Whatever the outcome, the rx->nbufs slots starting at idx are walked
 * and each group of 8 shadow descriptors that is completed along the
 * way is pushed to the NIC.
 *
 * Returns 0 on success, ENOBUFS when no mbuf could be allocated, or
 * the error returned by bus_dmamap_load_mbuf_sg().
 */
static int
mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	/* room for up to 3 segments, though only seg[0] is used below */
	bus_dma_segment_t seg[3];
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_big;
	int cnt, err, i;

	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = rx->mlen;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));

done:
	/* descriptors are handed to the NIC eight at a time */
       for (i = 0; i < rx->nbufs; i++) {
		if ((idx & 7) == 7) {
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		}
		idx++;
	}
	return err;
}
2392 
2393 #ifdef INET6
2394 
/*
 * Sum len bytes at raw as 16-bit words (in whatever byte order they
 * are stored) and fold the carries back in, giving a 16-bit
 * ones-complement style sum.  Words are consumed two bytes at a time
 * for as long as len remains positive.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t sum = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 2)
		sum += *raw++;

	/* fold twice: the first fold can itself produce a carry */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}
2410 
2411 static inline uint16_t
2412 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2413 {
2414 	uint32_t partial;
2415 	int nxt, cksum_offset;
2416 	struct ip6_hdr *ip6 = p;
2417 	uint16_t c;
2418 
2419 	nxt = ip6->ip6_nxt;
2420 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2421 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2422 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2423 					   IPPROTO_IPV6, &nxt);
2424 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2425 			return (1);
2426 	}
2427 
2428 	/*
2429 	 * IPv6 headers do not contain a checksum, and hence
2430 	 * do not checksum to zero, so they don't "fall out"
2431 	 * of the partial checksum calculation like IPv4
2432 	 * headers do.  We need to fix the partial checksum by
2433 	 * subtracting the checksum of the IPv6 header.
2434 	 */
2435 
2436 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2437 				    ETHER_HDR_LEN);
2438 	csum += ~partial;
2439 	csum +=	 (csum < ~partial);
2440 	csum = (csum >> 16) + (csum & 0xFFFF);
2441 	csum = (csum >> 16) + (csum & 0xFFFF);
2442 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2443 			     csum);
2444 	c ^= 0xffff;
2445 	return (c);
2446 }
2447 #endif /* INET6 */
2448 /*
2449  *  Myri10GE hardware checksums are not valid if the sender
2450  *  padded the frame with non-zero padding.  This is because
2451  *  the firmware just does a simple 16-bit 1s complement
2452  *  checksum across the entire frame, excluding the first 14
2453  *  bytes.  It is best to simply to check the checksum and
2454  *  tell the stack about it only if the checksum is good
2455  */
2456 
2457 static inline uint16_t
2458 mxge_rx_csum(struct mbuf *m, int csum)
2459 {
2460 	struct ether_header *eh;
2461 #ifdef INET
2462 	struct ip *ip;
2463 #endif
2464 #if defined(INET) || defined(INET6)
2465 	int cap = if_getcapenable(m->m_pkthdr.rcvif);
2466 #endif
2467 	uint16_t c, etype;
2468 
2469 	eh = mtod(m, struct ether_header *);
2470 	etype = ntohs(eh->ether_type);
2471 	switch (etype) {
2472 #ifdef INET
2473 	case ETHERTYPE_IP:
2474 		if ((cap & IFCAP_RXCSUM) == 0)
2475 			return (1);
2476 		ip = (struct ip *)(eh + 1);
2477 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2478 			return (1);
2479 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2480 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2481 				    (ip->ip_hl << 2) + ip->ip_p));
2482 		c ^= 0xffff;
2483 		break;
2484 #endif
2485 #ifdef INET6
2486 	case ETHERTYPE_IPV6:
2487 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2488 			return (1);
2489 		c = mxge_rx_csum6((eh + 1), m, csum);
2490 		break;
2491 #endif
2492 	default:
2493 		c = 1;
2494 	}
2495 	return (c);
2496 }
2497 
/*
 * Strip the 802.1Q encapsulation from a received frame.
 *
 * The tag is saved with the mbuf (m_pkthdr.ether_vtag, or an m_tag with
 * the old vlan API), M_VLANTAG is set and the frame is shortened by
 * ETHER_VLAN_ENCAP_LEN bytes.  *csum, the checksum value handed in by
 * the receive path, is adjusted so that it no longer covers the 4 bytes
 * following the Ethernet header; it is in network byte order on entry
 * and on return.
 *
 * With the old vlan API, a failed m_tag allocation returns early: the
 * checksum has been adjusted by then, but the frame is left tagged.
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	/* the 4 bytes to take out of the sum, as one 32-bit value */
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* ones-complement subtraction: add the complement, wrap the
	   carry around, then fold back down to 16 bits */
	(*csum) += ~partial;
	(*csum) +=  ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
#ifdef MXGE_NEW_VLAN_API
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
#else
	{
		struct m_tag *mtag;
		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
				   M_NOWAIT);
		if (mtag == NULL)
			return;
		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
		m_tag_prepend(m, mtag);
	}

#endif
	m->m_flags |= M_VLANTAG;

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf.  The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2551 
/*
 * Process one completed receive on the big-buffer ring.
 *
 * ss    slice the frame arrived on
 * len   frame length reported by the NIC
 * csum  checksum value reported by the NIC
 * lro   non-zero when the frame may be offered to LRO
 *
 * The slot's mbuf is replaced with a fresh one first; if that fails
 * the frame is dropped (counted as an input error) and the old mbuf
 * stays in the ring.  Otherwise the received mbuf is unmapped, has any
 * vlan tag stripped, is marked checksum-good when mxge_rx_csum() says
 * so, and is handed to LRO or to if_input().
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
		 uint32_t csum, int lro)
{
	mxge_softc_t *sc;
	if_t ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_big;
	idx = rx->cnt & rx->mask;
	/* each big receive accounts for rx->nbufs ring slots */
	rx->cnt += rx->nbufs;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	/* (the replacement was loaded through extra_map, so that map now
	   belongs to the slot and the old one becomes the spare) */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		/* the flowid is simply the index of this slice */
		m->m_pkthdr.flowid = (ss - sc->ss);
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
	    (0 == mxge_rx_csum(m, csum))) {
		/* Tell the stack that the  checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
			CSUM_DATA_VALID;

#if defined(INET) || defined (INET6)
		/*
		 * Only frames with a verified checksum are offered to
		 * LRO; a return of 0 means LRO took the frame.
		 * NOTE(review): 0 is passed as the checksum argument
		 * here, whereas mxge_rx_done_small() passes csum --
		 * confirm which of the two is intended.
		 */
		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
			return;
#endif
	}
	/* pass the frame up the stack */
	if_input(ifp, m);
}
2619 
/*
 * Process one completed receive on the small-buffer ring.
 *
 * ss    slice the frame arrived on
 * len   frame length reported by the NIC
 * csum  checksum value reported by the NIC
 * lro   non-zero when the frame may be offered to LRO
 *
 * The slot's mbuf is replaced with a fresh one first; if that fails
 * the frame is dropped (counted as an input error) and the old mbuf
 * stays in the ring.  Otherwise the received mbuf is unmapped, has any
 * vlan tag stripped, is marked checksum-good when mxge_rx_csum() says
 * so, and is handed to LRO or to if_input().
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
		   uint32_t csum, int lro)
{
	mxge_softc_t *sc;
	if_t ifp;
	struct ether_header *eh;
	struct mbuf *m;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	/* (the replacement was loaded through extra_map, so that map now
	   belongs to the slot and the old one becomes the spare) */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		/* the flowid is simply the index of this slice */
		m->m_pkthdr.flowid = (ss - sc->ss);
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
	    (0 == mxge_rx_csum(m, csum))) {
		/* Tell the stack that the  checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
			CSUM_DATA_VALID;

#if defined(INET) || defined (INET6)
		/*
		 * Only frames with a verified checksum are offered to
		 * LRO; a return of 0 means LRO took the frame.
		 * NOTE(review): csum is passed as the checksum argument
		 * here, whereas mxge_rx_done_big() passes 0 -- confirm
		 * which of the two is intended.
		 */
		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
			return;
#endif
	}
	/* pass the frame up the stack */
	if_input(ifp, m);
}
2687 
2688 static inline void
2689 mxge_clean_rx_done(struct mxge_slice_state *ss)
2690 {
2691 	mxge_rx_done_t *rx_done = &ss->rx_done;
2692 	int limit = 0;
2693 	uint16_t length;
2694 	uint16_t checksum;
2695 	int lro;
2696 
2697 	lro = if_getcapenable(ss->sc->ifp) & IFCAP_LRO;
2698 	while (rx_done->entry[rx_done->idx].length != 0) {
2699 		length = ntohs(rx_done->entry[rx_done->idx].length);
2700 		rx_done->entry[rx_done->idx].length = 0;
2701 		checksum = rx_done->entry[rx_done->idx].checksum;
2702 		if (length <= (MHLEN - MXGEFW_PAD))
2703 			mxge_rx_done_small(ss, length, checksum, lro);
2704 		else
2705 			mxge_rx_done_big(ss, length, checksum, lro);
2706 		rx_done->cnt++;
2707 		rx_done->idx = rx_done->cnt & rx_done->mask;
2708 
2709 		/* limit potential for livelock */
2710 		if (__predict_false(++limit > rx_done->mask / 2))
2711 			break;
2712 	}
2713 #if defined(INET)  || defined (INET6)
2714 	tcp_lro_flush_all(&ss->lc);
2715 #endif
2716 }
2717 
/*
 * Reclaim transmit descriptors up to the completion count reported by
 * the NIC.
 *
 * ss       slice whose transmit ring is being cleaned
 * mcp_idx  the NIC's count of completed packets; descriptors are
 *          reclaimed until tx->pkt_done catches up with it
 *
 * Completed mbufs are unmapped and freed and the slice's output
 * counters updated.  Afterwards, under the transmit lock, a stalled
 * slice is restarted once enough of the ring has drained, and with
 * multiple slices an idle queue is reported to the NIC.
 */
static inline void
mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
{
	/* assigned below but not otherwise used in this function */
	if_t ifp __unused;
	mxge_tx_ring_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx;
	int *flags;

	tx = &ss->tx;
	ifp = ss->sc->ifp;
	/* this loop runs before the transmit lock is taken below */
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			ss->obytes += m->m_pkthdr.len;
			if (m->m_flags & M_MCAST)
				ss->omcasts++;
			ss->opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		/* the flag marks the last descriptor of a packet (set by
		   mxge_encap()), so this is where packets get counted */
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
	   its OK to send packets */
	flags = &ss->if_drv_flags;
	mtx_lock(&ss->tx.mtx);
	/* "space" means fewer than a quarter of the ring entries in use */
	if ((*flags) & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		*(flags) &= ~IFF_DRV_OACTIVE;
		ss->tx.wake++;
		mxge_start_locked(ss);
	}
	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
		/* let the NIC stop polling this queue, since there
		 * are no more transmits pending */
		/* (this inner test repeats the condition just checked) */
		if (tx->req == tx->done) {
			*tx->send_stop = 1;
			tx->queue_active = 0;
			tx->deactivate++;
			wmb();
		}
	}
	mtx_unlock(&ss->tx.mtx);
}
2774 
/*
 * Media types reported for XFP modules, consulted by
 * mxge_media_probe().  Each entry pairs a media word (0 where no IFM_*
 * constant is given) with a bit of the compliance byte read from the
 * module and a printable name.  Entry 0 is special: the probe compares
 * its mask with the whole byte for equality, whereas the other entries
 * are tested bit-wise, in table order.
 * NOTE(review): the field order is presumed to be {flag, bitmask,
 * name} from how the probe uses the entries; the struct itself is
 * declared elsewhere.
 */
static struct mxge_media_type mxge_xfp_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
/*
 * Media types reported for SFP+ modules, consulted by
 * mxge_media_probe().  Each entry pairs a media word (0 where no IFM_*
 * constant is given) with a bit of the byte read from the module and a
 * printable name.  Entry 0 is special: the probe compares its mask
 * with the whole byte for equality, so Twinax is also reported when
 * the byte reads as 0; the other entries are tested bit-wise, in table
 * order.
 * NOTE(review): the field order is presumed to be {flag, bitmask,
 * name} from how the probe uses the entries; the struct itself is
 * declared elsewhere.
 */
static struct mxge_media_type mxge_sfp_media_types[] =
{
	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
	{0,		(1 << 7),	"Reserved"},
	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
};
2796 
2797 static void
2798 mxge_media_set(mxge_softc_t *sc, int media_type)
2799 {
2800 
2801 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2802 		    0, NULL);
2803 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2804 	sc->current_media = media_type;
2805 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2806 }
2807 
2808 static void
2809 mxge_media_init(mxge_softc_t *sc)
2810 {
2811 	char *ptr;
2812 	int i;
2813 
2814 	ifmedia_removeall(&sc->media);
2815 	mxge_media_set(sc, IFM_AUTO);
2816 
2817 	/*
2818 	 * parse the product code to deterimine the interface type
2819 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2820 	 * after the 3rd dash in the driver's cached copy of the
2821 	 * EEPROM's product code string.
2822 	 */
2823 	ptr = sc->product_code_string;
2824 	if (ptr == NULL) {
2825 		device_printf(sc->dev, "Missing product code\n");
2826 		return;
2827 	}
2828 
2829 	for (i = 0; i < 3; i++, ptr++) {
2830 		ptr = strchr(ptr, '-');
2831 		if (ptr == NULL) {
2832 			device_printf(sc->dev,
2833 				      "only %d dashes in PC?!?\n", i);
2834 			return;
2835 		}
2836 	}
2837 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2838 		/* -C is CX4 */
2839 		sc->connector = MXGE_CX4;
2840 		mxge_media_set(sc, IFM_10G_CX4);
2841 	} else if (*ptr == 'Q') {
2842 		/* -Q is Quad Ribbon Fiber */
2843 		sc->connector = MXGE_QRF;
2844 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2845 		/* FreeBSD has no media type for Quad ribbon fiber */
2846 	} else if (*ptr == 'R') {
2847 		/* -R is XFP */
2848 		sc->connector = MXGE_XFP;
2849 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2850 		/* -S or -2S is SFP+ */
2851 		sc->connector = MXGE_SFP;
2852 	} else {
2853 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2854 	}
2855 }
2856 
/*
 * Determine the media type for a NIC.  Some XFPs will identify
 * themselves only when their link is up, so this is initiated via a
 * link up interrupt.  However, this can potentially take up to
 * several milliseconds, so it is run via the watchdog routine, rather
 * than in the interrupt handler itself.
 *
 * Only XFP and SFP+ connectors are probed; for anything else the
 * function returns at once.  sc->need_media_probe is cleared in every
 * case.  When the module reports a media type other than the current
 * one, the media list is rebuilt and the new type selected.
 */
static void
mxge_media_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *cage_type;

	struct mxge_media_type *mxge_media_types = NULL;
	int i, err, ms, mxge_media_type_entries;
	uint32_t byte;

	sc->need_media_probe = 0;

	/* choose the lookup table and the module byte to read */
	if (sc->connector == MXGE_XFP) {
		/* -R is XFP */
		mxge_media_types = mxge_xfp_media_types;
		mxge_media_type_entries =
			nitems(mxge_xfp_media_types);
		byte = MXGE_XFP_COMPLIANCE_BYTE;
		cage_type = "XFP";
	} else 	if (sc->connector == MXGE_SFP) {
		/* -S or -2S is SFP+ */
		mxge_media_types = mxge_sfp_media_types;
		mxge_media_type_entries =
			nitems(mxge_sfp_media_types);
		cage_type = "SFP+";
		byte = 3;
	} else {
		/* nothing to do; media type cannot change */
		return;
	}

	/*
	 * At this point we know the NIC has an XFP or SFP+ cage, so
	 * now we try to determine what is in the cage by using the
	 * firmware's I2C commands to read the module's 10GbE
	 * compliance register.  We read just one byte, which may take
	 * over a millisecond
	 */

	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
	cmd.data1 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
	/* the two messages below are used for either cage type */
	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
		device_printf(sc->dev, "failed to read XFP\n");
	}
	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
	}
	if (err != MXGEFW_CMD_OK) {
		return;
	}

	/* now we wait for the data to be cached */
	cmd.data0 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	/* retry while the firmware says EBUSY, at most 50 more times */
	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
		DELAY(1000);
		cmd.data0 = byte;
		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	}
	if (err != MXGEFW_CMD_OK) {
		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
			      cage_type, err, ms);
		return;
	}

	/* on success cmd.data0 holds the byte that was read */

	/* the first table entry must match the whole byte exactly */
	if (cmd.data0 == mxge_media_types[0].bitmask) {
		if (mxge_verbose)
			device_printf(sc->dev, "%s:%s\n", cage_type,
				      mxge_media_types[0].name);
		if (sc->current_media != mxge_media_types[0].flag) {
			mxge_media_init(sc);
			mxge_media_set(sc, mxge_media_types[0].flag);
		}
		return;
	}
	/* the remaining entries are single bits; the first one set wins */
	for (i = 1; i < mxge_media_type_entries; i++) {
		if (cmd.data0 & mxge_media_types[i].bitmask) {
			if (mxge_verbose)
				device_printf(sc->dev, "%s:%s\n",
					      cage_type,
					      mxge_media_types[i].name);

			if (sc->current_media != mxge_media_types[i].flag) {
				mxge_media_init(sc);
				mxge_media_set(sc, mxge_media_types[i].flag);
			}
			return;
		}
	}
	if (mxge_verbose)
		device_printf(sc->dev, "%s media 0x%x unknown\n",
			      cage_type, cmd.data0);

	return;
}
2960 
/*
 * Interrupt handler for one slice; arg is the slice state.
 *
 * Returns immediately unless the firmware stats block is marked valid.
 * Otherwise it processes transmit completions and received frames
 * until none are pending, picks up link and error state changes (first
 * slice only), and finally writes to the slice's irq_claim words.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	/* keep a copy; stats->valid itself is cleared along the way */
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	/* (when valid was cleared above, the body runs exactly once unless
	   the stats block has been marked valid again in the meantime) */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* any link change triggers a new media probe */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		/* link-down events are accumulated in down_cnt and also
		   force the link state to down */
		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	/* only the first claim word depends on bit 0 of valid; the write
	   to the second word is unconditional */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
}
3038 
3039 static void
3040 mxge_init(void *arg)
3041 {
3042 	mxge_softc_t *sc = arg;
3043 	if_t ifp = sc->ifp;
3044 
3045 	mtx_lock(&sc->driver_mtx);
3046 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
3047 		(void) mxge_open(sc);
3048 	mtx_unlock(&sc->driver_mtx);
3049 }
3050 
3051 static void
3052 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3053 {
3054 	int i;
3055 
3056 #if defined(INET) || defined(INET6)
3057 	tcp_lro_free(&ss->lc);
3058 #endif
3059 	for (i = 0; i <= ss->rx_big.mask; i++) {
3060 		if (ss->rx_big.info[i].m == NULL)
3061 			continue;
3062 		bus_dmamap_unload(ss->rx_big.dmat,
3063 				  ss->rx_big.info[i].map);
3064 		m_freem(ss->rx_big.info[i].m);
3065 		ss->rx_big.info[i].m = NULL;
3066 	}
3067 
3068 	for (i = 0; i <= ss->rx_small.mask; i++) {
3069 		if (ss->rx_small.info[i].m == NULL)
3070 			continue;
3071 		bus_dmamap_unload(ss->rx_small.dmat,
3072 				  ss->rx_small.info[i].map);
3073 		m_freem(ss->rx_small.info[i].m);
3074 		ss->rx_small.info[i].m = NULL;
3075 	}
3076 
3077 	/* transmit ring used only on the first slice */
3078 	if (ss->tx.info == NULL)
3079 		return;
3080 
3081 	for (i = 0; i <= ss->tx.mask; i++) {
3082 		ss->tx.info[i].flag = 0;
3083 		if (ss->tx.info[i].m == NULL)
3084 			continue;
3085 		bus_dmamap_unload(ss->tx.dmat,
3086 				  ss->tx.info[i].map);
3087 		m_freem(ss->tx.info[i].m);
3088 		ss->tx.info[i].m = NULL;
3089 	}
3090 }
3091 
3092 static void
3093 mxge_free_mbufs(mxge_softc_t *sc)
3094 {
3095 	int slice;
3096 
3097 	for (slice = 0; slice < sc->num_slices; slice++)
3098 		mxge_free_slice_mbufs(&sc->ss[slice]);
3099 }
3100 
3101 static void
3102 mxge_free_slice_rings(struct mxge_slice_state *ss)
3103 {
3104 	int i;
3105 
3106 	if (ss->rx_done.entry != NULL)
3107 		mxge_dma_free(&ss->rx_done.dma);
3108 	ss->rx_done.entry = NULL;
3109 
3110 	if (ss->tx.req_bytes != NULL)
3111 		free(ss->tx.req_bytes, M_DEVBUF);
3112 	ss->tx.req_bytes = NULL;
3113 
3114 	if (ss->tx.seg_list != NULL)
3115 		free(ss->tx.seg_list, M_DEVBUF);
3116 	ss->tx.seg_list = NULL;
3117 
3118 	if (ss->rx_small.shadow != NULL)
3119 		free(ss->rx_small.shadow, M_DEVBUF);
3120 	ss->rx_small.shadow = NULL;
3121 
3122 	if (ss->rx_big.shadow != NULL)
3123 		free(ss->rx_big.shadow, M_DEVBUF);
3124 	ss->rx_big.shadow = NULL;
3125 
3126 	if (ss->tx.info != NULL) {
3127 		if (ss->tx.dmat != NULL) {
3128 			for (i = 0; i <= ss->tx.mask; i++) {
3129 				bus_dmamap_destroy(ss->tx.dmat,
3130 						   ss->tx.info[i].map);
3131 			}
3132 			bus_dma_tag_destroy(ss->tx.dmat);
3133 		}
3134 		free(ss->tx.info, M_DEVBUF);
3135 	}
3136 	ss->tx.info = NULL;
3137 
3138 	if (ss->rx_small.info != NULL) {
3139 		if (ss->rx_small.dmat != NULL) {
3140 			for (i = 0; i <= ss->rx_small.mask; i++) {
3141 				bus_dmamap_destroy(ss->rx_small.dmat,
3142 						   ss->rx_small.info[i].map);
3143 			}
3144 			bus_dmamap_destroy(ss->rx_small.dmat,
3145 					   ss->rx_small.extra_map);
3146 			bus_dma_tag_destroy(ss->rx_small.dmat);
3147 		}
3148 		free(ss->rx_small.info, M_DEVBUF);
3149 	}
3150 	ss->rx_small.info = NULL;
3151 
3152 	if (ss->rx_big.info != NULL) {
3153 		if (ss->rx_big.dmat != NULL) {
3154 			for (i = 0; i <= ss->rx_big.mask; i++) {
3155 				bus_dmamap_destroy(ss->rx_big.dmat,
3156 						   ss->rx_big.info[i].map);
3157 			}
3158 			bus_dmamap_destroy(ss->rx_big.dmat,
3159 					   ss->rx_big.extra_map);
3160 			bus_dma_tag_destroy(ss->rx_big.dmat);
3161 		}
3162 		free(ss->rx_big.info, M_DEVBUF);
3163 	}
3164 	ss->rx_big.info = NULL;
3165 }
3166 
3167 static void
3168 mxge_free_rings(mxge_softc_t *sc)
3169 {
3170 	int slice;
3171 
3172 	for (slice = 0; slice < sc->num_slices; slice++)
3173 		mxge_free_slice_rings(&sc->ss[slice]);
3174 }
3175 
3176 static int
3177 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3178 		       int tx_ring_entries)
3179 {
3180 	mxge_softc_t *sc = ss->sc;
3181 	size_t bytes;
3182 	int err, i;
3183 
3184 	/* allocate per-slice receive resources */
3185 
3186 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3187 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3188 
3189 	/* allocate the rx shadow rings */
3190 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3191 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3192 
3193 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3194 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3195 
3196 	/* allocate the rx host info rings */
3197 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3198 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3199 
3200 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3201 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3202 
3203 	/* allocate the rx busdma resources */
3204 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3205 				 1,			/* alignment */
3206 				 4096,			/* boundary */
3207 				 BUS_SPACE_MAXADDR,	/* low */
3208 				 BUS_SPACE_MAXADDR,	/* high */
3209 				 NULL, NULL,		/* filter */
3210 				 MHLEN,			/* maxsize */
3211 				 1,			/* num segs */
3212 				 MHLEN,			/* maxsegsize */
3213 				 BUS_DMA_ALLOCNOW,	/* flags */
3214 				 NULL, NULL,		/* lock */
3215 				 &ss->rx_small.dmat);	/* tag */
3216 	if (err != 0) {
3217 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3218 			      err);
3219 		return err;
3220 	}
3221 
3222 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3223 				 1,			/* alignment */
3224 				 0,			/* boundary */
3225 				 BUS_SPACE_MAXADDR,	/* low */
3226 				 BUS_SPACE_MAXADDR,	/* high */
3227 				 NULL, NULL,		/* filter */
3228 				 3*4096,		/* maxsize */
3229 				 1,			/* num segs */
3230 				 MJUM9BYTES,		/* maxsegsize*/
3231 				 BUS_DMA_ALLOCNOW,	/* flags */
3232 				 NULL, NULL,		/* lock */
3233 				 &ss->rx_big.dmat);	/* tag */
3234 	if (err != 0) {
3235 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3236 			      err);
3237 		return err;
3238 	}
3239 	for (i = 0; i <= ss->rx_small.mask; i++) {
3240 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3241 					&ss->rx_small.info[i].map);
3242 		if (err != 0) {
3243 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3244 				      err);
3245 			return err;
3246 		}
3247 	}
3248 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3249 				&ss->rx_small.extra_map);
3250 	if (err != 0) {
3251 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3252 			      err);
3253 		return err;
3254 	}
3255 
3256 	for (i = 0; i <= ss->rx_big.mask; i++) {
3257 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3258 					&ss->rx_big.info[i].map);
3259 		if (err != 0) {
3260 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3261 				      err);
3262 			return err;
3263 		}
3264 	}
3265 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3266 				&ss->rx_big.extra_map);
3267 	if (err != 0) {
3268 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3269 			      err);
3270 		return err;
3271 	}
3272 
3273 	/* now allocate TX resources */
3274 
3275 	ss->tx.mask = tx_ring_entries - 1;
3276 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3277 
3278 	/* allocate the tx request copy block */
3279 	bytes = 8 +
3280 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3281 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3282 	/* ensure req_list entries are aligned to 8 bytes */
3283 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3284 		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
3285 
3286 	/* allocate the tx busdma segment list */
3287 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3288 	ss->tx.seg_list = (bus_dma_segment_t *)
3289 		malloc(bytes, M_DEVBUF, M_WAITOK);
3290 
3291 	/* allocate the tx host info ring */
3292 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3293 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3294 
3295 	/* allocate the tx busdma resources */
3296 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3297 				 1,			/* alignment */
3298 				 sc->tx_boundary,	/* boundary */
3299 				 BUS_SPACE_MAXADDR,	/* low */
3300 				 BUS_SPACE_MAXADDR,	/* high */
3301 				 NULL, NULL,		/* filter */
3302 				 65536 + 256,		/* maxsize */
3303 				 ss->tx.max_desc - 2,	/* num segs */
3304 				 sc->tx_boundary,	/* maxsegsz */
3305 				 BUS_DMA_ALLOCNOW,	/* flags */
3306 				 NULL, NULL,		/* lock */
3307 				 &ss->tx.dmat);		/* tag */
3308 
3309 	if (err != 0) {
3310 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3311 			      err);
3312 		return err;
3313 	}
3314 
3315 	/* now use these tags to setup dmamaps for each slot
3316 	   in the ring */
3317 	for (i = 0; i <= ss->tx.mask; i++) {
3318 		err = bus_dmamap_create(ss->tx.dmat, 0,
3319 					&ss->tx.info[i].map);
3320 		if (err != 0) {
3321 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3322 				      err);
3323 			return err;
3324 		}
3325 	}
3326 	return 0;
3327 
3328 }
3329 
3330 static int
3331 mxge_alloc_rings(mxge_softc_t *sc)
3332 {
3333 	mxge_cmd_t cmd;
3334 	int tx_ring_size;
3335 	int tx_ring_entries, rx_ring_entries;
3336 	int err, slice;
3337 
3338 	/* get ring sizes */
3339 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3340 	tx_ring_size = cmd.data0;
3341 	if (err != 0) {
3342 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3343 		goto abort;
3344 	}
3345 
3346 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3347 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3348 	if_setsendqlen(sc->ifp, tx_ring_entries - 1);
3349 	if_setsendqready(sc->ifp);
3350 
3351 	for (slice = 0; slice < sc->num_slices; slice++) {
3352 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3353 					     rx_ring_entries,
3354 					     tx_ring_entries);
3355 		if (err != 0)
3356 			goto abort;
3357 	}
3358 	return 0;
3359 
3360 abort:
3361 	mxge_free_rings(sc);
3362 	return err;
3363 
3364 }
3365 
3366 static void
3367 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3368 {
3369 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3370 
3371 	if (bufsize < MCLBYTES) {
3372 		/* easy, everything fits in a single buffer */
3373 		*big_buf_size = MCLBYTES;
3374 		*cl_size = MCLBYTES;
3375 		*nbufs = 1;
3376 		return;
3377 	}
3378 
3379 	if (bufsize < MJUMPAGESIZE) {
3380 		/* still easy, everything still fits in a single buffer */
3381 		*big_buf_size = MJUMPAGESIZE;
3382 		*cl_size = MJUMPAGESIZE;
3383 		*nbufs = 1;
3384 		return;
3385 	}
3386 	*cl_size = MJUM9BYTES;
3387 	*big_buf_size = MJUM9BYTES;
3388 	*nbufs = 1;
3389 }
3390 
3391 static int
3392 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3393 {
3394 	mxge_softc_t *sc;
3395 	mxge_cmd_t cmd;
3396 	bus_dmamap_t map;
3397 	int err, i, slice;
3398 
3399 	sc = ss->sc;
3400 	slice = ss - sc->ss;
3401 
3402 #if defined(INET) || defined(INET6)
3403 	(void)tcp_lro_init(&ss->lc);
3404 #endif
3405 	ss->lc.ifp = sc->ifp;
3406 
3407 	/* get the lanai pointers to the send and receive rings */
3408 
3409 	err = 0;
3410 
3411 	cmd.data0 = slice;
3412 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3413 	ss->tx.lanai =
3414 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3415 	ss->tx.send_go = (volatile uint32_t *)
3416 		(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3417 	ss->tx.send_stop = (volatile uint32_t *)
3418 	(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3419 
3420 	cmd.data0 = slice;
3421 	err |= mxge_send_cmd(sc,
3422 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3423 	ss->rx_small.lanai =
3424 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3425 	cmd.data0 = slice;
3426 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3427 	ss->rx_big.lanai =
3428 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3429 
3430 	if (err != 0) {
3431 		device_printf(sc->dev,
3432 			      "failed to get ring sizes or locations\n");
3433 		return EIO;
3434 	}
3435 
3436 	/* stock receive rings */
3437 	for (i = 0; i <= ss->rx_small.mask; i++) {
3438 		map = ss->rx_small.info[i].map;
3439 		err = mxge_get_buf_small(ss, map, i);
3440 		if (err) {
3441 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3442 				      i, ss->rx_small.mask + 1);
3443 			return ENOMEM;
3444 		}
3445 	}
3446 	for (i = 0; i <= ss->rx_big.mask; i++) {
3447 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3448 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3449 	}
3450 	ss->rx_big.nbufs = nbufs;
3451 	ss->rx_big.cl_size = cl_size;
3452 	ss->rx_big.mlen = if_getmtu(ss->sc->ifp) + ETHER_HDR_LEN +
3453 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3454 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3455 		map = ss->rx_big.info[i].map;
3456 		err = mxge_get_buf_big(ss, map, i);
3457 		if (err) {
3458 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3459 				      i, ss->rx_big.mask + 1);
3460 			return ENOMEM;
3461 		}
3462 	}
3463 	return 0;
3464 }
3465 
3466 static int
3467 mxge_open(mxge_softc_t *sc)
3468 {
3469 	mxge_cmd_t cmd;
3470 	int err, big_bytes, nbufs, slice, cl_size, i;
3471 	bus_addr_t bus;
3472 	volatile uint8_t *itable;
3473 	struct mxge_slice_state *ss;
3474 
3475 	/* Copy the MAC address in case it was overridden */
3476 	bcopy(if_getlladdr(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3477 
3478 	err = mxge_reset(sc, 1);
3479 	if (err != 0) {
3480 		device_printf(sc->dev, "failed to reset\n");
3481 		return EIO;
3482 	}
3483 
3484 	if (sc->num_slices > 1) {
3485 		/* setup the indirection table */
3486 		cmd.data0 = sc->num_slices;
3487 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3488 				    &cmd);
3489 
3490 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3491 				     &cmd);
3492 		if (err != 0) {
3493 			device_printf(sc->dev,
3494 				      "failed to setup rss tables\n");
3495 			return err;
3496 		}
3497 
3498 		/* just enable an identity mapping */
3499 		itable = sc->sram + cmd.data0;
3500 		for (i = 0; i < sc->num_slices; i++)
3501 			itable[i] = (uint8_t)i;
3502 
3503 		cmd.data0 = 1;
3504 		cmd.data1 = mxge_rss_hash_type;
3505 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3506 		if (err != 0) {
3507 			device_printf(sc->dev, "failed to enable slices\n");
3508 			return err;
3509 		}
3510 	}
3511 
3512 	mxge_choose_params(if_getmtu(sc->ifp), &big_bytes, &cl_size, &nbufs);
3513 
3514 	cmd.data0 = nbufs;
3515 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3516 			    &cmd);
3517 	/* error is only meaningful if we're trying to set
3518 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3519 	if (err && nbufs > 1) {
3520 		device_printf(sc->dev,
3521 			      "Failed to set alway-use-n to %d\n",
3522 			      nbufs);
3523 		return EIO;
3524 	}
3525 	/* Give the firmware the mtu and the big and small buffer
3526 	   sizes.  The firmware wants the big buf size to be a power
3527 	   of two. Luckily, FreeBSD's clusters are powers of two */
3528 	cmd.data0 = if_getmtu(sc->ifp) + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3529 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3530 	cmd.data0 = MHLEN - MXGEFW_PAD;
3531 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3532 			     &cmd);
3533 	cmd.data0 = big_bytes;
3534 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3535 
3536 	if (err != 0) {
3537 		device_printf(sc->dev, "failed to setup params\n");
3538 		goto abort;
3539 	}
3540 
3541 	/* Now give him the pointer to the stats block */
3542 	for (slice = 0; slice < sc->num_slices; slice++) {
3543 		ss = &sc->ss[slice];
3544 		cmd.data0 =
3545 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3546 		cmd.data1 =
3547 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3548 		cmd.data2 = sizeof(struct mcp_irq_data);
3549 		cmd.data2 |= (slice << 16);
3550 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3551 	}
3552 
3553 	if (err != 0) {
3554 		bus = sc->ss->fw_stats_dma.bus_addr;
3555 		bus += offsetof(struct mcp_irq_data, send_done_count);
3556 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3557 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3558 		err = mxge_send_cmd(sc,
3559 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3560 				    &cmd);
3561 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3562 		sc->fw_multicast_support = 0;
3563 	} else {
3564 		sc->fw_multicast_support = 1;
3565 	}
3566 
3567 	if (err != 0) {
3568 		device_printf(sc->dev, "failed to setup params\n");
3569 		goto abort;
3570 	}
3571 
3572 	for (slice = 0; slice < sc->num_slices; slice++) {
3573 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3574 		if (err != 0) {
3575 			device_printf(sc->dev, "couldn't open slice %d\n",
3576 				      slice);
3577 			goto abort;
3578 		}
3579 	}
3580 
3581 	/* Finally, start the firmware running */
3582 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3583 	if (err) {
3584 		device_printf(sc->dev, "Couldn't bring up link\n");
3585 		goto abort;
3586 	}
3587 	for (slice = 0; slice < sc->num_slices; slice++) {
3588 		ss = &sc->ss[slice];
3589 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3590 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3591 	}
3592 	if_setdrvflagbits(sc->ifp, IFF_DRV_RUNNING, 0);
3593 	if_setdrvflagbits(sc->ifp, 0, IFF_DRV_OACTIVE);
3594 
3595 	return 0;
3596 
3597 abort:
3598 	mxge_free_mbufs(sc);
3599 
3600 	return err;
3601 }
3602 
3603 static int
3604 mxge_close(mxge_softc_t *sc, int down)
3605 {
3606 	mxge_cmd_t cmd;
3607 	int err, old_down_cnt;
3608 	struct mxge_slice_state *ss;
3609 	int slice;
3610 
3611 	for (slice = 0; slice < sc->num_slices; slice++) {
3612 		ss = &sc->ss[slice];
3613 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3614 	}
3615 	if_setdrvflagbits(sc->ifp, 0, IFF_DRV_RUNNING);
3616 	if (!down) {
3617 		old_down_cnt = sc->down_cnt;
3618 		wmb();
3619 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3620 		if (err) {
3621 			device_printf(sc->dev,
3622 				      "Couldn't bring down link\n");
3623 		}
3624 		if (old_down_cnt == sc->down_cnt) {
3625 			/* wait for down irq */
3626 			DELAY(10 * sc->intr_coal_delay);
3627 		}
3628 		wmb();
3629 		if (old_down_cnt == sc->down_cnt) {
3630 			device_printf(sc->dev, "never got down irq\n");
3631 		}
3632 	}
3633 	mxge_free_mbufs(sc);
3634 
3635 	return 0;
3636 }
3637 
3638 static void
3639 mxge_setup_cfg_space(mxge_softc_t *sc)
3640 {
3641 	device_t dev = sc->dev;
3642 	int reg;
3643 	uint16_t lnk, pectl;
3644 
3645 	/* find the PCIe link width and set max read request to 4KB*/
3646 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3647 		lnk = pci_read_config(dev, reg + 0x12, 2);
3648 		sc->link_width = (lnk >> 4) & 0x3f;
3649 
3650 		if (sc->pectl == 0) {
3651 			pectl = pci_read_config(dev, reg + 0x8, 2);
3652 			pectl = (pectl & ~0x7000) | (5 << 12);
3653 			pci_write_config(dev, reg + 0x8, pectl, 2);
3654 			sc->pectl = pectl;
3655 		} else {
3656 			/* restore saved pectl after watchdog reset */
3657 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3658 		}
3659 	}
3660 
3661 	/* Enable DMA and Memory space access */
3662 	pci_enable_busmaster(dev);
3663 }
3664 
3665 static uint32_t
3666 mxge_read_reboot(mxge_softc_t *sc)
3667 {
3668 	device_t dev = sc->dev;
3669 	uint32_t vs;
3670 
3671 	/* find the vendor specific offset */
3672 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3673 		device_printf(sc->dev,
3674 			      "could not find vendor specific offset\n");
3675 		return (uint32_t)-1;
3676 	}
3677 	/* enable read32 mode */
3678 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3679 	/* tell NIC which register to read */
3680 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3681 	return (pci_read_config(dev, vs + 0x14, 4));
3682 }
3683 
3684 static void
3685 mxge_watchdog_reset(mxge_softc_t *sc)
3686 {
3687 	struct pci_devinfo *dinfo;
3688 	struct mxge_slice_state *ss;
3689 	int err, running, s, num_tx_slices = 1;
3690 	uint32_t reboot;
3691 	uint16_t cmd;
3692 
3693 	err = ENXIO;
3694 
3695 	device_printf(sc->dev, "Watchdog reset!\n");
3696 
3697 	/*
3698 	 * check to see if the NIC rebooted.  If it did, then all of
3699 	 * PCI config space has been reset, and things like the
3700 	 * busmaster bit will be zero.  If this is the case, then we
3701 	 * must restore PCI config space before the NIC can be used
3702 	 * again
3703 	 */
3704 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3705 	if (cmd == 0xffff) {
3706 		/*
3707 		 * maybe the watchdog caught the NIC rebooting; wait
3708 		 * up to 100ms for it to finish.  If it does not come
3709 		 * back, then give up
3710 		 */
3711 		DELAY(1000*100);
3712 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3713 		if (cmd == 0xffff) {
3714 			device_printf(sc->dev, "NIC disappeared!\n");
3715 		}
3716 	}
3717 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3718 		/* print the reboot status */
3719 		reboot = mxge_read_reboot(sc);
3720 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3721 			      reboot);
3722 		running = if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING;
3723 		if (running) {
3724 			/*
3725 			 * quiesce NIC so that TX routines will not try to
3726 			 * xmit after restoration of BAR
3727 			 */
3728 
3729 			/* Mark the link as down */
3730 			if (sc->link_state) {
3731 				sc->link_state = 0;
3732 				if_link_state_change(sc->ifp,
3733 						     LINK_STATE_DOWN);
3734 			}
3735 
3736 			num_tx_slices = sc->num_slices;
3737 
3738 			/* grab all TX locks to ensure no tx  */
3739 			for (s = 0; s < num_tx_slices; s++) {
3740 				ss = &sc->ss[s];
3741 				mtx_lock(&ss->tx.mtx);
3742 			}
3743 			mxge_close(sc, 1);
3744 		}
3745 		/* restore PCI configuration space */
3746 		dinfo = device_get_ivars(sc->dev);
3747 		pci_cfg_restore(sc->dev, dinfo);
3748 
3749 		/* and redo any changes we made to our config space */
3750 		mxge_setup_cfg_space(sc);
3751 
3752 		/* reload f/w */
3753 		err = mxge_load_firmware(sc, 0);
3754 		if (err) {
3755 			device_printf(sc->dev,
3756 				      "Unable to re-load f/w\n");
3757 		}
3758 		if (running) {
3759 			if (!err)
3760 				err = mxge_open(sc);
3761 			/* release all TX locks */
3762 			for (s = 0; s < num_tx_slices; s++) {
3763 				ss = &sc->ss[s];
3764 				mxge_start_locked(ss);
3765 				mtx_unlock(&ss->tx.mtx);
3766 			}
3767 		}
3768 		sc->watchdog_resets++;
3769 	} else {
3770 		device_printf(sc->dev,
3771 			      "NIC did not reboot, not resetting\n");
3772 		err = 0;
3773 	}
3774 	if (err) {
3775 		device_printf(sc->dev, "watchdog reset failed\n");
3776 	} else {
3777 		if (sc->dying == 2)
3778 			sc->dying = 0;
3779 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3780 	}
3781 }
3782 
3783 static void
3784 mxge_watchdog_task(void *arg, int pending)
3785 {
3786 	mxge_softc_t *sc = arg;
3787 
3788 	mtx_lock(&sc->driver_mtx);
3789 	mxge_watchdog_reset(sc);
3790 	mtx_unlock(&sc->driver_mtx);
3791 }
3792 
3793 static void
3794 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3795 {
3796 	tx = &sc->ss[slice].tx;
3797 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3798 	device_printf(sc->dev,
3799 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3800 		      tx->req, tx->done, tx->queue_active);
3801 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3802 			      tx->activate, tx->deactivate);
3803 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3804 		      tx->pkt_done,
3805 		      be32toh(sc->ss->fw_stats->send_done_count));
3806 }
3807 
3808 static int
3809 mxge_watchdog(mxge_softc_t *sc)
3810 {
3811 	mxge_tx_ring_t *tx;
3812 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3813 	int i, err = 0;
3814 
3815 	/* see if we have outstanding transmits, which
3816 	   have been pending for more than mxge_ticks */
3817 	for (i = 0; (i < sc->num_slices) && (err == 0); i++) {
3818 		tx = &sc->ss[i].tx;
3819 		if (tx->req != tx->done &&
3820 		    tx->watchdog_req != tx->watchdog_done &&
3821 		    tx->done == tx->watchdog_done) {
3822 			/* check for pause blocking before resetting */
3823 			if (tx->watchdog_rx_pause == rx_pause) {
3824 				mxge_warn_stuck(sc, tx, i);
3825 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3826 				return (ENXIO);
3827 			}
3828 			else
3829 				device_printf(sc->dev, "Flow control blocking "
3830 					      "xmits, check link partner\n");
3831 		}
3832 
3833 		tx->watchdog_req = tx->req;
3834 		tx->watchdog_done = tx->done;
3835 		tx->watchdog_rx_pause = rx_pause;
3836 	}
3837 
3838 	if (sc->need_media_probe)
3839 		mxge_media_probe(sc);
3840 	return (err);
3841 }
3842 
3843 static uint64_t
3844 mxge_get_counter(if_t ifp, ift_counter cnt)
3845 {
3846 	struct mxge_softc *sc;
3847 	uint64_t rv;
3848 
3849 	sc = if_getsoftc(ifp);
3850 	rv = 0;
3851 
3852 	switch (cnt) {
3853 	case IFCOUNTER_IPACKETS:
3854 		for (int s = 0; s < sc->num_slices; s++)
3855 			rv += sc->ss[s].ipackets;
3856 		return (rv);
3857 	case IFCOUNTER_OPACKETS:
3858 		for (int s = 0; s < sc->num_slices; s++)
3859 			rv += sc->ss[s].opackets;
3860 		return (rv);
3861 	case IFCOUNTER_OERRORS:
3862 		for (int s = 0; s < sc->num_slices; s++)
3863 			rv += sc->ss[s].oerrors;
3864 		return (rv);
3865 	case IFCOUNTER_OBYTES:
3866 		for (int s = 0; s < sc->num_slices; s++)
3867 			rv += sc->ss[s].obytes;
3868 		return (rv);
3869 	case IFCOUNTER_OMCASTS:
3870 		for (int s = 0; s < sc->num_slices; s++)
3871 			rv += sc->ss[s].omcasts;
3872 		return (rv);
3873 	case IFCOUNTER_OQDROPS:
3874 		for (int s = 0; s < sc->num_slices; s++)
3875 			rv += sc->ss[s].tx.br->br_drops;
3876 		return (rv);
3877 	default:
3878 		return (if_get_counter_default(ifp, cnt));
3879 	}
3880 }
3881 
3882 static void
3883 mxge_tick(void *arg)
3884 {
3885 	mxge_softc_t *sc = arg;
3886 	u_long pkts = 0;
3887 	int err = 0;
3888 	int running, ticks;
3889 	uint16_t cmd;
3890 
3891 	ticks = mxge_ticks;
3892 	running = if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING;
3893 	if (running) {
3894 		if (!sc->watchdog_countdown) {
3895 			err = mxge_watchdog(sc);
3896 			sc->watchdog_countdown = 4;
3897 		}
3898 		sc->watchdog_countdown--;
3899 	}
3900 	if (pkts == 0) {
3901 		/* ensure NIC did not suffer h/w fault while idle */
3902 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3903 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3904 			sc->dying = 2;
3905 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3906 			err = ENXIO;
3907 		}
3908 		/* look less often if NIC is idle */
3909 		ticks *= 4;
3910 	}
3911 
3912 	if (err == 0)
3913 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3914 
3915 }
3916 
3917 static int
3918 mxge_media_change(if_t ifp)
3919 {
3920 	return EINVAL;
3921 }
3922 
3923 static int
3924 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3925 {
3926 	if_t ifp = sc->ifp;
3927 	int real_mtu, old_mtu;
3928 	int err = 0;
3929 
3930 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3931 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3932 		return EINVAL;
3933 	mtx_lock(&sc->driver_mtx);
3934 	old_mtu = if_getmtu(ifp);
3935 	if_setmtu(ifp, mtu);
3936 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3937 		mxge_close(sc, 0);
3938 		err = mxge_open(sc);
3939 		if (err != 0) {
3940 			if_setmtu(ifp, old_mtu);
3941 			mxge_close(sc, 0);
3942 			(void) mxge_open(sc);
3943 		}
3944 	}
3945 	mtx_unlock(&sc->driver_mtx);
3946 	return err;
3947 }
3948 
3949 static void
3950 mxge_media_status(if_t ifp, struct ifmediareq *ifmr)
3951 {
3952 	mxge_softc_t *sc = if_getsoftc(ifp);
3953 
3954 	if (sc == NULL)
3955 		return;
3956 	ifmr->ifm_status = IFM_AVALID;
3957 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3958 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3959 	ifmr->ifm_active |= sc->current_media;
3960 }
3961 
3962 static int
3963 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
3964 {
3965 	mxge_cmd_t cmd;
3966 	uint32_t i2c_args;
3967 	int i, ms, err;
3968 
3969 	if (i2c->dev_addr != 0xA0 &&
3970 	    i2c->dev_addr != 0xA2)
3971 		return (EINVAL);
3972 	if (i2c->len > sizeof(i2c->data))
3973 		return (EINVAL);
3974 
3975 	for (i = 0; i < i2c->len; i++) {
3976 		i2c_args = i2c->dev_addr << 0x8;
3977 		i2c_args |= i2c->offset + i;
3978 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3979 		cmd.data1 = i2c_args;
3980 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3981 
3982 		if (err != MXGEFW_CMD_OK)
3983 			return (EIO);
3984 		/* now we wait for the data to be cached */
3985 		cmd.data0 = i2c_args & 0xff;
3986 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3987 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3988 			cmd.data0 = i2c_args & 0xff;
3989 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3990 			if (err == EBUSY)
3991 				DELAY(1000);
3992 		}
3993 		if (err != MXGEFW_CMD_OK)
3994 			return (EIO);
3995 		i2c->data[i] = cmd.data0;
3996 	}
3997 	return (0);
3998 }
3999 
4000 static int
4001 mxge_ioctl(if_t ifp, u_long command, caddr_t data)
4002 {
4003 	mxge_softc_t *sc = if_getsoftc(ifp);
4004 	struct ifreq *ifr = (struct ifreq *)data;
4005 	struct ifi2creq i2c;
4006 	int err, mask;
4007 
4008 	err = 0;
4009 	switch (command) {
4010 	case SIOCSIFMTU:
4011 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4012 		break;
4013 
4014 	case SIOCSIFFLAGS:
4015 		mtx_lock(&sc->driver_mtx);
4016 		if (sc->dying) {
4017 			mtx_unlock(&sc->driver_mtx);
4018 			return EINVAL;
4019 		}
4020 		if (if_getflags(ifp) & IFF_UP) {
4021 			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) {
4022 				err = mxge_open(sc);
4023 			} else {
4024 				/* take care of promis can allmulti
4025 				   flag chages */
4026 				mxge_change_promisc(sc,
4027 						    if_getflags(ifp) & IFF_PROMISC);
4028 				mxge_set_multicast_list(sc);
4029 			}
4030 		} else {
4031 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4032 				mxge_close(sc, 0);
4033 			}
4034 		}
4035 		mtx_unlock(&sc->driver_mtx);
4036 		break;
4037 
4038 	case SIOCADDMULTI:
4039 	case SIOCDELMULTI:
4040 		mtx_lock(&sc->driver_mtx);
4041 		if (sc->dying) {
4042 			mtx_unlock(&sc->driver_mtx);
4043 			return (EINVAL);
4044 		}
4045 		mxge_set_multicast_list(sc);
4046 		mtx_unlock(&sc->driver_mtx);
4047 		break;
4048 
4049 	case SIOCSIFCAP:
4050 		mtx_lock(&sc->driver_mtx);
4051 		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
4052 		if (mask & IFCAP_TXCSUM) {
4053 			if (IFCAP_TXCSUM & if_getcapenable(ifp)) {
4054 				mask &= ~IFCAP_TSO4;
4055 				if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM|IFCAP_TSO4));
4056 				if_sethwassistbits(ifp, 0, (CSUM_TCP | CSUM_UDP));
4057 			} else {
4058 				if_setcapenablebit(ifp, IFCAP_TXCSUM, 0);
4059 				if_sethwassistbits(ifp, (CSUM_TCP | CSUM_UDP), 0);
4060 			}
4061 		}
4062 		if (mask & IFCAP_RXCSUM) {
4063 			if (IFCAP_RXCSUM & if_getcapenable(ifp)) {
4064 				if_setcapenablebit(ifp, 0, IFCAP_RXCSUM);
4065 			} else {
4066 				if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);
4067 			}
4068 		}
4069 		if (mask & IFCAP_TSO4) {
4070 			if (IFCAP_TSO4 & if_getcapenable(ifp)) {
4071 				if_setcapenablebit(ifp, 0, IFCAP_TSO4);
4072 			} else if (IFCAP_TXCSUM & if_getcapenable(ifp)) {
4073 				if_setcapenablebit(ifp, IFCAP_TSO4, 0);
4074 				if_sethwassistbits(ifp, CSUM_TSO, 0);
4075 			} else {
4076 				printf("mxge requires tx checksum offload"
4077 				       " be enabled to use TSO\n");
4078 				err = EINVAL;
4079 			}
4080 		}
4081 #if IFCAP_TSO6
4082 		if (mask & IFCAP_TXCSUM_IPV6) {
4083 			if (IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp)) {
4084 				mask &= ~IFCAP_TSO6;
4085 				if_setcapenablebit(ifp, 0,
4086 				    IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
4087 				if_sethwassistbits(ifp, 0,
4088 				    CSUM_TCP_IPV6 | CSUM_UDP);
4089 			} else {
4090 				if_setcapenablebit(ifp, IFCAP_TXCSUM_IPV6, 0);
4091 				if_sethwassistbits(ifp,
4092 				    CSUM_TCP_IPV6 | CSUM_UDP_IPV6, 0);
4093 			}
4094 		}
4095 		if (mask & IFCAP_RXCSUM_IPV6) {
4096 			if (IFCAP_RXCSUM_IPV6 & if_getcapenable(ifp)) {
4097 				if_setcapenablebit(ifp, 0, IFCAP_RXCSUM_IPV6);
4098 			} else {
4099 				if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0);
4100 			}
4101 		}
4102 		if (mask & IFCAP_TSO6) {
4103 			if (IFCAP_TSO6 & if_getcapenable(ifp)) {
4104 				if_setcapenablebit(ifp, 0, IFCAP_TSO6);
4105 			} else if (IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp)) {
4106 				if_setcapenablebit(ifp, IFCAP_TSO6, 0);
4107 				if_sethwassistbits(ifp, CSUM_TSO, 0);
4108 			} else {
4109 				printf("mxge requires tx checksum offload"
4110 				       " be enabled to use TSO\n");
4111 				err = EINVAL;
4112 			}
4113 		}
4114 #endif /*IFCAP_TSO6 */
4115 
4116 		if (mask & IFCAP_LRO)
4117 			if_togglecapenable(ifp, IFCAP_LRO);
4118 		if (mask & IFCAP_VLAN_HWTAGGING)
4119 			if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING);
4120 		if (mask & IFCAP_VLAN_HWTSO)
4121 			if_togglecapenable(ifp, IFCAP_VLAN_HWTSO);
4122 
4123 		if (!(if_getcapabilities(ifp) & IFCAP_VLAN_HWTSO) ||
4124 		    !(if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING))
4125 			if_setcapenablebit(ifp, 0, IFCAP_VLAN_HWTSO);
4126 
4127 		mtx_unlock(&sc->driver_mtx);
4128 		VLAN_CAPABILITIES(ifp);
4129 
4130 		break;
4131 
4132 	case SIOCGIFMEDIA:
4133 		mtx_lock(&sc->driver_mtx);
4134 		if (sc->dying) {
4135 			mtx_unlock(&sc->driver_mtx);
4136 			return (EINVAL);
4137 		}
4138 		mxge_media_probe(sc);
4139 		mtx_unlock(&sc->driver_mtx);
4140 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4141 				    &sc->media, command);
4142 		break;
4143 
4144 	case SIOCGI2C:
4145 		if (sc->connector != MXGE_XFP &&
4146 		    sc->connector != MXGE_SFP) {
4147 			err = ENXIO;
4148 			break;
4149 		}
4150 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4151 		if (err != 0)
4152 			break;
4153 		mtx_lock(&sc->driver_mtx);
4154 		if (sc->dying) {
4155 			mtx_unlock(&sc->driver_mtx);
4156 			return (EINVAL);
4157 		}
4158 		err = mxge_fetch_i2c(sc, &i2c);
4159 		mtx_unlock(&sc->driver_mtx);
4160 		if (err == 0)
4161 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4162 			    sizeof(i2c));
4163 		break;
4164 	default:
4165 		err = ether_ioctl(ifp, command, data);
4166 		break;
4167 	}
4168 	return err;
4169 }
4170 
4171 static void
4172 mxge_fetch_tunables(mxge_softc_t *sc)
4173 {
4174 
4175 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4176 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4177 			  &mxge_flow_control);
4178 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4179 			  &mxge_intr_coal_delay);
4180 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4181 			  &mxge_nvidia_ecrc_enable);
4182 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4183 			  &mxge_force_firmware);
4184 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4185 			  &mxge_deassert_wait);
4186 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4187 			  &mxge_verbose);
4188 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4189 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4190 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4191 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4192 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4193 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4194 
4195 	if (bootverbose)
4196 		mxge_verbose = 1;
4197 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4198 		mxge_intr_coal_delay = 30;
4199 	if (mxge_ticks == 0)
4200 		mxge_ticks = hz / 2;
4201 	sc->pause = mxge_flow_control;
4202 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4203 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4204 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4205 	}
4206 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4207 	    mxge_initial_mtu < ETHER_MIN_LEN)
4208 		mxge_initial_mtu = ETHERMTU_JUMBO;
4209 
4210 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4211 		mxge_throttle = MXGE_MAX_THROTTLE;
4212 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4213 		mxge_throttle = MXGE_MIN_THROTTLE;
4214 	sc->throttle = mxge_throttle;
4215 }
4216 
4217 static void
4218 mxge_free_slices(mxge_softc_t *sc)
4219 {
4220 	struct mxge_slice_state *ss;
4221 	int i;
4222 
4223 	if (sc->ss == NULL)
4224 		return;
4225 
4226 	for (i = 0; i < sc->num_slices; i++) {
4227 		ss = &sc->ss[i];
4228 		if (ss->fw_stats != NULL) {
4229 			mxge_dma_free(&ss->fw_stats_dma);
4230 			ss->fw_stats = NULL;
4231 			if (ss->tx.br != NULL) {
4232 				drbr_free(ss->tx.br, M_DEVBUF);
4233 				ss->tx.br = NULL;
4234 			}
4235 			mtx_destroy(&ss->tx.mtx);
4236 		}
4237 		if (ss->rx_done.entry != NULL) {
4238 			mxge_dma_free(&ss->rx_done.dma);
4239 			ss->rx_done.entry = NULL;
4240 		}
4241 	}
4242 	free(sc->ss, M_DEVBUF);
4243 	sc->ss = NULL;
4244 }
4245 
4246 static int
4247 mxge_alloc_slices(mxge_softc_t *sc)
4248 {
4249 	mxge_cmd_t cmd;
4250 	struct mxge_slice_state *ss;
4251 	size_t bytes;
4252 	int err, i, max_intr_slots;
4253 
4254 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4255 	if (err != 0) {
4256 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4257 		return err;
4258 	}
4259 	sc->rx_ring_size = cmd.data0;
4260 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4261 
4262 	bytes = sizeof (*sc->ss) * sc->num_slices;
4263 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4264 	if (sc->ss == NULL)
4265 		return (ENOMEM);
4266 	for (i = 0; i < sc->num_slices; i++) {
4267 		ss = &sc->ss[i];
4268 
4269 		ss->sc = sc;
4270 
4271 		/* allocate per-slice rx interrupt queues */
4272 
4273 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4274 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4275 		if (err != 0)
4276 			goto abort;
4277 		ss->rx_done.entry = ss->rx_done.dma.addr;
4278 		bzero(ss->rx_done.entry, bytes);
4279 
4280 		/*
4281 		 * allocate the per-slice firmware stats; stats
4282 		 * (including tx) are used used only on the first
4283 		 * slice for now
4284 		 */
4285 
4286 		bytes = sizeof (*ss->fw_stats);
4287 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4288 				     sizeof (*ss->fw_stats), 64);
4289 		if (err != 0)
4290 			goto abort;
4291 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4292 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4293 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4294 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4295 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4296 					   &ss->tx.mtx);
4297 	}
4298 
4299 	return (0);
4300 
4301 abort:
4302 	mxge_free_slices(sc);
4303 	return (ENOMEM);
4304 }
4305 
4306 static void
4307 mxge_slice_probe(mxge_softc_t *sc)
4308 {
4309 	mxge_cmd_t cmd;
4310 	char *old_fw;
4311 	int msix_cnt, status, max_intr_slots;
4312 
4313 	sc->num_slices = 1;
4314 	/*
4315 	 *  don't enable multiple slices if they are not enabled,
4316 	 *  or if this is not an SMP system
4317 	 */
4318 
4319 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4320 		return;
4321 
4322 	/* see how many MSI-X interrupts are available */
4323 	msix_cnt = pci_msix_count(sc->dev);
4324 	if (msix_cnt < 2)
4325 		return;
4326 
4327 	/* now load the slice aware firmware see what it supports */
4328 	old_fw = sc->fw_name;
4329 	if (old_fw == mxge_fw_aligned)
4330 		sc->fw_name = mxge_fw_rss_aligned;
4331 	else
4332 		sc->fw_name = mxge_fw_rss_unaligned;
4333 	status = mxge_load_firmware(sc, 0);
4334 	if (status != 0) {
4335 		device_printf(sc->dev, "Falling back to a single slice\n");
4336 		return;
4337 	}
4338 
4339 	/* try to send a reset command to the card to see if it
4340 	   is alive */
4341 	memset(&cmd, 0, sizeof (cmd));
4342 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4343 	if (status != 0) {
4344 		device_printf(sc->dev, "failed reset\n");
4345 		goto abort_with_fw;
4346 	}
4347 
4348 	/* get rx ring size */
4349 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4350 	if (status != 0) {
4351 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4352 		goto abort_with_fw;
4353 	}
4354 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4355 
4356 	/* tell it the size of the interrupt queues */
4357 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4358 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4359 	if (status != 0) {
4360 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4361 		goto abort_with_fw;
4362 	}
4363 
4364 	/* ask the maximum number of slices it supports */
4365 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4366 	if (status != 0) {
4367 		device_printf(sc->dev,
4368 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4369 		goto abort_with_fw;
4370 	}
4371 	sc->num_slices = cmd.data0;
4372 	if (sc->num_slices > msix_cnt)
4373 		sc->num_slices = msix_cnt;
4374 
4375 	if (mxge_max_slices == -1) {
4376 		/* cap to number of CPUs in system */
4377 		if (sc->num_slices > mp_ncpus)
4378 			sc->num_slices = mp_ncpus;
4379 	} else {
4380 		if (sc->num_slices > mxge_max_slices)
4381 			sc->num_slices = mxge_max_slices;
4382 	}
4383 	/* make sure it is a power of two */
4384 	while (sc->num_slices & (sc->num_slices - 1))
4385 		sc->num_slices--;
4386 
4387 	if (mxge_verbose)
4388 		device_printf(sc->dev, "using %d slices\n",
4389 			      sc->num_slices);
4390 
4391 	return;
4392 
4393 abort_with_fw:
4394 	sc->fw_name = old_fw;
4395 	(void) mxge_load_firmware(sc, 0);
4396 }
4397 
4398 static int
4399 mxge_add_msix_irqs(mxge_softc_t *sc)
4400 {
4401 	size_t bytes;
4402 	int count, err, i, rid;
4403 
4404 	rid = PCIR_BAR(2);
4405 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4406 						    &rid, RF_ACTIVE);
4407 
4408 	if (sc->msix_table_res == NULL) {
4409 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4410 		return ENXIO;
4411 	}
4412 
4413 	count = sc->num_slices;
4414 	err = pci_alloc_msix(sc->dev, &count);
4415 	if (err != 0) {
4416 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4417 			      "err = %d \n", sc->num_slices, err);
4418 		goto abort_with_msix_table;
4419 	}
4420 	if (count < sc->num_slices) {
4421 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4422 			      count, sc->num_slices);
4423 		device_printf(sc->dev,
4424 			      "Try setting hw.mxge.max_slices to %d\n",
4425 			      count);
4426 		err = ENOSPC;
4427 		goto abort_with_msix;
4428 	}
4429 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4430 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4431 	if (sc->msix_irq_res == NULL) {
4432 		err = ENOMEM;
4433 		goto abort_with_msix;
4434 	}
4435 
4436 	for (i = 0; i < sc->num_slices; i++) {
4437 		rid = i + 1;
4438 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4439 							  SYS_RES_IRQ,
4440 							  &rid, RF_ACTIVE);
4441 		if (sc->msix_irq_res[i] == NULL) {
4442 			device_printf(sc->dev, "couldn't allocate IRQ res"
4443 				      " for message %d\n", i);
4444 			err = ENXIO;
4445 			goto abort_with_res;
4446 		}
4447 	}
4448 
4449 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4450 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4451 
4452 	for (i = 0; i < sc->num_slices; i++) {
4453 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4454 				     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4455 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4456 		if (err != 0) {
4457 			device_printf(sc->dev, "couldn't setup intr for "
4458 				      "message %d\n", i);
4459 			goto abort_with_intr;
4460 		}
4461 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4462 				  sc->msix_ih[i], "s%d", i);
4463 	}
4464 
4465 	if (mxge_verbose) {
4466 		device_printf(sc->dev, "using %d msix IRQs:",
4467 			      sc->num_slices);
4468 		for (i = 0; i < sc->num_slices; i++)
4469 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4470 		printf("\n");
4471 	}
4472 	return (0);
4473 
4474 abort_with_intr:
4475 	for (i = 0; i < sc->num_slices; i++) {
4476 		if (sc->msix_ih[i] != NULL) {
4477 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4478 					  sc->msix_ih[i]);
4479 			sc->msix_ih[i] = NULL;
4480 		}
4481 	}
4482 	free(sc->msix_ih, M_DEVBUF);
4483 
4484 abort_with_res:
4485 	for (i = 0; i < sc->num_slices; i++) {
4486 		rid = i + 1;
4487 		if (sc->msix_irq_res[i] != NULL)
4488 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4489 					     sc->msix_irq_res[i]);
4490 		sc->msix_irq_res[i] = NULL;
4491 	}
4492 	free(sc->msix_irq_res, M_DEVBUF);
4493 
4494 abort_with_msix:
4495 	pci_release_msi(sc->dev);
4496 
4497 abort_with_msix_table:
4498 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4499 			     sc->msix_table_res);
4500 
4501 	return err;
4502 }
4503 
4504 static int
4505 mxge_add_single_irq(mxge_softc_t *sc)
4506 {
4507 	int count, err, rid;
4508 
4509 	count = pci_msi_count(sc->dev);
4510 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4511 		rid = 1;
4512 	} else {
4513 		rid = 0;
4514 		sc->legacy_irq = 1;
4515 	}
4516 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4517 					     RF_SHAREABLE | RF_ACTIVE);
4518 	if (sc->irq_res == NULL) {
4519 		device_printf(sc->dev, "could not alloc interrupt\n");
4520 		return ENXIO;
4521 	}
4522 	if (mxge_verbose)
4523 		device_printf(sc->dev, "using %s irq %jd\n",
4524 			      sc->legacy_irq ? "INTx" : "MSI",
4525 			      rman_get_start(sc->irq_res));
4526 	err = bus_setup_intr(sc->dev, sc->irq_res,
4527 			     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4528 			     mxge_intr, &sc->ss[0], &sc->ih);
4529 	if (err != 0) {
4530 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4531 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4532 		if (!sc->legacy_irq)
4533 			pci_release_msi(sc->dev);
4534 	}
4535 	return err;
4536 }
4537 
4538 static void
4539 mxge_rem_msix_irqs(mxge_softc_t *sc)
4540 {
4541 	int i, rid;
4542 
4543 	for (i = 0; i < sc->num_slices; i++) {
4544 		if (sc->msix_ih[i] != NULL) {
4545 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4546 					  sc->msix_ih[i]);
4547 			sc->msix_ih[i] = NULL;
4548 		}
4549 	}
4550 	free(sc->msix_ih, M_DEVBUF);
4551 
4552 	for (i = 0; i < sc->num_slices; i++) {
4553 		rid = i + 1;
4554 		if (sc->msix_irq_res[i] != NULL)
4555 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4556 					     sc->msix_irq_res[i]);
4557 		sc->msix_irq_res[i] = NULL;
4558 	}
4559 	free(sc->msix_irq_res, M_DEVBUF);
4560 
4561 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4562 			     sc->msix_table_res);
4563 
4564 	pci_release_msi(sc->dev);
4565 	return;
4566 }
4567 
4568 static void
4569 mxge_rem_single_irq(mxge_softc_t *sc)
4570 {
4571 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4572 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4573 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4574 	if (!sc->legacy_irq)
4575 		pci_release_msi(sc->dev);
4576 }
4577 
4578 static void
4579 mxge_rem_irq(mxge_softc_t *sc)
4580 {
4581 	if (sc->num_slices > 1)
4582 		mxge_rem_msix_irqs(sc);
4583 	else
4584 		mxge_rem_single_irq(sc);
4585 }
4586 
4587 static int
4588 mxge_add_irq(mxge_softc_t *sc)
4589 {
4590 	int err;
4591 
4592 	if (sc->num_slices > 1)
4593 		err = mxge_add_msix_irqs(sc);
4594 	else
4595 		err = mxge_add_single_irq(sc);
4596 
4597 	if (0 && err == 0 && sc->num_slices > 1) {
4598 		mxge_rem_msix_irqs(sc);
4599 		err = mxge_add_msix_irqs(sc);
4600 	}
4601 	return err;
4602 }
4603 
4604 static int
4605 mxge_attach(device_t dev)
4606 {
4607 	mxge_cmd_t cmd;
4608 	mxge_softc_t *sc = device_get_softc(dev);
4609 	if_t ifp;
4610 	int err, rid;
4611 
4612 	sc->dev = dev;
4613 	mxge_fetch_tunables(sc);
4614 
4615 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4616 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4617 				  taskqueue_thread_enqueue, &sc->tq);
4618 	if (sc->tq == NULL) {
4619 		err = ENOMEM;
4620 		goto abort_with_nothing;
4621 	}
4622 
4623 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4624 				 1,			/* alignment */
4625 				 0,			/* boundary */
4626 				 BUS_SPACE_MAXADDR,	/* low */
4627 				 BUS_SPACE_MAXADDR,	/* high */
4628 				 NULL, NULL,		/* filter */
4629 				 65536 + 256,		/* maxsize */
4630 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4631 				 65536,			/* maxsegsize */
4632 				 0,			/* flags */
4633 				 NULL, NULL,		/* lock */
4634 				 &sc->parent_dmat);	/* tag */
4635 
4636 	if (err != 0) {
4637 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4638 			      err);
4639 		goto abort_with_tq;
4640 	}
4641 
4642 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4643 	if (ifp == NULL) {
4644 		device_printf(dev, "can not if_alloc()\n");
4645 		err = ENOSPC;
4646 		goto abort_with_parent_dmat;
4647 	}
4648 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4649 
4650 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4651 		 device_get_nameunit(dev));
4652 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4653 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4654 		 "%s:drv", device_get_nameunit(dev));
4655 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4656 		 MTX_NETWORK_LOCK, MTX_DEF);
4657 
4658 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4659 
4660 	mxge_setup_cfg_space(sc);
4661 
4662 	/* Map the board into the kernel */
4663 	rid = PCIR_BARS;
4664 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4665 					     RF_ACTIVE);
4666 	if (sc->mem_res == NULL) {
4667 		device_printf(dev, "could not map memory\n");
4668 		err = ENXIO;
4669 		goto abort_with_lock;
4670 	}
4671 	sc->sram = rman_get_virtual(sc->mem_res);
4672 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4673 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4674 		device_printf(dev, "impossible memory region size %jd\n",
4675 			      rman_get_size(sc->mem_res));
4676 		err = ENXIO;
4677 		goto abort_with_mem_res;
4678 	}
4679 
4680 	/* make NULL terminated copy of the EEPROM strings section of
4681 	   lanai SRAM */
4682 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4683 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4684 				rman_get_bushandle(sc->mem_res),
4685 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4686 				sc->eeprom_strings,
4687 				MXGE_EEPROM_STRINGS_SIZE - 2);
4688 	err = mxge_parse_strings(sc);
4689 	if (err != 0)
4690 		goto abort_with_mem_res;
4691 
4692 	/* Enable write combining for efficient use of PCIe bus */
4693 	mxge_enable_wc(sc);
4694 
4695 	/* Allocate the out of band dma memory */
4696 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4697 			     sizeof (mxge_cmd_t), 64);
4698 	if (err != 0)
4699 		goto abort_with_mem_res;
4700 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4701 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4702 	if (err != 0)
4703 		goto abort_with_cmd_dma;
4704 
4705 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4706 	if (err != 0)
4707 		goto abort_with_zeropad_dma;
4708 
4709 	/* select & load the firmware */
4710 	err = mxge_select_firmware(sc);
4711 	if (err != 0)
4712 		goto abort_with_dmabench;
4713 	sc->intr_coal_delay = mxge_intr_coal_delay;
4714 
4715 	mxge_slice_probe(sc);
4716 	err = mxge_alloc_slices(sc);
4717 	if (err != 0)
4718 		goto abort_with_dmabench;
4719 
4720 	err = mxge_reset(sc, 0);
4721 	if (err != 0)
4722 		goto abort_with_slices;
4723 
4724 	err = mxge_alloc_rings(sc);
4725 	if (err != 0) {
4726 		device_printf(sc->dev, "failed to allocate rings\n");
4727 		goto abort_with_slices;
4728 	}
4729 
4730 	err = mxge_add_irq(sc);
4731 	if (err != 0) {
4732 		device_printf(sc->dev, "failed to add irq\n");
4733 		goto abort_with_rings;
4734 	}
4735 
4736 	if_setbaudrate(ifp, IF_Gbps(10));
4737 	if_setcapabilities(ifp, IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4738 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4739 		IFCAP_RXCSUM_IPV6);
4740 #if defined(INET) || defined(INET6)
4741 	if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
4742 #endif
4743 
4744 #ifdef MXGE_NEW_VLAN_API
4745 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM, 0);
4746 
4747 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4748 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4749 	    sc->fw_ver_tiny >= 32)
4750 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTSO, 0);
4751 #endif
4752 	sc->max_mtu = mxge_max_mtu(sc);
4753 	if (sc->max_mtu >= 9000)
4754 		if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
4755 	else
4756 		device_printf(dev, "MTU limited to %d.  Install "
4757 			      "latest firmware for 9000 byte jumbo support\n",
4758 			      sc->max_mtu - ETHER_HDR_LEN);
4759 	if_sethwassist(ifp, CSUM_TCP | CSUM_UDP | CSUM_TSO);
4760 	if_sethwassistbits(ifp, CSUM_TCP_IPV6 | CSUM_UDP_IPV6, 0);
4761 	/* check to see if f/w supports TSO for IPv6 */
4762 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4763 		if (CSUM_TCP_IPV6)
4764 			if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
4765 		sc->max_tso6_hlen = min(cmd.data0,
4766 					sizeof (sc->ss[0].scratch));
4767 	}
4768 	if_setcapenable(ifp, if_getcapabilities(ifp));
4769 	if (sc->lro_cnt == 0)
4770 		if_setcapenablebit(ifp, 0, IFCAP_LRO);
4771 	if_setinitfn(ifp, mxge_init);
4772 	if_setsoftc(ifp, sc);
4773 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
4774 	if_setioctlfn(ifp, mxge_ioctl);
4775 	if_setstartfn(ifp, mxge_start);
4776 	if_setgetcounterfn(ifp, mxge_get_counter);
4777 	if_sethwtsomax(ifp, IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
4778 	if_sethwtsomaxsegcount(ifp, sc->ss[0].tx.max_desc);
4779 	if_sethwtsomaxsegsize(ifp, IP_MAXPACKET);
4780 	/* Initialise the ifmedia structure */
4781 	ifmedia_init(&sc->media, 0, mxge_media_change,
4782 		     mxge_media_status);
4783 	mxge_media_init(sc);
4784 	mxge_media_probe(sc);
4785 	sc->dying = 0;
4786 	ether_ifattach(ifp, sc->mac_addr);
4787 	/* ether_ifattach sets mtu to ETHERMTU */
4788 	if (mxge_initial_mtu != ETHERMTU)
4789 		mxge_change_mtu(sc, mxge_initial_mtu);
4790 
4791 	mxge_add_sysctls(sc);
4792 	if_settransmitfn(ifp, mxge_transmit);
4793 	if_setqflushfn(ifp, mxge_qflush);
4794 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4795 				device_get_nameunit(sc->dev));
4796 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4797 	return 0;
4798 
4799 abort_with_rings:
4800 	mxge_free_rings(sc);
4801 abort_with_slices:
4802 	mxge_free_slices(sc);
4803 abort_with_dmabench:
4804 	mxge_dma_free(&sc->dmabench_dma);
4805 abort_with_zeropad_dma:
4806 	mxge_dma_free(&sc->zeropad_dma);
4807 abort_with_cmd_dma:
4808 	mxge_dma_free(&sc->cmd_dma);
4809 abort_with_mem_res:
4810 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4811 abort_with_lock:
4812 	pci_disable_busmaster(dev);
4813 	mtx_destroy(&sc->cmd_mtx);
4814 	mtx_destroy(&sc->driver_mtx);
4815 	if_free(ifp);
4816 abort_with_parent_dmat:
4817 	bus_dma_tag_destroy(sc->parent_dmat);
4818 abort_with_tq:
4819 	if (sc->tq != NULL) {
4820 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4821 		taskqueue_free(sc->tq);
4822 		sc->tq = NULL;
4823 	}
4824 abort_with_nothing:
4825 	return err;
4826 }
4827 
4828 static int
4829 mxge_detach(device_t dev)
4830 {
4831 	mxge_softc_t *sc = device_get_softc(dev);
4832 
4833 	if (mxge_vlans_active(sc)) {
4834 		device_printf(sc->dev,
4835 			      "Detach vlans before removing module\n");
4836 		return EBUSY;
4837 	}
4838 	mtx_lock(&sc->driver_mtx);
4839 	sc->dying = 1;
4840 	if (if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING)
4841 		mxge_close(sc, 0);
4842 	mtx_unlock(&sc->driver_mtx);
4843 	ether_ifdetach(sc->ifp);
4844 	if (sc->tq != NULL) {
4845 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4846 		taskqueue_free(sc->tq);
4847 		sc->tq = NULL;
4848 	}
4849 	callout_drain(&sc->co_hdl);
4850 	ifmedia_removeall(&sc->media);
4851 	mxge_dummy_rdma(sc, 0);
4852 	mxge_rem_sysctls(sc);
4853 	mxge_rem_irq(sc);
4854 	mxge_free_rings(sc);
4855 	mxge_free_slices(sc);
4856 	mxge_dma_free(&sc->dmabench_dma);
4857 	mxge_dma_free(&sc->zeropad_dma);
4858 	mxge_dma_free(&sc->cmd_dma);
4859 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4860 	pci_disable_busmaster(dev);
4861 	mtx_destroy(&sc->cmd_mtx);
4862 	mtx_destroy(&sc->driver_mtx);
4863 	if_free(sc->ifp);
4864 	bus_dma_tag_destroy(sc->parent_dmat);
4865 	return 0;
4866 }
4867 
4868 static int
4869 mxge_shutdown(device_t dev)
4870 {
4871 	return 0;
4872 }
4873 
4874 /*
4875   This file uses Myri10GE driver indentation.
4876 
4877   Local Variables:
4878   c-file-style:"linux"
4879   tab-width:8
4880   End:
4881 */
4882