xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 83129c0b650734f4ca6c6f9c214aa3110d2f129d)
1 /******************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 
4 Copyright (c) 2006-2013, Myricom Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/sockio.h>
40 #include <sys/mbuf.h>
41 #include <sys/malloc.h>
42 #include <sys/kdb.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/module.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/sx.h>
49 #include <sys/taskqueue.h>
50 #include <contrib/zlib/zlib.h>
51 #include <dev/zlib/zcalloc.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 #include <net/if_arp.h>
56 #include <net/ethernet.h>
57 #include <net/if_dl.h>
58 #include <net/if_media.h>
59 
60 #include <net/bpf.h>
61 
62 #include <net/if_types.h>
63 #include <net/if_vlan_var.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip6.h>
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_lro.h>
71 #include <netinet6/ip6_var.h>
72 
73 #include <machine/bus.h>
74 #include <machine/in_cksum.h>
75 #include <machine/resource.h>
76 #include <sys/bus.h>
77 #include <sys/rman.h>
78 #include <sys/smp.h>
79 
80 #include <dev/pci/pcireg.h>
81 #include <dev/pci/pcivar.h>
82 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
83 
84 #include <vm/vm.h>		/* for pmap_mapdev() */
85 #include <vm/pmap.h>
86 
87 #if defined(__i386) || defined(__amd64)
88 #include <machine/specialreg.h>
89 #endif
90 
91 #include <dev/mxge/mxge_mcp.h>
92 #include <dev/mxge/mcp_gen_header.h>
93 /*#define MXGE_FAKE_IFP*/
94 #include <dev/mxge/if_mxge_var.h>
95 #ifdef IFNET_BUF_RING
96 #include <sys/buf_ring.h>
97 #endif
98 
99 #include "opt_inet.h"
100 #include "opt_inet6.h"
101 
102 /* tunable params */
103 static int mxge_nvidia_ecrc_enable = 1;
104 static int mxge_force_firmware = 0;
105 static int mxge_intr_coal_delay = 30;
106 static int mxge_deassert_wait = 1;
107 static int mxge_flow_control = 1;
108 static int mxge_verbose = 0;
109 static int mxge_ticks;
110 static int mxge_max_slices = 1;
111 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
112 static int mxge_always_promisc = 0;
113 static int mxge_initial_mtu = ETHERMTU_JUMBO;
114 static int mxge_throttle = 0;
115 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
116 static char *mxge_fw_aligned = "mxge_eth_z8e";
117 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
118 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
119 
120 static int mxge_probe(device_t dev);
121 static int mxge_attach(device_t dev);
122 static int mxge_detach(device_t dev);
123 static int mxge_shutdown(device_t dev);
124 static void mxge_intr(void *arg);
125 
126 static device_method_t mxge_methods[] =
127 {
128   /* Device interface */
129   DEVMETHOD(device_probe, mxge_probe),
130   DEVMETHOD(device_attach, mxge_attach),
131   DEVMETHOD(device_detach, mxge_detach),
132   DEVMETHOD(device_shutdown, mxge_shutdown),
133 
134   DEVMETHOD_END
135 };
136 
137 static driver_t mxge_driver =
138 {
139   "mxge",
140   mxge_methods,
141   sizeof(mxge_softc_t),
142 };
143 
144 static devclass_t mxge_devclass;
145 
146 /* Declare ourselves to be a child of the PCI bus.*/
147 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
148 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
149 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
150 
151 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
152 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
153 static int mxge_close(mxge_softc_t *sc, int down);
154 static int mxge_open(mxge_softc_t *sc);
155 static void mxge_tick(void *arg);
156 
157 static int
158 mxge_probe(device_t dev)
159 {
160 	int rev;
161 
162 
163 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
164 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
165 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
166 		rev = pci_get_revid(dev);
167 		switch (rev) {
168 		case MXGE_PCI_REV_Z8E:
169 			device_set_desc(dev, "Myri10G-PCIE-8A");
170 			break;
171 		case MXGE_PCI_REV_Z8ES:
172 			device_set_desc(dev, "Myri10G-PCIE-8B");
173 			break;
174 		default:
175 			device_set_desc(dev, "Myri10G-PCIE-8??");
176 			device_printf(dev, "Unrecognized rev %d NIC\n",
177 				      rev);
178 			break;
179 		}
180 		return 0;
181 	}
182 	return ENXIO;
183 }
184 
185 static void
186 mxge_enable_wc(mxge_softc_t *sc)
187 {
188 #if defined(__i386) || defined(__amd64)
189 	vm_offset_t len;
190 	int err;
191 
192 	sc->wc = 1;
193 	len = rman_get_size(sc->mem_res);
194 	err = pmap_change_attr((vm_offset_t) sc->sram,
195 			       len, PAT_WRITE_COMBINING);
196 	if (err != 0) {
197 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
198 			      err);
199 		sc->wc = 0;
200 	}
201 #endif
202 }
203 
204 
205 /* callback to get our DMA address */
206 static void
207 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
208 			 int error)
209 {
210 	if (error == 0) {
211 		*(bus_addr_t *) arg = segs->ds_addr;
212 	}
213 }
214 
215 static int
216 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
217 		   bus_size_t alignment)
218 {
219 	int err;
220 	device_t dev = sc->dev;
221 	bus_size_t boundary, maxsegsize;
222 
223 	if (bytes > 4096 && alignment == 4096) {
224 		boundary = 0;
225 		maxsegsize = bytes;
226 	} else {
227 		boundary = 4096;
228 		maxsegsize = 4096;
229 	}
230 
231 	/* allocate DMAable memory tags */
232 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
233 				 alignment,		/* alignment */
234 				 boundary,		/* boundary */
235 				 BUS_SPACE_MAXADDR,	/* low */
236 				 BUS_SPACE_MAXADDR,	/* high */
237 				 NULL, NULL,		/* filter */
238 				 bytes,			/* maxsize */
239 				 1,			/* num segs */
240 				 maxsegsize,		/* maxsegsize */
241 				 BUS_DMA_COHERENT,	/* flags */
242 				 NULL, NULL,		/* lock */
243 				 &dma->dmat);		/* tag */
244 	if (err != 0) {
245 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
246 		return err;
247 	}
248 
249 	/* allocate DMAable memory & map */
250 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
251 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
252 				| BUS_DMA_ZERO),  &dma->map);
253 	if (err != 0) {
254 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
255 		goto abort_with_dmat;
256 	}
257 
258 	/* load the memory */
259 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
260 			      mxge_dmamap_callback,
261 			      (void *)&dma->bus_addr, 0);
262 	if (err != 0) {
263 		device_printf(dev, "couldn't load map (err = %d)\n", err);
264 		goto abort_with_mem;
265 	}
266 	return 0;
267 
268 abort_with_mem:
269 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
270 abort_with_dmat:
271 	(void)bus_dma_tag_destroy(dma->dmat);
272 	return err;
273 }
274 
275 
276 static void
277 mxge_dma_free(mxge_dma_t *dma)
278 {
279 	bus_dmamap_unload(dma->dmat, dma->map);
280 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
281 	(void)bus_dma_tag_destroy(dma->dmat);
282 }
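
/*
 * Illustrative usage sketch (not compiled): the pair above follows the
 * usual three-step busdma recipe -- create a tag, allocate and map the
 * memory, then load the map to learn the bus address.  A caller wanting
 * a 4KB, 4KB-aligned block might do (error handling elided;
 * "example_dma" is a hypothetical local):
 */
#if 0
	mxge_dma_t example_dma;

	if (mxge_dma_alloc(sc, &example_dma, 4096, 4096) == 0) {
		/* example_dma.addr is the kernel VA;
		   example_dma.bus_addr is what the NIC DMAs to/from */
		mxge_dma_free(&example_dma);
	}
#endif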
283 
284 /*
285  * The eeprom strings on the lanaiX have the format
286  * SN=x\0
287  * MAC=x:x:x:x:x:x\0
288  * PC=text\0
289  */
290 
291 static int
292 mxge_parse_strings(mxge_softc_t *sc)
293 {
294 	char *ptr;
295 	int i, found_mac, found_sn2;
296 	char *endptr;
297 
298 	ptr = sc->eeprom_strings;
299 	found_mac = 0;
300 	found_sn2 = 0;
301 	while (*ptr != '\0') {
302 		if (strncmp(ptr, "MAC=", 4) == 0) {
303 			ptr += 4;
304 			for (i = 0;;) {
305 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
306 				if (endptr - ptr != 2)
307 					goto abort;
308 				ptr = endptr;
309 				if (++i == 6)
310 					break;
311 				if (*ptr++ != ':')
312 					goto abort;
313 			}
314 			found_mac = 1;
315 		} else if (strncmp(ptr, "PC=", 3) == 0) {
316 			ptr += 3;
317 			strlcpy(sc->product_code_string, ptr,
318 			    sizeof(sc->product_code_string));
319 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
320 			ptr += 3;
321 			strlcpy(sc->serial_number_string, ptr,
322 			    sizeof(sc->serial_number_string));
323 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
324 			/* SN2 takes precedence over SN */
325 			ptr += 4;
326 			found_sn2 = 1;
327 			strlcpy(sc->serial_number_string, ptr,
328 			    sizeof(sc->serial_number_string));
329 		}
330 		while (*ptr++ != '\0') {}
331 	}
332 
333 	if (found_mac)
334 		return 0;
335 
336  abort:
337 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
338 
339 	return ENXIO;
340 }
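
/*
 * Example (illustrative data, not from a real board): the parser above
 * walks a buffer of consecutive NUL-terminated strings such as
 *
 *	"MAC=00:60:dd:47:ab:cd\0SN=123456\0PC=SAMPLE-CODE\0\0"
 *
 * yielding mac_addr 00:60:dd:47:ab:cd, serial_number_string "123456"
 * and product_code_string "SAMPLE-CODE".  A later "SN2=" entry, if
 * present, overrides the serial number taken from "SN=".
 */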
341 
342 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
343 static void
344 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
345 {
346 	uint32_t val;
347 	unsigned long base, off;
348 	char *va, *cfgptr;
349 	device_t pdev, mcp55;
350 	uint16_t vendor_id, device_id, word;
351 	uintptr_t bus, slot, func, ivend, idev;
352 	uint32_t *ptr32;
353 
354 
355 	if (!mxge_nvidia_ecrc_enable)
356 		return;
357 
358 	pdev = device_get_parent(device_get_parent(sc->dev));
359 	if (pdev == NULL) {
360 		device_printf(sc->dev, "could not find parent?\n");
361 		return;
362 	}
363 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
364 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
365 
366 	if (vendor_id != 0x10de)
367 		return;
368 
369 	base = 0;
370 
371 	if (device_id == 0x005d) {
372 		/* ck804, base address is magic */
373 		base = 0xe0000000UL;
374 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
375 		/* mcp55, base address stored in chipset */
376 		mcp55 = pci_find_bsf(0, 0, 0);
377 		if (mcp55 &&
378 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
379 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
380 			word = pci_read_config(mcp55, 0x90, 2);
381 			base = ((unsigned long)word & 0x7ffeU) << 25;
382 		}
383 	}
384 	if (!base)
385 		return;
386 
387 	/* XXXX
	   Test below is commented out because it is believed that doing
389 	   config read/write beyond 0xff will access the config space
390 	   for the next larger function.  Uncomment this and remove
391 	   the hacky pmap_mapdev() way of accessing config space when
392 	   FreeBSD grows support for extended pcie config space access
393 	*/
394 #if 0
395 	/* See if we can, by some miracle, access the extended
396 	   config space */
397 	val = pci_read_config(pdev, 0x178, 4);
398 	if (val != 0xffffffff) {
399 		val |= 0x40;
400 		pci_write_config(pdev, 0x178, val, 4);
401 		return;
402 	}
403 #endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, which means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them, are not visible by this method.
	 */
411 
412 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
413 		      PCI_IVAR_BUS, &bus);
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_SLOT, &slot);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_FUNCTION, &func);
418 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 		      PCI_IVAR_VENDOR, &ivend);
420 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
421 		      PCI_IVAR_DEVICE, &idev);
422 
423 	off =  base
424 		+ 0x00100000UL * (unsigned long)bus
425 		+ 0x00001000UL * (unsigned long)(func
426 						 + 8 * slot);
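
	/*
	 * Worked example: a bridge at bus 2, slot 3, function 0 with the
	 * ck804 base gives off = 0xe0000000 + 2 * 0x00100000 +
	 * (0 + 8 * 3) * 0x00001000 = 0xe0218000, i.e. the usual extended
	 * config layout of 4KB of config space per function.
	 */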
427 
428 	/* map it into the kernel */
429 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
430 
431 
432 	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
434 		return;
435 	}
436 	/* get a pointer to the config space mapped into the kernel */
437 	cfgptr = va + (off & PAGE_MASK);
438 
439 	/* make sure that we can really access it */
440 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
441 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
442 	if (! (vendor_id == ivend && device_id == idev)) {
443 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
444 			      vendor_id, device_id);
445 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
446 		return;
447 	}
448 
449 	ptr32 = (uint32_t*)(cfgptr + 0x178);
450 	val = *ptr32;
451 
452 	if (val == 0xffffffff) {
453 		device_printf(sc->dev, "extended mapping failed\n");
454 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
455 		return;
456 	}
457 	*ptr32 = val | 0x40;
458 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
459 	if (mxge_verbose)
460 		device_printf(sc->dev,
461 			      "Enabled ECRC on upstream Nvidia bridge "
462 			      "at %d:%d:%d\n",
463 			      (int)bus, (int)slot, (int)func);
464 	return;
465 }
466 #else
467 static void
468 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
469 {
470 	device_printf(sc->dev,
471 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
472 	return;
473 }
474 #endif
475 
476 
477 static int
478 mxge_dma_test(mxge_softc_t *sc, int test_type)
479 {
480 	mxge_cmd_t cmd;
481 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
482 	int status;
483 	uint32_t len;
484 	char *test = " ";
485 
486 
	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
495 
496 	len = sc->tx_boundary;
497 
498 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
499 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
500 	cmd.data2 = len * 0x10000;
501 	status = mxge_send_cmd(sc, test_type, &cmd);
502 	if (status != 0) {
503 		test = "read";
504 		goto abort;
505 	}
506 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
507 		(cmd.data0 & 0xffff);
508 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
509 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
510 	cmd.data2 = len * 0x1;
511 	status = mxge_send_cmd(sc, test_type, &cmd);
512 	if (status != 0) {
513 		test = "write";
514 		goto abort;
515 	}
516 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
517 		(cmd.data0 & 0xffff);
518 
519 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
520 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
521 	cmd.data2 = len * 0x10001;
522 	status = mxge_send_cmd(sc, test_type, &cmd);
523 	if (status != 0) {
524 		test = "read/write";
525 		goto abort;
526 	}
527 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
528 		(cmd.data0 & 0xffff);
529 
530 abort:
531 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
532 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
533 			      test, status);
534 
535 	return status;
536 }
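
/*
 * Worked example (illustrative numbers): with len = tx_boundary = 4096
 * and a result of cmd.data0 = 0x00640190, the firmware completed
 * 0x0064 = 100 transfers in 0x0190 = 400 half-microsecond ticks, i.e.
 * 100 * 4096 bytes in 200us.  Since MB/s = bytes * 2 / ticks, the code
 * above computes (100 * 4096 * 2) / 400 = 2048 MB/s.
 */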
537 
538 /*
539  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
540  * when the PCI-E Completion packets are aligned on an 8-byte
541  * boundary.  Some PCI-E chip sets always align Completion packets; on
542  * the ones that do not, the alignment can be enforced by enabling
543  * ECRC generation (if supported).
544  *
545  * When PCI-E Completion packets are not aligned, it is actually more
546  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
547  *
548  * If the driver can neither enable ECRC nor verify that it has
549  * already been enabled, then it must use a firmware image which works
550  * around unaligned completion packets (ethp_z8e.dat), and it should
551  * also ensure that it never gives the device a Read-DMA which is
552  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
553  * enabled, then the driver should use the aligned (eth_z8e.dat)
554  * firmware image, and set tx_boundary to 4KB.
555  */
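
/*
 * Summary of the cases above:
 *
 *	PCI-E completions		firmware	tx_boundary
 *	aligned (or ECRC enabled)	eth_z8e		4096
 *	possibly unaligned		ethp_z8e	2048
 */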
556 
557 static int
558 mxge_firmware_probe(mxge_softc_t *sc)
559 {
560 	device_t dev = sc->dev;
561 	int reg, status;
562 	uint16_t pectl;
563 
564 	sc->tx_boundary = 4096;
565 	/*
566 	 * Verify the max read request size was set to 4KB
567 	 * before trying the test with 4KB.
568 	 */
569 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
570 		pectl = pci_read_config(dev, reg + 0x8, 2);
571 		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
573 				      pectl);
574 			sc->tx_boundary = 2048;
575 		}
576 	}
577 
578 	/*
579 	 * load the optimized firmware (which assumes aligned PCIe
580 	 * completions) in order to see if it works on this host.
581 	 */
582 	sc->fw_name = mxge_fw_aligned;
583 	status = mxge_load_firmware(sc, 1);
584 	if (status != 0) {
585 		return status;
586 	}
587 
588 	/*
589 	 * Enable ECRC if possible
590 	 */
591 	mxge_enable_nvidia_ecrc(sc);
592 
593 	/*
594 	 * Run a DMA test which watches for unaligned completions and
595 	 * aborts on the first one seen.  Not required on Z8ES or newer.
596 	 */
597 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
598 		return 0;
599 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
600 	if (status == 0)
601 		return 0; /* keep the aligned firmware */
602 
603 	if (status != E2BIG)
604 		device_printf(dev, "DMA test failed: %d\n", status);
605 	if (status == ENOSYS)
606 		device_printf(dev, "Falling back to ethp! "
			      "Please install up-to-date firmware\n");
608 	return status;
609 }
610 
611 static int
612 mxge_select_firmware(mxge_softc_t *sc)
613 {
614 	int aligned = 0;
615 	int force_firmware = mxge_force_firmware;
616 
617 	if (sc->throttle)
618 		force_firmware = sc->throttle;
619 
620 	if (force_firmware != 0) {
621 		if (force_firmware == 1)
622 			aligned = 1;
623 		else
624 			aligned = 0;
625 		if (mxge_verbose)
626 			device_printf(sc->dev,
627 				      "Assuming %s completions (forced)\n",
628 				      aligned ? "aligned" : "unaligned");
629 		goto abort;
630 	}
631 
632 	/* if the PCIe link width is 4 or less, we can use the aligned
633 	   firmware and skip any checks */
634 	if (sc->link_width != 0 && sc->link_width <= 4) {
635 		device_printf(sc->dev,
636 			      "PCIe x%d Link, expect reduced performance\n",
637 			      sc->link_width);
638 		aligned = 1;
639 		goto abort;
640 	}
641 
642 	if (0 == mxge_firmware_probe(sc))
643 		return 0;
644 
645 abort:
646 	if (aligned) {
647 		sc->fw_name = mxge_fw_aligned;
648 		sc->tx_boundary = 4096;
649 	} else {
650 		sc->fw_name = mxge_fw_unaligned;
651 		sc->tx_boundary = 2048;
652 	}
653 	return (mxge_load_firmware(sc, 0));
654 }
655 
656 static int
657 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
658 {
659 
660 
661 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
662 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
663 			      be32toh(hdr->mcp_type));
664 		return EIO;
665 	}
666 
667 	/* save firmware version for sysctl */
668 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
669 	if (mxge_verbose)
670 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
671 
672 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
673 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
674 
675 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
676 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
677 		device_printf(sc->dev, "Found firmware version %s\n",
678 			      sc->fw_version);
679 		device_printf(sc->dev, "Driver needs %d.%d\n",
680 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
681 		return EINVAL;
682 	}
683 	return 0;
684 
685 }
686 
687 static int
688 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
689 {
690 	z_stream zs;
691 	char *inflate_buffer;
692 	const struct firmware *fw;
693 	const mcp_gen_header_t *hdr;
694 	unsigned hdr_offset;
695 	int status;
696 	unsigned int i;
697 	char dummy;
698 	size_t fw_len;
699 
700 	fw = firmware_get(sc->fw_name);
701 	if (fw == NULL) {
702 		device_printf(sc->dev, "Could not find firmware image %s\n",
703 			      sc->fw_name);
704 		return ENOENT;
705 	}
706 
707 
708 
709 	/* setup zlib and decompress f/w */
710 	bzero(&zs, sizeof (zs));
711 	zs.zalloc = zcalloc_nowait;
712 	zs.zfree = zcfree;
713 	status = inflateInit(&zs);
714 	if (status != Z_OK) {
715 		status = EIO;
716 		goto abort_with_fw;
717 	}
718 
719 	/* the uncompressed size is stored as the firmware version,
720 	   which would otherwise go unused */
721 	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		/* without this, a failed allocation would return success */
		status = ENOMEM;
		goto abort_with_zs;
	}
725 	zs.avail_in = fw->datasize;
726 	zs.next_in = __DECONST(char *, fw->data);
727 	zs.avail_out = fw_len;
728 	zs.next_out = inflate_buffer;
729 	status = inflate(&zs, Z_FINISH);
730 	if (status != Z_STREAM_END) {
731 		device_printf(sc->dev, "zlib %d\n", status);
732 		status = EIO;
733 		goto abort_with_buffer;
734 	}
735 
736 	/* check id */
737 	hdr_offset = htobe32(*(const uint32_t *)
738 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
739 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
741 		status = EIO;
742 		goto abort_with_buffer;
743 	}
744 	hdr = (const void*)(inflate_buffer + hdr_offset);
745 
746 	status = mxge_validate_firmware(sc, hdr);
747 	if (status != 0)
748 		goto abort_with_buffer;
749 
750 	/* Copy the inflated firmware to NIC SRAM. */
751 	for (i = 0; i < fw_len; i += 256) {
752 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
753 			      inflate_buffer + i,
754 			      min(256U, (unsigned)(fw_len - i)));
755 		wmb();
756 		dummy = *sc->sram;
757 		wmb();
758 	}
759 
760 	*limit = fw_len;
761 	status = 0;
762 abort_with_buffer:
763 	free(inflate_buffer, M_TEMP);
764 abort_with_zs:
765 	inflateEnd(&zs);
766 abort_with_fw:
767 	firmware_put(fw, FIRMWARE_UNLOAD);
768 	return status;
769 }
770 
771 /*
772  * Enable or disable periodic RDMAs from the host to make certain
773  * chipsets resend dropped PCIe messages
774  */
775 
776 static void
777 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
778 {
779 	char buf_bytes[72];
780 	volatile uint32_t *confirm;
781 	volatile char *submit;
782 	uint32_t *buf, dma_low, dma_high;
783 	int i;
784 
785 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
786 
787 	/* clear confirmation addr */
788 	confirm = (volatile uint32_t *)sc->cmd;
789 	*confirm = 0;
790 	wmb();
791 
792 	/* send an rdma command to the PCIe engine, and wait for the
793 	   response in the confirmation address.  The firmware should
794 	   write a -1 there to indicate it is alive and well
795 	*/
796 
797 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
798 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
799 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
800 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
801 	buf[2] = htobe32(0xffffffff);		/* confirm data */
802 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
803 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
804 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
805 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
806 	buf[5] = htobe32(enable);			/* enable? */
807 
808 
809 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
810 
811 	mxge_pio_copy(submit, buf, 64);
812 	wmb();
813 	DELAY(1000);
814 	wmb();
815 	i = 0;
816 	while (*confirm != 0xffffffff && i < 20) {
817 		DELAY(1000);
818 		i++;
819 	}
820 	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
822 			      (enable ? "enable" : "disable"), confirm,
823 			      *confirm);
824 	}
825 	return;
826 }
827 
828 static int
829 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
830 {
831 	mcp_cmd_t *buf;
832 	char buf_bytes[sizeof(*buf) + 8];
833 	volatile mcp_cmd_response_t *response = sc->cmd;
834 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
835 	uint32_t dma_low, dma_high;
836 	int err, sleep_total = 0;
837 
838 	/* ensure buf is aligned to 8 bytes */
839 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
840 
841 	buf->data0 = htobe32(data->data0);
842 	buf->data1 = htobe32(data->data1);
843 	buf->data2 = htobe32(data->data2);
844 	buf->cmd = htobe32(cmd);
845 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
846 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
847 
848 	buf->response_addr.low = htobe32(dma_low);
849 	buf->response_addr.high = htobe32(dma_high);
850 	mtx_lock(&sc->cmd_mtx);
851 	response->result = 0xffffffff;
852 	wmb();
853 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
854 
855 	/* wait up to 20ms */
856 	err = EAGAIN;
857 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
858 		bus_dmamap_sync(sc->cmd_dma.dmat,
859 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
860 		wmb();
861 		switch (be32toh(response->result)) {
862 		case 0:
863 			data->data0 = be32toh(response->data);
864 			err = 0;
865 			break;
866 		case 0xffffffff:
867 			DELAY(1000);
868 			break;
869 		case MXGEFW_CMD_UNKNOWN:
870 			err = ENOSYS;
871 			break;
872 		case MXGEFW_CMD_ERROR_UNALIGNED:
873 			err = E2BIG;
874 			break;
875 		case MXGEFW_CMD_ERROR_BUSY:
876 			err = EBUSY;
877 			break;
878 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
879 			err = ENXIO;
880 			break;
881 		default:
882 			device_printf(sc->dev,
883 				      "mxge: command %d "
884 				      "failed, result = %d\n",
885 				      cmd, be32toh(response->result));
886 			err = ENXIO;
887 			break;
888 		}
889 		if (err != EAGAIN)
890 			break;
891 	}
892 	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
895 			      cmd, be32toh(response->result));
896 	mtx_unlock(&sc->cmd_mtx);
897 	return err;
898 }
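
/*
 * Illustrative usage sketch (not compiled): callers fill in a
 * mxge_cmd_t, issue the command, and read any result back from
 * cmd.data0.  Fetching the IRQ ack offset, as mxge_reset() does
 * below, looks like:
 */
#if 0
	mxge_cmd_t cmd;
	volatile uint32_t *irq_claim;
	int status;

	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	if (status == 0)
		irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
#endif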
899 
900 static int
901 mxge_adopt_running_firmware(mxge_softc_t *sc)
902 {
903 	struct mcp_gen_header *hdr;
904 	const size_t bytes = sizeof (struct mcp_gen_header);
905 	size_t hdr_offset;
906 	int status;
907 
908 	/* find running firmware header */
909 	hdr_offset = htobe32(*(volatile uint32_t *)
910 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
911 
912 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
913 		device_printf(sc->dev,
914 			      "Running firmware has bad header offset (%d)\n",
915 			      (int)hdr_offset);
916 		return EIO;
917 	}
918 
919 	/* copy header of running firmware from SRAM to host memory to
920 	 * validate firmware */
921 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
922 	if (hdr == NULL) {
923 		device_printf(sc->dev, "could not malloc firmware hdr\n");
924 		return ENOMEM;
925 	}
926 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
927 				rman_get_bushandle(sc->mem_res),
928 				hdr_offset, (char *)hdr, bytes);
929 	status = mxge_validate_firmware(sc, hdr);
930 	free(hdr, M_DEVBUF);
931 
932 	/*
933 	 * check to see if adopted firmware has bug where adopting
934 	 * it will cause broadcasts to be filtered unless the NIC
935 	 * is kept in ALLMULTI mode
936 	 */
937 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
938 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
939 		sc->adopted_rx_filter_bug = 1;
940 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
941 			      "working around rx filter bug\n",
942 			      sc->fw_ver_major, sc->fw_ver_minor,
943 			      sc->fw_ver_tiny);
944 	}
945 
946 	return status;
947 }
948 
949 
950 static int
951 mxge_load_firmware(mxge_softc_t *sc, int adopt)
952 {
953 	volatile uint32_t *confirm;
954 	volatile char *submit;
955 	char buf_bytes[72];
956 	uint32_t *buf, size, dma_low, dma_high;
957 	int status, i;
958 
959 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
960 
961 	size = sc->sram_size;
962 	status = mxge_load_firmware_helper(sc, &size);
963 	if (status) {
964 		if (!adopt)
965 			return status;
966 		/* Try to use the currently running firmware, if
967 		   it is new enough */
968 		status = mxge_adopt_running_firmware(sc);
969 		if (status) {
970 			device_printf(sc->dev,
971 				      "failed to adopt running firmware\n");
972 			return status;
973 		}
974 		device_printf(sc->dev,
975 			      "Successfully adopted running firmware\n");
976 		if (sc->tx_boundary == 4096) {
977 			device_printf(sc->dev,
978 				"Using firmware currently running on NIC"
979 				 ".  For optimal\n");
980 			device_printf(sc->dev,
981 				 "performance consider loading optimized "
982 				 "firmware\n");
983 		}
984 		sc->fw_name = mxge_fw_unaligned;
985 		sc->tx_boundary = 2048;
986 		return 0;
987 	}
988 	/* clear confirmation addr */
989 	confirm = (volatile uint32_t *)sc->cmd;
990 	*confirm = 0;
991 	wmb();
992 	/* send a reload command to the bootstrap MCP, and wait for the
993 	   response in the confirmation address.  The firmware should
994 	   write a -1 there to indicate it is alive and well
995 	*/
996 
997 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
998 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
999 
1000 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1001 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1002 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1003 
1004 	/* FIX: All newest firmware should un-protect the bottom of
1005 	   the sram before handoff. However, the very first interfaces
1006 	   do not. Therefore the handoff copy must skip the first 8 bytes
1007 	*/
1008 					/* where the code starts*/
1009 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1010 	buf[4] = htobe32(size - 8); 	/* length of code */
1011 	buf[5] = htobe32(8);		/* where to copy to */
1012 	buf[6] = htobe32(0);		/* where to jump to */
1013 
1014 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1015 	mxge_pio_copy(submit, buf, 64);
1016 	wmb();
1017 	DELAY(1000);
1018 	wmb();
1019 	i = 0;
1020 	while (*confirm != 0xffffffff && i < 20) {
1021 		DELAY(1000*10);
1022 		i++;
1023 		bus_dmamap_sync(sc->cmd_dma.dmat,
1024 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1025 	}
1026 	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			      confirm, *confirm);
1029 
1030 		return ENXIO;
1031 	}
1032 	return 0;
1033 }
1034 
1035 static int
1036 mxge_update_mac_address(mxge_softc_t *sc)
1037 {
1038 	mxge_cmd_t cmd;
1039 	uint8_t *addr = sc->mac_addr;
1040 	int status;
1041 
1042 
1043 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1044 		     | (addr[2] << 8) | addr[3]);
1045 
1046 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1047 
1048 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1049 	return status;
1050 }
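
/*
 * Example: for mac_addr 00:60:dd:12:34:56 the packing above yields
 * data0 = 0x0060dd12 (first four octets) and data1 = 0x00003456
 * (last two octets), which is the layout the firmware expects.
 */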
1051 
1052 static int
1053 mxge_change_pause(mxge_softc_t *sc, int pause)
1054 {
1055 	mxge_cmd_t cmd;
1056 	int status;
1057 
1058 	if (pause)
1059 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1060 				       &cmd);
1061 	else
1062 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1063 				       &cmd);
1064 
1065 	if (status) {
1066 		device_printf(sc->dev, "Failed to set flow control mode\n");
1067 		return ENXIO;
1068 	}
1069 	sc->pause = pause;
1070 	return 0;
1071 }
1072 
1073 static void
1074 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1075 {
1076 	mxge_cmd_t cmd;
1077 	int status;
1078 
1079 	if (mxge_always_promisc)
1080 		promisc = 1;
1081 
1082 	if (promisc)
1083 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1084 				       &cmd);
1085 	else
1086 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1087 				       &cmd);
1088 
1089 	if (status) {
1090 		device_printf(sc->dev, "Failed to set promisc mode\n");
1091 	}
1092 }
1093 
1094 static void
1095 mxge_set_multicast_list(mxge_softc_t *sc)
1096 {
1097 	mxge_cmd_t cmd;
1098 	struct ifmultiaddr *ifma;
1099 	struct ifnet *ifp = sc->ifp;
1100 	int err;
1101 
1102 	/* This firmware is known to not support multicast */
1103 	if (!sc->fw_multicast_support)
1104 		return;
1105 
	/* Disable multicast filtering while we play with the lists */
1107 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1108 	if (err != 0) {
1109 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1110 		       " error status: %d\n", err);
1111 		return;
1112 	}
1113 
1114 	if (sc->adopted_rx_filter_bug)
1115 		return;
1116 
1117 	if (ifp->if_flags & IFF_ALLMULTI)
1118 		/* request to disable multicast filtering, so quit here */
1119 		return;
1120 
1121 	/* Flush all the filters */
1122 
1123 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1124 	if (err != 0) {
1125 		device_printf(sc->dev,
1126 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1127 			      ", error status: %d\n", err);
1128 		return;
1129 	}
1130 
1131 	/* Walk the multicast list, and add each address */
1132 
1133 	if_maddr_rlock(ifp);
1134 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1135 		if (ifma->ifma_addr->sa_family != AF_LINK)
1136 			continue;
1137 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1138 		      &cmd.data0, 4);
1139 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1140 		      &cmd.data1, 2);
1141 		cmd.data0 = htonl(cmd.data0);
1142 		cmd.data1 = htonl(cmd.data1);
1143 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1144 		if (err != 0) {
1145 			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
			       "%d\n", err);
1148 			/* abort, leaving multicast filtering off */
1149 			if_maddr_runlock(ifp);
1150 			return;
1151 		}
1152 	}
1153 	if_maddr_runlock(ifp);
1154 	/* Enable multicast filtering */
1155 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1156 	if (err != 0) {
1157 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1158 		       ", error status: %d\n", err);
1159 	}
1160 }
1161 
1162 static int
1163 mxge_max_mtu(mxge_softc_t *sc)
1164 {
1165 	mxge_cmd_t cmd;
1166 	int status;
1167 
1168 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1169 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1170 
	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
1173 	cmd.data0 = 0;
1174 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1175 			       &cmd);
1176 	if (status == 0)
1177 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1178 
1179 	/* otherwise, we're limited to MJUMPAGESIZE */
1180 	return MJUMPAGESIZE - MXGEFW_PAD;
1181 }
1182 
1183 static int
1184 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1185 {
1186 	struct mxge_slice_state *ss;
1187 	mxge_rx_done_t *rx_done;
1188 	volatile uint32_t *irq_claim;
1189 	mxge_cmd_t cmd;
1190 	int slice, status;
1191 
1192 	/* try to send a reset command to the card to see if it
1193 	   is alive */
1194 	memset(&cmd, 0, sizeof (cmd));
1195 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1196 	if (status != 0) {
1197 		device_printf(sc->dev, "failed reset\n");
1198 		return ENXIO;
1199 	}
1200 
1201 	mxge_dummy_rdma(sc, 1);
1202 
1203 
1204 	/* set the intrq size */
1205 	cmd.data0 = sc->rx_ring_size;
1206 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1207 
1208 	/*
1209 	 * Even though we already know how many slices are supported
1210 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1211 	 * has magic side effects, and must be called after a reset.
1212 	 * It must be called prior to calling any RSS related cmds,
1213 	 * including assigning an interrupt queue for anything but
1214 	 * slice 0.  It must also be called *after*
1215 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1216 	 * the firmware to compute offsets.
1217 	 */
1218 
1219 	if (sc->num_slices > 1) {
1220 		/* ask the maximum number of slices it supports */
1221 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1222 					   &cmd);
1223 		if (status != 0) {
1224 			device_printf(sc->dev,
1225 				      "failed to get number of slices\n");
1226 			return status;
1227 		}
1228 		/*
1229 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1230 		 * to setting up the interrupt queue DMA
1231 		 */
1232 		cmd.data0 = sc->num_slices;
1233 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1234 #ifdef IFNET_BUF_RING
1235 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1236 #endif
1237 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1238 					   &cmd);
1239 		if (status != 0) {
1240 			device_printf(sc->dev,
1241 				      "failed to set number of slices\n");
1242 			return status;
1243 		}
1244 	}
1245 
1246 
1247 	if (interrupts_setup) {
1248 		/* Now exchange information about interrupts  */
1249 		for (slice = 0; slice < sc->num_slices; slice++) {
1250 			rx_done = &sc->ss[slice].rx_done;
1251 			memset(rx_done->entry, 0, sc->rx_ring_size);
1252 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1253 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1254 			cmd.data2 = slice;
1255 			status |= mxge_send_cmd(sc,
1256 						MXGEFW_CMD_SET_INTRQ_DMA,
1257 						&cmd);
1258 		}
1259 	}
1260 
1261 	status |= mxge_send_cmd(sc,
1262 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1263 
1264 
1265 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1266 
1267 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1268 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1269 
1270 
1271 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1272 				&cmd);
1273 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1274 	if (status != 0) {
1275 		device_printf(sc->dev, "failed set interrupt parameters\n");
1276 		return status;
1277 	}
1278 
1279 
1280 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1281 
1282 
1283 	/* run a DMA benchmark */
1284 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1285 
1286 	for (slice = 0; slice < sc->num_slices; slice++) {
1287 		ss = &sc->ss[slice];
1288 
1289 		ss->irq_claim = irq_claim + (2 * slice);
1290 		/* reset mcp/driver shared state back to 0 */
1291 		ss->rx_done.idx = 0;
1292 		ss->rx_done.cnt = 0;
1293 		ss->tx.req = 0;
1294 		ss->tx.done = 0;
1295 		ss->tx.pkt_done = 0;
1296 		ss->tx.queue_active = 0;
1297 		ss->tx.activate = 0;
1298 		ss->tx.deactivate = 0;
1299 		ss->tx.wake = 0;
1300 		ss->tx.defrag = 0;
1301 		ss->tx.stall = 0;
1302 		ss->rx_big.cnt = 0;
1303 		ss->rx_small.cnt = 0;
1304 		ss->lc.lro_bad_csum = 0;
1305 		ss->lc.lro_queued = 0;
1306 		ss->lc.lro_flushed = 0;
1307 		if (ss->fw_stats != NULL) {
1308 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1309 		}
1310 	}
1311 	sc->rdma_tags_available = 15;
1312 	status = mxge_update_mac_address(sc);
1313 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1314 	mxge_change_pause(sc, sc->pause);
1315 	mxge_set_multicast_list(sc);
1316 	if (sc->throttle) {
1317 		cmd.data0 = sc->throttle;
1318 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1319 				  &cmd)) {
1320 			device_printf(sc->dev,
1321 				      "can't enable throttle\n");
1322 		}
1323 	}
1324 	return status;
1325 }
1326 
1327 static int
1328 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1329 {
1330 	mxge_cmd_t cmd;
1331 	mxge_softc_t *sc;
1332 	int err;
1333 	unsigned int throttle;
1334 
1335 	sc = arg1;
1336 	throttle = sc->throttle;
1337 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1338 	if (err != 0) {
1339 		return err;
1340 	}
1341 
1342 	if (throttle == sc->throttle)
1343 		return 0;
1344 
1345 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1346 		return EINVAL;
1347 
1348 	mtx_lock(&sc->driver_mtx);
1349 	cmd.data0 = throttle;
1350 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1351 	if (err == 0)
1352 		sc->throttle = throttle;
1353 	mtx_unlock(&sc->driver_mtx);
1354 	return err;
1355 }
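
/*
 * The handler above backs the read/write "throttle" sysctl registered
 * in mxge_add_sysctls() below, so the transmit throttle factor can be
 * tuned at runtime; values outside MXGE_MIN_THROTTLE..MXGE_MAX_THROTTLE
 * are rejected with EINVAL.  Assuming unit 0, for example:
 *
 *	# sysctl dev.mxge.0.throttle=4096
 */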
1356 
1357 static int
1358 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1359 {
1360 	mxge_softc_t *sc;
1361 	unsigned int intr_coal_delay;
1362 	int err;
1363 
1364 	sc = arg1;
1365 	intr_coal_delay = sc->intr_coal_delay;
1366 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1367 	if (err != 0) {
1368 		return err;
1369 	}
1370 	if (intr_coal_delay == sc->intr_coal_delay)
1371 		return 0;
1372 
1373 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1374 		return EINVAL;
1375 
1376 	mtx_lock(&sc->driver_mtx);
1377 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1378 	sc->intr_coal_delay = intr_coal_delay;
1379 
1380 	mtx_unlock(&sc->driver_mtx);
1381 	return err;
1382 }
1383 
1384 static int
1385 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1386 {
1387 	mxge_softc_t *sc;
1388 	unsigned int enabled;
1389 	int err;
1390 
1391 	sc = arg1;
1392 	enabled = sc->pause;
1393 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1394 	if (err != 0) {
1395 		return err;
1396 	}
1397 	if (enabled == sc->pause)
1398 		return 0;
1399 
1400 	mtx_lock(&sc->driver_mtx);
1401 	err = mxge_change_pause(sc, enabled);
1402 	mtx_unlock(&sc->driver_mtx);
1403 	return err;
1404 }
1405 
1406 static int
1407 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1408 {
1409 	int err;
1410 
1411 	if (arg1 == NULL)
1412 		return EFAULT;
1413 	arg2 = be32toh(*(int *)arg1);
1414 	arg1 = NULL;
1415 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1416 
1417 	return err;
1418 }
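
/*
 * Example: the firmware exports its stats block in network byte order,
 * so a counter stored as the big-endian bytes 00 00 00 2a is reported
 * through this handler as the host integer 42.
 */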
1419 
1420 static void
1421 mxge_rem_sysctls(mxge_softc_t *sc)
1422 {
1423 	struct mxge_slice_state *ss;
1424 	int slice;
1425 
1426 	if (sc->slice_sysctl_tree == NULL)
1427 		return;
1428 
1429 	for (slice = 0; slice < sc->num_slices; slice++) {
1430 		ss = &sc->ss[slice];
1431 		if (ss == NULL || ss->sysctl_tree == NULL)
1432 			continue;
1433 		sysctl_ctx_free(&ss->sysctl_ctx);
1434 		ss->sysctl_tree = NULL;
1435 	}
1436 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1437 	sc->slice_sysctl_tree = NULL;
1438 }
1439 
1440 static void
1441 mxge_add_sysctls(mxge_softc_t *sc)
1442 {
1443 	struct sysctl_ctx_list *ctx;
1444 	struct sysctl_oid_list *children;
1445 	mcp_irq_data_t *fw;
1446 	struct mxge_slice_state *ss;
1447 	int slice;
1448 	char slice_num[8];
1449 
1450 	ctx = device_get_sysctl_ctx(sc->dev);
1451 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1452 	fw = sc->ss[0].fw_stats;
1453 
1454 	/* random information */
1455 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1456 		       "firmware_version",
1457 		       CTLFLAG_RD, sc->fw_version,
1458 		       0, "firmware version");
1459 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1460 		       "serial_number",
1461 		       CTLFLAG_RD, sc->serial_number_string,
1462 		       0, "serial number");
1463 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1464 		       "product_code",
1465 		       CTLFLAG_RD, sc->product_code_string,
1466 		       0, "product_code");
1467 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1468 		       "pcie_link_width",
1469 		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
1471 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1472 		       "tx_boundary",
1473 		       CTLFLAG_RD, &sc->tx_boundary,
1474 		       0, "tx_boundary");
1475 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1476 		       "write_combine",
1477 		       CTLFLAG_RD, &sc->wc,
1478 		       0, "write combining PIO?");
1479 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1480 		       "read_dma_MBs",
1481 		       CTLFLAG_RD, &sc->read_dma,
1482 		       0, "DMA Read speed in MB/s");
1483 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1484 		       "write_dma_MBs",
1485 		       CTLFLAG_RD, &sc->write_dma,
1486 		       0, "DMA Write speed in MB/s");
1487 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1488 		       "read_write_dma_MBs",
1489 		       CTLFLAG_RD, &sc->read_write_dma,
1490 		       0, "DMA concurrent Read/Write speed in MB/s");
1491 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1492 		       "watchdog_resets",
1493 		       CTLFLAG_RD, &sc->watchdog_resets,
1494 		       0, "Number of times NIC was reset");
1495 
1496 
1497 	/* performance related tunables */
1498 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1499 			"intr_coal_delay",
1500 			CTLTYPE_INT|CTLFLAG_RW, sc,
1501 			0, mxge_change_intr_coal,
1502 			"I", "interrupt coalescing delay in usecs");
1503 
1504 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1505 			"throttle",
1506 			CTLTYPE_INT|CTLFLAG_RW, sc,
1507 			0, mxge_change_throttle,
1508 			"I", "transmit throttling");
1509 
1510 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1511 			"flow_control_enabled",
1512 			CTLTYPE_INT|CTLFLAG_RW, sc,
1513 			0, mxge_change_flow_control,
			"I", "enable/disable flow control");
1515 
1516 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1517 		       "deassert_wait",
1518 		       CTLFLAG_RW, &mxge_deassert_wait,
1519 		       0, "Wait for IRQ line to go low in ihandler");
1520 
1521 	/* stats block from firmware is in network byte order.
1522 	   Need to swap it */
1523 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1524 			"link_up",
1525 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1526 			0, mxge_handle_be32,
1527 			"I", "link up");
1528 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1529 			"rdma_tags_available",
1530 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1531 			0, mxge_handle_be32,
1532 			"I", "rdma_tags_available");
1533 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1534 			"dropped_bad_crc32",
1535 			CTLTYPE_INT|CTLFLAG_RD,
1536 			&fw->dropped_bad_crc32,
1537 			0, mxge_handle_be32,
1538 			"I", "dropped_bad_crc32");
1539 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1540 			"dropped_bad_phy",
1541 			CTLTYPE_INT|CTLFLAG_RD,
1542 			&fw->dropped_bad_phy,
1543 			0, mxge_handle_be32,
1544 			"I", "dropped_bad_phy");
1545 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 			"dropped_link_error_or_filtered",
1547 			CTLTYPE_INT|CTLFLAG_RD,
1548 			&fw->dropped_link_error_or_filtered,
1549 			0, mxge_handle_be32,
1550 			"I", "dropped_link_error_or_filtered");
1551 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1552 			"dropped_link_overflow",
1553 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1554 			0, mxge_handle_be32,
1555 			"I", "dropped_link_overflow");
1556 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557 			"dropped_multicast_filtered",
1558 			CTLTYPE_INT|CTLFLAG_RD,
1559 			&fw->dropped_multicast_filtered,
1560 			0, mxge_handle_be32,
1561 			"I", "dropped_multicast_filtered");
1562 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1563 			"dropped_no_big_buffer",
1564 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1565 			0, mxge_handle_be32,
1566 			"I", "dropped_no_big_buffer");
1567 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1568 			"dropped_no_small_buffer",
1569 			CTLTYPE_INT|CTLFLAG_RD,
1570 			&fw->dropped_no_small_buffer,
1571 			0, mxge_handle_be32,
1572 			"I", "dropped_no_small_buffer");
1573 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1574 			"dropped_overrun",
1575 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1576 			0, mxge_handle_be32,
1577 			"I", "dropped_overrun");
1578 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579 			"dropped_pause",
1580 			CTLTYPE_INT|CTLFLAG_RD,
1581 			&fw->dropped_pause,
1582 			0, mxge_handle_be32,
1583 			"I", "dropped_pause");
1584 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1585 			"dropped_runt",
1586 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1587 			0, mxge_handle_be32,
1588 			"I", "dropped_runt");
1589 
1590 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1591 			"dropped_unicast_filtered",
1592 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1593 			0, mxge_handle_be32,
1594 			"I", "dropped_unicast_filtered");
1595 
1596 	/* verbose printing? */
1597 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1598 		       "verbose",
1599 		       CTLFLAG_RW, &mxge_verbose,
1600 		       0, "verbose printing");
1601 
1602 	/* add counters exported for debugging from all slices */
1603 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1604 	sc->slice_sysctl_tree =
1605 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1606 				"slice", CTLFLAG_RD, 0, "");
1607 
1608 	for (slice = 0; slice < sc->num_slices; slice++) {
1609 		ss = &sc->ss[slice];
1610 		sysctl_ctx_init(&ss->sysctl_ctx);
1611 		ctx = &ss->sysctl_ctx;
1612 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1613 		sprintf(slice_num, "%d", slice);
1614 		ss->sysctl_tree =
1615 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1616 					CTLFLAG_RD, 0, "");
1617 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1618 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1619 			       "rx_small_cnt",
1620 			       CTLFLAG_RD, &ss->rx_small.cnt,
1621 			       0, "rx_small_cnt");
1622 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1623 			       "rx_big_cnt",
1624 			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
1626 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1627 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1628 			       0, "number of lro merge queues flushed");
1629 
1630 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1631 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1632 			       0, "number of bad csums preventing LRO");
1633 
1634 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1635 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");
1638 
1639 #ifndef IFNET_BUF_RING
1640 		/* only transmit from slice 0 for now */
1641 		if (slice > 0)
1642 			continue;
1643 #endif
1644 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1645 			       "tx_req",
1646 			       CTLFLAG_RD, &ss->tx.req,
1647 			       0, "tx_req");
1648 
1649 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1650 			       "tx_done",
1651 			       CTLFLAG_RD, &ss->tx.done,
1652 			       0, "tx_done");
1653 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1654 			       "tx_pkt_done",
1655 			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
1657 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1658 			       "tx_stall",
1659 			       CTLFLAG_RD, &ss->tx.stall,
1660 			       0, "tx_stall");
1661 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1662 			       "tx_wake",
1663 			       CTLFLAG_RD, &ss->tx.wake,
1664 			       0, "tx_wake");
1665 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1666 			       "tx_defrag",
1667 			       CTLFLAG_RD, &ss->tx.defrag,
1668 			       0, "tx_defrag");
1669 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670 			       "tx_queue_active",
1671 			       CTLFLAG_RD, &ss->tx.queue_active,
1672 			       0, "tx_queue_active");
1673 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1674 			       "tx_activate",
1675 			       CTLFLAG_RD, &ss->tx.activate,
1676 			       0, "tx_activate");
1677 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1678 			       "tx_deactivate",
1679 			       CTLFLAG_RD, &ss->tx.deactivate,
1680 			       0, "tx_deactivate");
1681 	}
1682 }
1683 
1684 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1685    backwards one at a time and handle ring wraps */
1686 
1687 static inline void
1688 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1689 			    mcp_kreq_ether_send_t *src, int cnt)
1690 {
1691 	int idx, starting_slot;
1692 	starting_slot = tx->req;
1693 	while (cnt > 1) {
1694 		cnt--;
1695 		idx = (starting_slot + cnt) & tx->mask;
1696 		mxge_pio_copy(&tx->lanai[idx],
1697 			      &src[cnt], sizeof(*src));
1698 		wmb();
1699 	}
1700 }
1701 
1702 /*
1703  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1704  * at most 32 bytes at a time, so as to avoid involving the software
1705  * pio handler in the nic.   We re-write the first segment's flags
1706  * to mark them valid only after writing the entire chain
1707  */
1708 
1709 static inline void
1710 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1711 		  int cnt)
1712 {
1713 	int idx, i;
1714 	uint32_t *src_ints;
1715 	volatile uint32_t *dst_ints;
1716 	mcp_kreq_ether_send_t *srcp;
1717 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1718 	uint8_t last_flags;
1719 
1720 	idx = tx->req & tx->mask;
1721 
1722 	last_flags = src->flags;
1723 	src->flags = 0;
1724 	wmb();
1725 	dst = dstp = &tx->lanai[idx];
1726 	srcp = src;
1727 
1728 	if ((idx + cnt) < tx->mask) {
1729 		for (i = 0; i < (cnt - 1); i += 2) {
1730 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1731 			wmb(); /* force write every 32 bytes */
1732 			srcp += 2;
1733 			dstp += 2;
1734 		}
1735 	} else {
1736 		/* submit all but the first request, and ensure
1737 		   that it is submitted below */
1738 		mxge_submit_req_backwards(tx, src, cnt);
1739 		i = 0;
1740 	}
1741 	if (i < cnt) {
1742 		/* submit the first request */
1743 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1744 		wmb(); /* barrier before setting valid flag */
1745 	}
1746 
1747 	/* re-write the last 32-bits with the valid flags */
1748 	src->flags = last_flags;
1749 	src_ints = (uint32_t *)src;
1750 	src_ints+=3;
1751 	dst_ints = (volatile uint32_t *)dst;
1752 	dst_ints+=3;
1753 	*dst_ints =  *src_ints;
1754 	tx->req += cnt;
1755 	wmb();
1756 }
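
/*
 * Illustrative example of the wrap case above: with a 256-entry ring
 * (tx->mask == 255), req == 254 and cnt == 4, the requests land in
 * slots 254, 255, 0 and 1.  mxge_submit_req_backwards() writes slots
 * 1, 0 and 255 (in that order), then the code above writes slot 254
 * last; only when its valid flags are rewritten does the NIC see a
 * complete chain.
 */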
1757 
1758 static int
1759 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1760     struct mxge_pkt_info *pi)
1761 {
1762 	struct ether_vlan_header *eh;
1763 	uint16_t etype;
1764 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1765 #if IFCAP_TSO6 && defined(INET6)
1766 	int nxt;
1767 #endif
1768 
1769 	eh = mtod(m, struct ether_vlan_header *);
1770 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1771 		etype = ntohs(eh->evl_proto);
1772 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1773 	} else {
1774 		etype = ntohs(eh->evl_encap_proto);
1775 		pi->ip_off = ETHER_HDR_LEN;
1776 	}
1777 
1778 	switch (etype) {
1779 	case ETHERTYPE_IP:
1780 		/*
1781 		 * ensure ip header is in first mbuf, copy it to a
1782 		 * scratch buffer if not
1783 		 */
1784 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1785 		pi->ip6 = NULL;
1786 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1787 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1788 			    ss->scratch);
1789 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1790 		}
1791 		pi->ip_hlen = pi->ip->ip_hl << 2;
1792 		if (!tso)
1793 			return 0;
1794 
1795 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1796 		    sizeof(struct tcphdr))) {
1797 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1798 			    sizeof(struct tcphdr), ss->scratch);
1799 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1800 		}
1801 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1802 		break;
1803 #if IFCAP_TSO6 && defined(INET6)
1804 	case ETHERTYPE_IPV6:
1805 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1806 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1807 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1808 			    ss->scratch);
1809 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1810 		}
1811 		nxt = 0;
1812 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1813 		pi->ip_hlen -= pi->ip_off;
1814 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1815 			return EINVAL;
1816 
1817 		if (!tso)
1818 			return 0;
1819 
1820 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1821 			return EINVAL;
1822 
1823 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1824 		    sizeof(struct tcphdr))) {
1825 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1826 			    sizeof(struct tcphdr), ss->scratch);
1827 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1828 		}
1829 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1830 		break;
1831 #endif
1832 	default:
1833 		return EINVAL;
1834 	}
1835 	return 0;
1836 }
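
/*
 * Example (assuming a 20-byte IPv4 header): for a plain TCP/IPv4 frame
 * the parser above sets ip_off = ETHER_HDR_LEN (14) and ip_hlen = 20,
 * so the TCP header -- and the checksum field the firmware must patch
 * for TSO -- starts 34 bytes into the frame.  With an 802.1Q tag,
 * ip_off grows by ETHER_VLAN_ENCAP_LEN to 18.
 */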
1837 
1838 #if IFCAP_TSO4
1839 
1840 static void
1841 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1842 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1843 {
1844 	mxge_tx_ring_t *tx;
1845 	mcp_kreq_ether_send_t *req;
1846 	bus_dma_segment_t *seg;
1847 	uint32_t low, high_swapped;
1848 	int len, seglen, cum_len, cum_len_next;
1849 	int next_is_first, chop, cnt, rdma_count, small;
1850 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1851 	uint8_t flags, flags_next;
1852 	static int once;
1853 
1854 	mss = m->m_pkthdr.tso_segsz;
1855 
1856 	/* negative cum_len signifies to the
1857 	 * send loop that we are still in the
1858 	 * header portion of the TSO packet.
1859 	 */
1860 
1861 	cksum_offset = pi->ip_off + pi->ip_hlen;
1862 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
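	/* e.g. with a 14-byte Ethernet, 20-byte IPv4 and 20-byte TCP
	 * header, cum_len starts at -54 and crosses zero exactly when
	 * the last header byte has been consumed, flipping the loop
	 * below from its header branch to its payload branch */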
1863 
1864 	/* TSO implies checksum offload on this hardware */
1865 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1866 		/*
1867 		 * If packet has full TCP csum, replace it with pseudo hdr
1868 		 * sum that the NIC expects, otherwise the NIC will emit
1869 		 * packets with bad TCP checksums.
1870 		 */
1871 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1872 		if (pi->ip6) {
1873 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1874 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1875 			sum = in6_cksum_pseudo(pi->ip6,
1876 			    m->m_pkthdr.len - cksum_offset,
1877 			    IPPROTO_TCP, 0);
1878 #endif
1879 		} else {
1880 #ifdef INET
1881 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1882 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1883 			    pi->ip->ip_dst.s_addr,
1884 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1885 				    cksum_offset)));
1886 #endif
1887 		}
1888 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1889 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1890 	}
1891 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1892 
1893 
1894 	/* for TSO, pseudo_hdr_offset holds mss.
1895 	 * The firmware figures out where to put
1896 	 * the checksum by parsing the header. */
1897 	pseudo_hdr_offset = htobe16(mss);
1898 
1899 	if (pi->ip6) {
1900 		/*
1901 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1902 		 * to store the TCP header len
1903 		 */
1904 		cksum_offset = (pi->tcp->th_off << 2);
1905 	}
1906 
1907 	tx = &ss->tx;
1908 	req = tx->req_list;
1909 	seg = tx->seg_list;
1910 	cnt = 0;
1911 	rdma_count = 0;
1912 	/* "rdma_count" is the number of RDMAs belonging to the
1913 	 * current packet BEFORE the current send request. For
1914 	 * non-TSO packets, this is equal to "count".
1915 	 * For TSO packets, rdma_count needs to be reset
1916 	 * to 0 after a segment cut.
1917 	 *
1918 	 * The rdma_count field of the send request is
1919 	 * the number of RDMAs of the packet starting at
1920 	 * that request. For TSO send requests with one or more cuts
1921 	 * in the middle, this is the number of RDMAs starting
1922 	 * after the last cut in the request. All previous
1923 	 * segments before the last cut implicitly have 1 RDMA.
1924 	 *
1925 	 * Since the number of RDMAs is not known beforehand,
1926 	 * it must be filled-in retroactively - after each
1927 	 * segmentation cut or at the end of the entire packet.
1928 	 */
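	/*
	 * Illustrative walk-through (hypothetical shape): picture a
	 * payload spanning requests 1..3 with a single mss cut after
	 * request 2.  Each iteration below speculatively back-fills
	 * (req - rdma_count)->rdma_count for the request that began the
	 * current run; the cut resets rdma_count, so requests before the
	 * cut keep their implicit count of 1 and the run that starts
	 * after the cut is patched one final time by the
	 * (req - rdma_count)->rdma_count store just past the loop.
	 */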
1929 
1930 	while (busdma_seg_cnt) {
1931 		/* Break the busdma segment up into pieces*/
1932 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1933 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1934 		len = seg->ds_len;
1935 
1936 		while (len) {
1937 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1938 			seglen = len;
1939 			cum_len_next = cum_len + seglen;
1940 			(req-rdma_count)->rdma_count = rdma_count + 1;
1941 			if (__predict_true(cum_len >= 0)) {
1942 				/* payload */
1943 				chop = (cum_len_next > mss);
1944 				cum_len_next = cum_len_next % mss;
1945 				next_is_first = (cum_len_next == 0);
1946 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1947 				flags_next |= next_is_first *
1948 					MXGEFW_FLAGS_FIRST;
1949 				rdma_count |= -(chop | next_is_first);
1950 				rdma_count += chop & !next_is_first;
1951 			} else if (cum_len_next >= 0) {
1952 				/* header ends */
1953 				rdma_count = -1;
1954 				cum_len_next = 0;
1955 				seglen = -cum_len;
1956 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1957 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1958 					MXGEFW_FLAGS_FIRST |
1959 					(small * MXGEFW_FLAGS_SMALL);
1960 			}
1961 
1962 			req->addr_high = high_swapped;
1963 			req->addr_low = htobe32(low);
1964 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1965 			req->pad = 0;
1966 			req->rdma_count = 1;
1967 			req->length = htobe16(seglen);
1968 			req->cksum_offset = cksum_offset;
1969 			req->flags = flags | ((cum_len & 1) *
1970 					      MXGEFW_FLAGS_ALIGN_ODD);
1971 			low += seglen;
1972 			len -= seglen;
1973 			cum_len = cum_len_next;
1974 			flags = flags_next;
1975 			req++;
1976 			cnt++;
1977 			rdma_count++;
1978 			if (cksum_offset != 0 && !pi->ip6) {
1979 				if (__predict_false(cksum_offset > seglen))
1980 					cksum_offset -= seglen;
1981 				else
1982 					cksum_offset = 0;
1983 			}
1984 			if (__predict_false(cnt > tx->max_desc))
1985 				goto drop;
1986 		}
1987 		busdma_seg_cnt--;
1988 		seg++;
1989 	}
1990 	(req-rdma_count)->rdma_count = rdma_count;
1991 
1992 	do {
1993 		req--;
1994 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1995 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1996 
1997 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1998 	mxge_submit_req(tx, tx->req_list, cnt);
1999 #ifdef IFNET_BUF_RING
2000 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2001 		/* tell the NIC to start polling this slice */
2002 		*tx->send_go = 1;
2003 		tx->queue_active = 1;
2004 		tx->activate++;
2005 		wmb();
2006 	}
2007 #endif
2008 	return;
2009 
2010 drop:
2011 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2012 	m_freem(m);
2013 	ss->oerrors++;
2014 	if (!once) {
2015 		printf("tx->max_desc exceeded via TSO!\n");
2016 		printf("mss = %d, %ld, %d!\n", mss,
2017 		       (long)seg - (long)tx->seg_list, tx->max_desc);
2018 		once = 1;
2019 	}
2020 	return;
2021 
2022 }
2023 
2024 #endif /* IFCAP_TSO4 */
2025 
2026 #ifdef MXGE_NEW_VLAN_API
2027 /*
2028  * We reproduce the software vlan tag insertion from
2029  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2030  * vlan tag insertion. We need to advertise this in order to have the
2031  * vlan interface respect our csum offload flags.
2032  */
2033 static struct mbuf *
2034 mxge_vlan_tag_insert(struct mbuf *m)
2035 {
2036 	struct ether_vlan_header *evl;
2037 
2038 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2039 	if (__predict_false(m == NULL))
2040 		return NULL;
2041 	if (m->m_len < sizeof(*evl)) {
2042 		m = m_pullup(m, sizeof(*evl));
2043 		if (__predict_false(m == NULL))
2044 			return NULL;
2045 	}
2046 	/*
2047 	 * Transform the Ethernet header into an Ethernet header
2048 	 * with 802.1Q encapsulation.
2049 	 */
2050 	evl = mtod(m, struct ether_vlan_header *);
2051 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2052 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2053 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2054 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2055 	m->m_flags &= ~M_VLANTAG;
2056 	return m;
2057 }
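
/*
 * Frame-layout sketch of the transform above (illustration only):
 *
 *   before M_PREPEND:      | dst(6) | src(6) | type(2) | payload
 *   after prepend + bcopy: | dst(6) | src(6) | 0x8100(2) | tag(2) | type(2) | payload
 *
 * M_PREPEND opens 4 bytes of room at the front, the bcopy slides the
 * two MAC addresses down into it, and the original ethertype stays in
 * place as the encapsulated protocol field.
 */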
2058 #endif /* MXGE_NEW_VLAN_API */
2059 
2060 static void
2061 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2062 {
2063 	struct mxge_pkt_info pi = {0,0,0,0};
2064 	mxge_softc_t *sc;
2065 	mcp_kreq_ether_send_t *req;
2066 	bus_dma_segment_t *seg;
2067 	struct mbuf *m_tmp;
2068 	struct ifnet *ifp;
2069 	mxge_tx_ring_t *tx;
2070 	int cnt, cum_len, err, i, idx, odd_flag;
2071 	uint16_t pseudo_hdr_offset;
2072 	uint8_t flags, cksum_offset;
2073 
2074 
2075 	sc = ss->sc;
2076 	ifp = sc->ifp;
2077 	tx = &ss->tx;
2078 
2079 #ifdef MXGE_NEW_VLAN_API
2080 	if (m->m_flags & M_VLANTAG) {
2081 		m = mxge_vlan_tag_insert(m);
2082 		if (__predict_false(m == NULL))
2083 			goto drop_without_m;
2084 	}
2085 #endif
2086 	if (m->m_pkthdr.csum_flags &
2087 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2088 		if (mxge_parse_tx(ss, m, &pi))
2089 			goto drop;
2090 	}
2091 
2092 	/* (try to) map the frame for DMA */
2093 	idx = tx->req & tx->mask;
2094 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2095 				      m, tx->seg_list, &cnt,
2096 				      BUS_DMA_NOWAIT);
2097 	if (__predict_false(err == EFBIG)) {
2098 		/* Too many segments in the chain.  Try
2099 		   to defrag */
2100 		m_tmp = m_defrag(m, M_NOWAIT);
2101 		if (m_tmp == NULL) {
2102 			goto drop;
2103 		}
2104 		ss->tx.defrag++;
2105 		m = m_tmp;
2106 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2107 					      tx->info[idx].map,
2108 					      m, tx->seg_list, &cnt,
2109 					      BUS_DMA_NOWAIT);
2110 	}
2111 	if (__predict_false(err != 0)) {
2112 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2113 			      " packet len = %d\n", err, m->m_pkthdr.len);
2114 		goto drop;
2115 	}
2116 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2117 			BUS_DMASYNC_PREWRITE);
2118 	tx->info[idx].m = m;
2119 
2120 #if IFCAP_TSO4
2121 	/* TSO is different enough that we handle it in another routine */
2122 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2123 		mxge_encap_tso(ss, m, cnt, &pi);
2124 		return;
2125 	}
2126 #endif
2127 
2128 	req = tx->req_list;
2129 	cksum_offset = 0;
2130 	pseudo_hdr_offset = 0;
2131 	flags = MXGEFW_FLAGS_NO_TSO;
2132 
2133 	/* checksum offloading? */
2134 	if (m->m_pkthdr.csum_flags &
2135 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2136 		/* ensure ip header is in first mbuf, copy
2137 		   it to a scratch buffer if not */
2138 		cksum_offset = pi.ip_off + pi.ip_hlen;
2139 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2140 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2141 		req->cksum_offset = cksum_offset;
2142 		flags |= MXGEFW_FLAGS_CKSUM;
2143 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2144 	} else {
2145 		odd_flag = 0;
2146 	}
2147 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2148 		flags |= MXGEFW_FLAGS_SMALL;
2149 
2150 	/* convert segments into a request list */
2151 	cum_len = 0;
2152 	seg = tx->seg_list;
2153 	req->flags = MXGEFW_FLAGS_FIRST;
2154 	for (i = 0; i < cnt; i++) {
2155 		req->addr_low =
2156 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2157 		req->addr_high =
2158 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2159 		req->length = htobe16(seg->ds_len);
2160 		req->cksum_offset = cksum_offset;
2161 		if (cksum_offset > seg->ds_len)
2162 			cksum_offset -= seg->ds_len;
2163 		else
2164 			cksum_offset = 0;
2165 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2166 		req->pad = 0; /* complete solid 16-byte block */
2167 		req->rdma_count = 1;
2168 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2169 		cum_len += seg->ds_len;
2170 		seg++;
2171 		req++;
2172 		req->flags = 0;
2173 	}
2174 	req--;
2175 	/* pad runts to 60 bytes */
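	/* (60 is ETHER_MIN_LEN (64) less the 4-byte FCS the NIC appends;
	 * the pad bytes are DMAed from what is presumably a pre-zeroed
	 * block, as the zeropad_dma name suggests) */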
2176 	if (cum_len < 60) {
2177 		req++;
2178 		req->addr_low =
2179 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2180 		req->addr_high =
2181 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2182 		req->length = htobe16(60 - cum_len);
2183 		req->cksum_offset = 0;
2184 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2185 		req->pad = 0; /* complete solid 16-byte block */
2186 		req->rdma_count = 1;
2187 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2188 		cnt++;
2189 	}
2190 
2191 	tx->req_list[0].rdma_count = cnt;
2192 #if 0
2193 	/* print what the firmware will see */
2194 	for (i = 0; i < cnt; i++) {
2195 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2196 		    "cso:%d, flags:0x%x, rdma:%d\n",
2197 		    i, (int)ntohl(tx->req_list[i].addr_high),
2198 		    (int)ntohl(tx->req_list[i].addr_low),
2199 		    (int)ntohs(tx->req_list[i].length),
2200 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2201 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2202 		    tx->req_list[i].rdma_count);
2203 	}
2204 	printf("--------------\n");
2205 #endif
2206 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2207 	mxge_submit_req(tx, tx->req_list, cnt);
2208 #ifdef IFNET_BUF_RING
2209 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2210 		/* tell the NIC to start polling this slice */
2211 		*tx->send_go = 1;
2212 		tx->queue_active = 1;
2213 		tx->activate++;
2214 		wmb();
2215 	}
2216 #endif
2217 	return;
2218 
2219 drop:
2220 	m_freem(m);
2221 drop_without_m:
2222 	ss->oerrors++;
2223 	return;
2224 }
2225 
2226 #ifdef IFNET_BUF_RING
2227 static void
2228 mxge_qflush(struct ifnet *ifp)
2229 {
2230 	mxge_softc_t *sc = ifp->if_softc;
2231 	mxge_tx_ring_t *tx;
2232 	struct mbuf *m;
2233 	int slice;
2234 
2235 	for (slice = 0; slice < sc->num_slices; slice++) {
2236 		tx = &sc->ss[slice].tx;
2237 		mtx_lock(&tx->mtx);
2238 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2239 			m_freem(m);
2240 		mtx_unlock(&tx->mtx);
2241 	}
2242 	if_qflush(ifp);
2243 }
2244 
2245 static inline void
2246 mxge_start_locked(struct mxge_slice_state *ss)
2247 {
2248 	mxge_softc_t *sc;
2249 	struct mbuf *m;
2250 	struct ifnet *ifp;
2251 	mxge_tx_ring_t *tx;
2252 
2253 	sc = ss->sc;
2254 	ifp = sc->ifp;
2255 	tx = &ss->tx;
2256 
2257 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2258 		m = drbr_dequeue(ifp, tx->br);
2259 		if (m == NULL) {
2260 			return;
2261 		}
2262 		/* let BPF see it */
2263 		BPF_MTAP(ifp, m);
2264 
2265 		/* give it to the nic */
2266 		mxge_encap(ss, m);
2267 	}
2268 	/* ran out of transmit slots */
2269 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2270 	    && (!drbr_empty(ifp, tx->br))) {
2271 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2272 		tx->stall++;
2273 	}
2274 }
2275 
2276 static int
2277 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2278 {
2279 	mxge_softc_t *sc;
2280 	struct ifnet *ifp;
2281 	mxge_tx_ring_t *tx;
2282 	int err;
2283 
2284 	sc = ss->sc;
2285 	ifp = sc->ifp;
2286 	tx = &ss->tx;
2287 
2288 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2289 	    IFF_DRV_RUNNING) {
2290 		err = drbr_enqueue(ifp, tx->br, m);
2291 		return (err);
2292 	}
2293 
2294 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2295 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2296 		/* let BPF see it */
2297 		BPF_MTAP(ifp, m);
2298 		/* give it to the nic */
2299 		mxge_encap(ss, m);
2300 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2301 		return (err);
2302 	}
2303 	if (!drbr_empty(ifp, tx->br))
2304 		mxge_start_locked(ss);
2305 	return (0);
2306 }
2307 
2308 static int
2309 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2310 {
2311 	mxge_softc_t *sc = ifp->if_softc;
2312 	struct mxge_slice_state *ss;
2313 	mxge_tx_ring_t *tx;
2314 	int err = 0;
2315 	int slice;
2316 
2317 	slice = m->m_pkthdr.flowid;
2318 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2319 
2320 	ss = &sc->ss[slice];
2321 	tx = &ss->tx;
2322 
2323 	if (mtx_trylock(&tx->mtx)) {
2324 		err = mxge_transmit_locked(ss, m);
2325 		mtx_unlock(&tx->mtx);
2326 	} else {
2327 		err = drbr_enqueue(ifp, tx->br, m);
2328 	}
2329 
2330 	return (err);
2331 }
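
/*
 * Slice-selection sketch (illustration only): because num_slices is a
 * power of two, the mask above is a cheap modulo, e.g. with 4 slices a
 * flowid of 0x1f selects slice 0x1f & 3 == 3.  The mtx_trylock() keeps
 * the transmit path non-blocking: on contention the mbuf is simply
 * parked in the buf_ring for whoever holds the lock to drain.
 */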
2332 
2333 #else
2334 
2335 static inline void
2336 mxge_start_locked(struct mxge_slice_state *ss)
2337 {
2338 	mxge_softc_t *sc;
2339 	struct mbuf *m;
2340 	struct ifnet *ifp;
2341 	mxge_tx_ring_t *tx;
2342 
2343 	sc = ss->sc;
2344 	ifp = sc->ifp;
2345 	tx = &ss->tx;
2346 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2347 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2348 		if (m == NULL) {
2349 			return;
2350 		}
2351 		/* let BPF see it */
2352 		BPF_MTAP(ifp, m);
2353 
2354 		/* give it to the nic */
2355 		mxge_encap(ss, m);
2356 	}
2357 	/* ran out of transmit slots */
2358 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2359 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2360 		tx->stall++;
2361 	}
2362 }
2363 #endif
2364 static void
2365 mxge_start(struct ifnet *ifp)
2366 {
2367 	mxge_softc_t *sc = ifp->if_softc;
2368 	struct mxge_slice_state *ss;
2369 
2370 	/* only use the first slice for now */
2371 	ss = &sc->ss[0];
2372 	mtx_lock(&ss->tx.mtx);
2373 	mxge_start_locked(ss);
2374 	mtx_unlock(&ss->tx.mtx);
2375 }
2376 
2377 /*
2378  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2379  * at most 32 bytes at a time, so as to avoid involving the software
2380  * pio handler in the nic.  We re-write the first segment's low
2381  * DMA address to mark it valid only after we write the entire chunk
2382  * in a burst
2383  */
2384 static inline void
2385 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2386 		mcp_kreq_ether_recv_t *src)
2387 {
2388 	uint32_t low;
2389 
2390 	low = src->addr_low;
2391 	src->addr_low = 0xffffffff;
2392 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2393 	wmb();
2394 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2395 	wmb();
2396 	src->addr_low = low;
2397 	dst->addr_low = low;
2398 	wmb();
2399 }
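
/*
 * Descriptive note on the trick above (no new logic): addr_low of the
 * first entry is parked at 0xffffffff while the two 4-entry bursts are
 * copied, so the firmware, which watches that word, ignores the block
 * until the real low address lands as the very last store.  The shadow
 * copy's addr_low is restored as well, keeping the host-side ring
 * correct for any later re-submit.
 */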
2400 
2401 static int
2402 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2403 {
2404 	bus_dma_segment_t seg;
2405 	struct mbuf *m;
2406 	mxge_rx_ring_t *rx = &ss->rx_small;
2407 	int cnt, err;
2408 
2409 	m = m_gethdr(M_NOWAIT, MT_DATA);
2410 	if (m == NULL) {
2411 		rx->alloc_fail++;
2412 		err = ENOBUFS;
2413 		goto done;
2414 	}
2415 	m->m_len = MHLEN;
2416 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2417 				      &seg, &cnt, BUS_DMA_NOWAIT);
2418 	if (err != 0) {
2419 		m_free(m);
2420 		goto done;
2421 	}
2422 	rx->info[idx].m = m;
2423 	rx->shadow[idx].addr_low =
2424 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2425 	rx->shadow[idx].addr_high =
2426 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2427 
2428 done:
2429 	if ((idx & 7) == 7)
2430 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2431 	return err;
2432 }
2433 
2434 static int
2435 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2436 {
2437 	bus_dma_segment_t seg[3];
2438 	struct mbuf *m;
2439 	mxge_rx_ring_t *rx = &ss->rx_big;
2440 	int cnt, err, i;
2441 
2442 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2443 	if (m == NULL) {
2444 		rx->alloc_fail++;
2445 		err = ENOBUFS;
2446 		goto done;
2447 	}
2448 	m->m_len = rx->mlen;
2449 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2450 				      seg, &cnt, BUS_DMA_NOWAIT);
2451 	if (err != 0) {
2452 		m_free(m);
2453 		goto done;
2454 	}
2455 	rx->info[idx].m = m;
2456 	rx->shadow[idx].addr_low =
2457 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2458 	rx->shadow[idx].addr_high =
2459 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2460 
2461 #if MXGE_VIRT_JUMBOS
2462 	for (i = 1; i < cnt; i++) {
2463 		rx->shadow[idx + i].addr_low =
2464 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2465 		rx->shadow[idx + i].addr_high =
2466 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2467 	}
2468 #endif
2469 
2470 done:
2471 	for (i = 0; i < rx->nbufs; i++) {
2472 		if ((idx & 7) == 7) {
2473 			mxge_submit_8rx(&rx->lanai[idx - 7],
2474 					&rx->shadow[idx - 7]);
2475 		}
2476 		idx++;
2477 	}
2478 	return err;
2479 }
2480 
2481 #ifdef INET6
2482 
2483 static uint16_t
2484 mxge_csum_generic(uint16_t *raw, int len)
2485 {
2486 	uint32_t csum;
2487 
2488 
2489 	csum = 0;
2490 	while (len > 0) {
2491 		csum += *raw;
2492 		raw++;
2493 		len -= 2;
2494 	}
2495 	csum = (csum >> 16) + (csum & 0xffff);
2496 	csum = (csum >> 16) + (csum & 0xffff);
2497 	return (uint16_t)csum;
2498 }
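
#if 0
/*
 * Usage sketch for mxge_csum_generic() (illustration only; the buffer
 * and the function below are hypothetical, not part of the driver):
 */
static uint16_t
mxge_csum_example(void)
{
	uint16_t words[3] = { 0xffff, 0x0002, 0x0001 };

	/* the raw 32-bit sum is 0xffff + 0x0002 + 0x0001 = 0x10002; the
	 * first fold gives 0x0001 + 0x0002 = 0x0003 and the second fold
	 * is then a no-op, so 0x0003 is returned */
	return (mxge_csum_generic(words, sizeof(words)));
}
#endif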
2499 
2500 static inline uint16_t
2501 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2502 {
2503 	uint32_t partial;
2504 	int nxt, cksum_offset;
2505 	struct ip6_hdr *ip6 = p;
2506 	uint16_t c;
2507 
2508 	nxt = ip6->ip6_nxt;
2509 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2510 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2511 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2512 					   IPPROTO_IPV6, &nxt);
2513 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2514 			return (1);
2515 	}
2516 
2517 	/*
2518 	 * IPv6 headers do not contain a checksum, and hence
2519 	 * do not checksum to zero, so they don't "fall out"
2520 	 * of the partial checksum calculation like IPv4
2521 	 * headers do.  We need to fix the partial checksum by
2522 	 * subtracting the checksum of the IPv6 header.
2523 	 */
2524 
2525 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2526 				    ETHER_HDR_LEN);
2527 	csum += ~partial;
2528 	csum += (csum < ~partial);
2529 	csum = (csum >> 16) + (csum & 0xFFFF);
2530 	csum = (csum >> 16) + (csum & 0xFFFF);
2531 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2532 			     csum);
2533 	c ^= 0xffff;
2534 	return (c);
2535 }
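
/*
 * Worked example of the ones-complement subtraction above (hypothetical
 * numbers): with csum = 0x0010 and partial = 0x0003, ~partial is
 * 0xfffffffc, so csum + ~partial wraps to 0x0000000c; the
 * (csum < ~partial) test restores the lost carry, giving 0x0000000d,
 * and the two folds leave 0x000d == 0x0010 - 0x0003.  Adding the
 * complement with end-around carry is exactly how ones-complement
 * arithmetic subtracts.
 */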
2536 #endif /* INET6 */
2537 /*
2538  *  Myri10GE hardware checksums are not valid if the sender
2539  *  padded the frame with non-zero padding.  This is because
2540  *  the firmware just does a simple 16-bit 1s complement
2541  *  checksum across the entire frame, excluding the first 14
2542  *  bytes.  It is best to simply check the checksum and
2543  *  tell the stack about it only if the checksum is good.
2544  */
2545 
2546 static inline uint16_t
2547 mxge_rx_csum(struct mbuf *m, int csum)
2548 {
2549 	struct ether_header *eh;
2550 #ifdef INET
2551 	struct ip *ip;
2552 #endif
2553 #if defined(INET) || defined(INET6)
2554 	int cap = m->m_pkthdr.rcvif->if_capenable;
2555 #endif
2556 	uint16_t c, etype;
2557 
2558 
2559 	eh = mtod(m, struct ether_header *);
2560 	etype = ntohs(eh->ether_type);
2561 	switch (etype) {
2562 #ifdef INET
2563 	case ETHERTYPE_IP:
2564 		if ((cap & IFCAP_RXCSUM) == 0)
2565 			return (1);
2566 		ip = (struct ip *)(eh + 1);
2567 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2568 			return (1);
2569 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2570 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2571 				    (ip->ip_hl << 2) + ip->ip_p));
2572 		c ^= 0xffff;
2573 		break;
2574 #endif
2575 #ifdef INET6
2576 	case ETHERTYPE_IPV6:
2577 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2578 			return (1);
2579 		c = mxge_rx_csum6((eh + 1), m, csum);
2580 		break;
2581 #endif
2582 	default:
2583 		c = 1;
2584 	}
2585 	return (c);
2586 }
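
/*
 * Reading of the verification above (descriptive only): the firmware
 * checksum covers everything past the 14-byte Ethernet header, i.e. the
 * IP header plus the TCP/UDP segment.  A valid IPv4 header sums to
 * 0xffff, the ones-complement zero, so it drops out; folding in the
 * pseudo header via in_pseudo() should then yield 0xffff for an intact
 * segment, and the final c ^= 0xffff maps "good" onto the 0 that the
 * rx-done paths test for.
 */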
2587 
2588 static void
2589 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2590 {
2591 	struct ether_vlan_header *evl;
2592 	struct ether_header *eh;
2593 	uint32_t partial;
2594 
2595 	evl = mtod(m, struct ether_vlan_header *);
2596 	eh = mtod(m, struct ether_header *);
2597 
2598 	/*
2599 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2600 	 * after what the firmware thought was the end of the ethernet
2601 	 * header.
2602 	 */
2603 
2604 	/* put checksum into host byte order */
2605 	*csum = ntohs(*csum);
2606 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2607 	(*csum) += ~partial;
2608 	(*csum) +=  ((*csum) < ~partial);
2609 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2610 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2611 
2612 	/* restore checksum to network byte order;
2613 	   later consumers expect this */
2614 	*csum = htons(*csum);
2615 
2616 	/* save the tag */
2617 #ifdef MXGE_NEW_VLAN_API
2618 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2619 #else
2620 	{
2621 		struct m_tag *mtag;
2622 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2623 				   M_NOWAIT);
2624 		if (mtag == NULL)
2625 			return;
2626 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2627 		m_tag_prepend(m, mtag);
2628 	}
2629 
2630 #endif
2631 	m->m_flags |= M_VLANTAG;
2632 
2633 	/*
2634 	 * Remove the 802.1q header by copying the Ethernet
2635 	 * addresses over it and adjusting the beginning of
2636 	 * the data in the mbuf.  The encapsulated Ethernet
2637 	 * type field is already in place.
2638 	 */
2639 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2640 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2641 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2642 }
2643 
2644 
2645 static inline void
2646 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2647 		 uint32_t csum, int lro)
2648 {
2649 	mxge_softc_t *sc;
2650 	struct ifnet *ifp;
2651 	struct mbuf *m;
2652 	struct ether_header *eh;
2653 	mxge_rx_ring_t *rx;
2654 	bus_dmamap_t old_map;
2655 	int idx;
2656 
2657 	sc = ss->sc;
2658 	ifp = sc->ifp;
2659 	rx = &ss->rx_big;
2660 	idx = rx->cnt & rx->mask;
2661 	rx->cnt += rx->nbufs;
2662 	/* save a pointer to the received mbuf */
2663 	m = rx->info[idx].m;
2664 	/* try to replace the received mbuf */
2665 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2666 		/* drop the frame -- the old mbuf is re-cycled */
2667 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2668 		return;
2669 	}
2670 
2671 	/* unmap the received buffer */
2672 	old_map = rx->info[idx].map;
2673 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2674 	bus_dmamap_unload(rx->dmat, old_map);
2675 
2676 	/* swap the bus_dmamap_t's */
2677 	rx->info[idx].map = rx->extra_map;
2678 	rx->extra_map = old_map;
2679 
2680 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2681 	 * aligned */
2682 	m->m_data += MXGEFW_PAD;
2683 
2684 	m->m_pkthdr.rcvif = ifp;
2685 	m->m_len = m->m_pkthdr.len = len;
2686 	ss->ipackets++;
2687 	eh = mtod(m, struct ether_header *);
2688 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2689 		mxge_vlan_tag_remove(m, &csum);
2690 	}
2691 	/* flowid only valid if RSS hashing is enabled */
2692 	if (sc->num_slices > 1) {
2693 		m->m_pkthdr.flowid = (ss - sc->ss);
2694 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2695 	}
2696 	/* if the checksum is valid, mark it in the mbuf header */
2697 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2698 	    (0 == mxge_rx_csum(m, csum))) {
2699 		/* Tell the stack that the checksum is good */
2700 		m->m_pkthdr.csum_data = 0xffff;
2701 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2702 			CSUM_DATA_VALID;
2703 
2704 #if defined(INET) || defined (INET6)
2705 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2706 			return;
2707 #endif
2708 	}
2709 	/* pass the frame up the stack */
2710 	(*ifp->if_input)(ifp, m);
2711 }
2712 
2713 static inline void
2714 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2715 		   uint32_t csum, int lro)
2716 {
2717 	mxge_softc_t *sc;
2718 	struct ifnet *ifp;
2719 	struct ether_header *eh;
2720 	struct mbuf *m;
2721 	mxge_rx_ring_t *rx;
2722 	bus_dmamap_t old_map;
2723 	int idx;
2724 
2725 	sc = ss->sc;
2726 	ifp = sc->ifp;
2727 	rx = &ss->rx_small;
2728 	idx = rx->cnt & rx->mask;
2729 	rx->cnt++;
2730 	/* save a pointer to the received mbuf */
2731 	m = rx->info[idx].m;
2732 	/* try to replace the received mbuf */
2733 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2734 		/* drop the frame -- the old mbuf is re-cycled */
2735 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2736 		return;
2737 	}
2738 
2739 	/* unmap the received buffer */
2740 	old_map = rx->info[idx].map;
2741 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2742 	bus_dmamap_unload(rx->dmat, old_map);
2743 
2744 	/* swap the bus_dmamap_t's */
2745 	rx->info[idx].map = rx->extra_map;
2746 	rx->extra_map = old_map;
2747 
2748 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2749 	 * aligned */
2750 	m->m_data += MXGEFW_PAD;
2751 
2752 	m->m_pkthdr.rcvif = ifp;
2753 	m->m_len = m->m_pkthdr.len = len;
2754 	ss->ipackets++;
2755 	eh = mtod(m, struct ether_header *);
2756 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2757 		mxge_vlan_tag_remove(m, &csum);
2758 	}
2759 	/* flowid only valid if RSS hashing is enabled */
2760 	if (sc->num_slices > 1) {
2761 		m->m_pkthdr.flowid = (ss - sc->ss);
2762 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2763 	}
2764 	/* if the checksum is valid, mark it in the mbuf header */
2765 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2766 	    (0 == mxge_rx_csum(m, csum))) {
2767 		/* Tell the stack that the checksum is good */
2768 		m->m_pkthdr.csum_data = 0xffff;
2769 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2770 			CSUM_DATA_VALID;
2771 
2772 #if defined(INET) || defined (INET6)
2773 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2774 			return;
2775 #endif
2776 	}
2777 	/* pass the frame up the stack */
2778 	(*ifp->if_input)(ifp, m);
2779 }
2780 
2781 static inline void
2782 mxge_clean_rx_done(struct mxge_slice_state *ss)
2783 {
2784 	mxge_rx_done_t *rx_done = &ss->rx_done;
2785 	int limit = 0;
2786 	uint16_t length;
2787 	uint16_t checksum;
2788 	int lro;
2789 
2790 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2791 	while (rx_done->entry[rx_done->idx].length != 0) {
2792 		length = ntohs(rx_done->entry[rx_done->idx].length);
2793 		rx_done->entry[rx_done->idx].length = 0;
2794 		checksum = rx_done->entry[rx_done->idx].checksum;
2795 		if (length <= (MHLEN - MXGEFW_PAD))
2796 			mxge_rx_done_small(ss, length, checksum, lro);
2797 		else
2798 			mxge_rx_done_big(ss, length, checksum, lro);
2799 		rx_done->cnt++;
2800 		rx_done->idx = rx_done->cnt & rx_done->mask;
2801 
2802 		/* limit potential for livelock */
2803 		if (__predict_false(++limit > rx_done->mask / 2))
2804 			break;
2805 	}
2806 #if defined(INET)  || defined (INET6)
2807 	tcp_lro_flush_all(&ss->lc);
2808 #endif
2809 }
2810 
2811 
2812 static inline void
2813 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2814 {
2815 	struct ifnet *ifp;
2816 	mxge_tx_ring_t *tx;
2817 	struct mbuf *m;
2818 	bus_dmamap_t map;
2819 	int idx;
2820 	int *flags;
2821 
2822 	tx = &ss->tx;
2823 	ifp = ss->sc->ifp;
2824 	while (tx->pkt_done != mcp_idx) {
2825 		idx = tx->done & tx->mask;
2826 		tx->done++;
2827 		m = tx->info[idx].m;
2828 		/* mbuf and DMA map only attached to the first
2829 		   segment per-mbuf */
2830 		if (m != NULL) {
2831 			ss->obytes += m->m_pkthdr.len;
2832 			if (m->m_flags & M_MCAST)
2833 				ss->omcasts++;
2834 			ss->opackets++;
2835 			tx->info[idx].m = NULL;
2836 			map = tx->info[idx].map;
2837 			bus_dmamap_unload(tx->dmat, map);
2838 			m_freem(m);
2839 		}
2840 		if (tx->info[idx].flag) {
2841 			tx->info[idx].flag = 0;
2842 			tx->pkt_done++;
2843 		}
2844 	}
2845 
2846 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2847 	   it's OK to send packets */
2848 #ifdef IFNET_BUF_RING
2849 	flags = &ss->if_drv_flags;
2850 #else
2851 	flags = &ifp->if_drv_flags;
2852 #endif
2853 	mtx_lock(&ss->tx.mtx);
2854 	if ((*flags) & IFF_DRV_OACTIVE &&
2855 	    tx->req - tx->done < (tx->mask + 1)/4) {
2856 		*(flags) &= ~IFF_DRV_OACTIVE;
2857 		ss->tx.wake++;
2858 		mxge_start_locked(ss);
2859 	}
2860 #ifdef IFNET_BUF_RING
2861 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2862 		/* let the NIC stop polling this queue, since there
2863 		 * are no more transmits pending */
2864 		*tx->send_stop = 1;
2865 		tx->queue_active = 0;
2866 		tx->deactivate++;
2867 		wmb();
2868 	}
2871 #endif
2872 	mtx_unlock(&ss->tx.mtx);
2873 
2874 }
2875 
2876 static struct mxge_media_type mxge_xfp_media_types[] =
2877 {
2878 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2879 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2880 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2881 	{0,		(1 << 5),	"10GBASE-ER"},
2882 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2883 	{0,		(1 << 3),	"10GBASE-SW"},
2884 	{0,		(1 << 2),	"10GBASE-LW"},
2885 	{0,		(1 << 1),	"10GBASE-EW"},
2886 	{0,		(1 << 0),	"Reserved"}
2887 };
2888 static struct mxge_media_type mxge_sfp_media_types[] =
2889 {
2890 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2891 	{0,		(1 << 7),	"Reserved"},
2892 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2893 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2894 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2895 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2896 };
2897 
2898 static void
2899 mxge_media_set(mxge_softc_t *sc, int media_type)
2900 {
2901 
2902 
2903 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2904 		    0, NULL);
2905 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2906 	sc->current_media = media_type;
2907 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2908 }
2909 
2910 static void
2911 mxge_media_init(mxge_softc_t *sc)
2912 {
2913 	char *ptr;
2914 	int i;
2915 
2916 	ifmedia_removeall(&sc->media);
2917 	mxge_media_set(sc, IFM_AUTO);
2918 
2919 	/*
2920 	 * parse the product code to determine the interface type
2921 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2922 	 * after the 3rd dash in the driver's cached copy of the
2923 	 * EEPROM's product code string.
2924 	 */
2925 	ptr = sc->product_code_string;
2926 	if (ptr == NULL) {
2927 		device_printf(sc->dev, "Missing product code\n");
2928 		return;
2929 	}
2930 
2931 	for (i = 0; i < 3; i++, ptr++) {
2932 		ptr = strchr(ptr, '-');
2933 		if (ptr == NULL) {
2934 			device_printf(sc->dev,
2935 				      "only %d dashes in PC?!?\n", i);
2936 			return;
2937 		}
2938 	}
2939 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2940 		/* -C is CX4 */
2941 		sc->connector = MXGE_CX4;
2942 		mxge_media_set(sc, IFM_10G_CX4);
2943 	} else if (*ptr == 'Q') {
2944 		/* -Q is Quad Ribbon Fiber */
2945 		sc->connector = MXGE_QRF;
2946 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2947 		/* FreeBSD has no media type for Quad ribbon fiber */
2948 	} else if (*ptr == 'R') {
2949 		/* -R is XFP */
2950 		sc->connector = MXGE_XFP;
2951 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2952 		/* -S or -2S is SFP+ */
2953 		sc->connector = MXGE_SFP;
2954 	} else {
2955 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2956 	}
2957 }
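
/*
 * Hypothetical example (this product code is made up purely for
 * illustration): given "AAA-BBB-CCC-2S", the loop above steps just past
 * the third dash, leaving ptr at "2S"; *ptr is not 'S', but the
 * *(ptr + 1) == 'S' test matches, so the connector is classified as
 * SFP+, per the -S / -2S convention noted in the comments.
 */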
2958 
2959 /*
2960  * Determine the media type for a NIC.  Some XFPs will identify
2961  * themselves only when their link is up, so this is initiated via a
2962  * link up interrupt.  However, this can potentially take up to
2963  * several milliseconds, so it is run via the watchdog routine, rather
2964  * than in the interrupt handler itself.
2965  */
2966 static void
2967 mxge_media_probe(mxge_softc_t *sc)
2968 {
2969 	mxge_cmd_t cmd;
2970 	char *cage_type;
2971 
2972 	struct mxge_media_type *mxge_media_types = NULL;
2973 	int i, err, ms, mxge_media_type_entries;
2974 	uint32_t byte;
2975 
2976 	sc->need_media_probe = 0;
2977 
2978 	if (sc->connector == MXGE_XFP) {
2979 		/* -R is XFP */
2980 		mxge_media_types = mxge_xfp_media_types;
2981 		mxge_media_type_entries =
2982 			nitems(mxge_xfp_media_types);
2983 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2984 		cage_type = "XFP";
2985 	} else 	if (sc->connector == MXGE_SFP) {
2986 		/* -S or -2S is SFP+ */
2987 		mxge_media_types = mxge_sfp_media_types;
2988 		mxge_media_type_entries =
2989 			nitems(mxge_sfp_media_types);
2990 		cage_type = "SFP+";
2991 		byte = 3;
2992 	} else {
2993 		/* nothing to do; media type cannot change */
2994 		return;
2995 	}
2996 
2997 	/*
2998 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2999 	 * now we try to determine what is in the cage by using the
3000 	 * firmware's I2C commands to read the module's 10GbE compliance
3001 	 * register.  We read just one byte, which may take over
3002 	 * a millisecond.
3003 	 */
3004 
3005 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3006 	cmd.data1 = byte;
3007 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3008 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3009 		device_printf(sc->dev, "failed to read XFP\n");
3010 	}
3011 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3012 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3013 	}
3014 	if (err != MXGEFW_CMD_OK) {
3015 		return;
3016 	}
3017 
3018 	/* now we wait for the data to be cached */
3019 	cmd.data0 = byte;
3020 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3021 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3022 		DELAY(1000);
3023 		cmd.data0 = byte;
3024 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3025 	}
3026 	if (err != MXGEFW_CMD_OK) {
3027 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3028 			      cage_type, err, ms);
3029 		return;
3030 	}
3031 
3032 	if (cmd.data0 == mxge_media_types[0].bitmask) {
3033 		if (mxge_verbose)
3034 			device_printf(sc->dev, "%s:%s\n", cage_type,
3035 				      mxge_media_types[0].name);
3036 		if (sc->current_media != mxge_media_types[0].flag) {
3037 			mxge_media_init(sc);
3038 			mxge_media_set(sc, mxge_media_types[0].flag);
3039 		}
3040 		return;
3041 	}
3042 	for (i = 1; i < mxge_media_type_entries; i++) {
3043 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3044 			if (mxge_verbose)
3045 				device_printf(sc->dev, "%s:%s\n",
3046 					      cage_type,
3047 					      mxge_media_types[i].name);
3048 
3049 			if (sc->current_media != mxge_media_types[i].flag) {
3050 				mxge_media_init(sc);
3051 				mxge_media_set(sc, mxge_media_types[i].flag);
3052 			}
3053 			return;
3054 		}
3055 	}
3056 	if (mxge_verbose)
3057 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3058 			      cage_type, cmd.data0);
3059 
3060 	return;
3061 }
3062 
3063 static void
3064 mxge_intr(void *arg)
3065 {
3066 	struct mxge_slice_state *ss = arg;
3067 	mxge_softc_t *sc = ss->sc;
3068 	mcp_irq_data_t *stats = ss->fw_stats;
3069 	mxge_tx_ring_t *tx = &ss->tx;
3070 	mxge_rx_done_t *rx_done = &ss->rx_done;
3071 	uint32_t send_done_count;
3072 	uint8_t valid;
3073 
3074 
3075 #ifndef IFNET_BUF_RING
3076 	/* an interrupt on a non-zero slice is implicitly valid
3077 	   since MSI-X irqs are not shared */
3078 	if (ss != sc->ss) {
3079 		mxge_clean_rx_done(ss);
3080 		*ss->irq_claim = be32toh(3);
3081 		return;
3082 	}
3083 #endif
3084 
3085 	/* make sure the DMA has finished */
3086 	if (!stats->valid) {
3087 		return;
3088 	}
3089 	valid = stats->valid;
3090 
3091 	if (sc->legacy_irq) {
3092 		/* lower legacy IRQ  */
3093 		*sc->irq_deassert = 0;
3094 		if (!mxge_deassert_wait)
3095 			/* don't wait for conf. that irq is low */
3096 			stats->valid = 0;
3097 	} else {
3098 		stats->valid = 0;
3099 	}
3100 
3101 	/* loop while waiting for legacy irq deassertion */
3102 	do {
3103 		/* check for transmit completes and receives */
3104 		send_done_count = be32toh(stats->send_done_count);
3105 		while ((send_done_count != tx->pkt_done) ||
3106 		       (rx_done->entry[rx_done->idx].length != 0)) {
3107 			if (send_done_count != tx->pkt_done)
3108 				mxge_tx_done(ss, (int)send_done_count);
3109 			mxge_clean_rx_done(ss);
3110 			send_done_count = be32toh(stats->send_done_count);
3111 		}
3112 		if (sc->legacy_irq && mxge_deassert_wait)
3113 			wmb();
3114 	} while (*((volatile uint8_t *) &stats->valid));
3115 
3116 	/* fw link & error stats meaningful only on the first slice */
3117 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3118 		if (sc->link_state != stats->link_up) {
3119 			sc->link_state = stats->link_up;
3120 			if (sc->link_state) {
3121 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3122 				if (mxge_verbose)
3123 					device_printf(sc->dev, "link up\n");
3124 			} else {
3125 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3126 				if (mxge_verbose)
3127 					device_printf(sc->dev, "link down\n");
3128 			}
3129 			sc->need_media_probe = 1;
3130 		}
3131 		if (sc->rdma_tags_available !=
3132 		    be32toh(stats->rdma_tags_available)) {
3133 			sc->rdma_tags_available =
3134 				be32toh(stats->rdma_tags_available);
3135 			device_printf(sc->dev, "RDMA timed out! %d tags "
3136 				      "left\n", sc->rdma_tags_available);
3137 		}
3138 
3139 		if (stats->link_down) {
3140 			sc->down_cnt += stats->link_down;
3141 			sc->link_state = 0;
3142 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3143 		}
3144 	}
3145 
3146 	/* check to see if we have rx token to pass back */
3147 	if (valid & 0x1)
3148 		*ss->irq_claim = be32toh(3);
3149 	*(ss->irq_claim + 1) = be32toh(3);
3150 }
3151 
3152 static void
3153 mxge_init(void *arg)
3154 {
3155 	mxge_softc_t *sc = arg;
3156 	struct ifnet *ifp = sc->ifp;
3157 
3158 
3159 	mtx_lock(&sc->driver_mtx);
3160 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3161 		(void) mxge_open(sc);
3162 	mtx_unlock(&sc->driver_mtx);
3163 }
3164 
3165 
3166 
3167 static void
3168 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3169 {
3170 	int i;
3171 
3172 #if defined(INET) || defined(INET6)
3173 	tcp_lro_free(&ss->lc);
3174 #endif
3175 	for (i = 0; i <= ss->rx_big.mask; i++) {
3176 		if (ss->rx_big.info[i].m == NULL)
3177 			continue;
3178 		bus_dmamap_unload(ss->rx_big.dmat,
3179 				  ss->rx_big.info[i].map);
3180 		m_freem(ss->rx_big.info[i].m);
3181 		ss->rx_big.info[i].m = NULL;
3182 	}
3183 
3184 	for (i = 0; i <= ss->rx_small.mask; i++) {
3185 		if (ss->rx_small.info[i].m == NULL)
3186 			continue;
3187 		bus_dmamap_unload(ss->rx_small.dmat,
3188 				  ss->rx_small.info[i].map);
3189 		m_freem(ss->rx_small.info[i].m);
3190 		ss->rx_small.info[i].m = NULL;
3191 	}
3192 
3193 	/* transmit ring used only on the first slice */
3194 	if (ss->tx.info == NULL)
3195 		return;
3196 
3197 	for (i = 0; i <= ss->tx.mask; i++) {
3198 		ss->tx.info[i].flag = 0;
3199 		if (ss->tx.info[i].m == NULL)
3200 			continue;
3201 		bus_dmamap_unload(ss->tx.dmat,
3202 				  ss->tx.info[i].map);
3203 		m_freem(ss->tx.info[i].m);
3204 		ss->tx.info[i].m = NULL;
3205 	}
3206 }
3207 
3208 static void
3209 mxge_free_mbufs(mxge_softc_t *sc)
3210 {
3211 	int slice;
3212 
3213 	for (slice = 0; slice < sc->num_slices; slice++)
3214 		mxge_free_slice_mbufs(&sc->ss[slice]);
3215 }
3216 
3217 static void
3218 mxge_free_slice_rings(struct mxge_slice_state *ss)
3219 {
3220 	int i;
3221 
3222 
3223 	if (ss->rx_done.entry != NULL)
3224 		mxge_dma_free(&ss->rx_done.dma);
3225 	ss->rx_done.entry = NULL;
3226 
3227 	if (ss->tx.req_bytes != NULL)
3228 		free(ss->tx.req_bytes, M_DEVBUF);
3229 	ss->tx.req_bytes = NULL;
3230 
3231 	if (ss->tx.seg_list != NULL)
3232 		free(ss->tx.seg_list, M_DEVBUF);
3233 	ss->tx.seg_list = NULL;
3234 
3235 	if (ss->rx_small.shadow != NULL)
3236 		free(ss->rx_small.shadow, M_DEVBUF);
3237 	ss->rx_small.shadow = NULL;
3238 
3239 	if (ss->rx_big.shadow != NULL)
3240 		free(ss->rx_big.shadow, M_DEVBUF);
3241 	ss->rx_big.shadow = NULL;
3242 
3243 	if (ss->tx.info != NULL) {
3244 		if (ss->tx.dmat != NULL) {
3245 			for (i = 0; i <= ss->tx.mask; i++) {
3246 				bus_dmamap_destroy(ss->tx.dmat,
3247 						   ss->tx.info[i].map);
3248 			}
3249 			bus_dma_tag_destroy(ss->tx.dmat);
3250 		}
3251 		free(ss->tx.info, M_DEVBUF);
3252 	}
3253 	ss->tx.info = NULL;
3254 
3255 	if (ss->rx_small.info != NULL) {
3256 		if (ss->rx_small.dmat != NULL) {
3257 			for (i = 0; i <= ss->rx_small.mask; i++) {
3258 				bus_dmamap_destroy(ss->rx_small.dmat,
3259 						   ss->rx_small.info[i].map);
3260 			}
3261 			bus_dmamap_destroy(ss->rx_small.dmat,
3262 					   ss->rx_small.extra_map);
3263 			bus_dma_tag_destroy(ss->rx_small.dmat);
3264 		}
3265 		free(ss->rx_small.info, M_DEVBUF);
3266 	}
3267 	ss->rx_small.info = NULL;
3268 
3269 	if (ss->rx_big.info != NULL) {
3270 		if (ss->rx_big.dmat != NULL) {
3271 			for (i = 0; i <= ss->rx_big.mask; i++) {
3272 				bus_dmamap_destroy(ss->rx_big.dmat,
3273 						   ss->rx_big.info[i].map);
3274 			}
3275 			bus_dmamap_destroy(ss->rx_big.dmat,
3276 					   ss->rx_big.extra_map);
3277 			bus_dma_tag_destroy(ss->rx_big.dmat);
3278 		}
3279 		free(ss->rx_big.info, M_DEVBUF);
3280 	}
3281 	ss->rx_big.info = NULL;
3282 }
3283 
3284 static void
3285 mxge_free_rings(mxge_softc_t *sc)
3286 {
3287 	int slice;
3288 
3289 	for (slice = 0; slice < sc->num_slices; slice++)
3290 		mxge_free_slice_rings(&sc->ss[slice]);
3291 }
3292 
3293 static int
3294 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3295 		       int tx_ring_entries)
3296 {
3297 	mxge_softc_t *sc = ss->sc;
3298 	size_t bytes;
3299 	int err, i;
3300 
3301 	/* allocate per-slice receive resources */
3302 
3303 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3304 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3305 
3306 	/* allocate the rx shadow rings */
3307 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3308 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3309 
3310 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3311 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3312 
3313 	/* allocate the rx host info rings */
3314 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3315 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3316 
3317 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3318 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3319 
3320 	/* allocate the rx busdma resources */
3321 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3322 				 1,			/* alignment */
3323 				 4096,			/* boundary */
3324 				 BUS_SPACE_MAXADDR,	/* low */
3325 				 BUS_SPACE_MAXADDR,	/* high */
3326 				 NULL, NULL,		/* filter */
3327 				 MHLEN,			/* maxsize */
3328 				 1,			/* num segs */
3329 				 MHLEN,			/* maxsegsize */
3330 				 BUS_DMA_ALLOCNOW,	/* flags */
3331 				 NULL, NULL,		/* lock */
3332 				 &ss->rx_small.dmat);	/* tag */
3333 	if (err != 0) {
3334 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3335 			      err);
3336 		return err;
3337 	}
3338 
3339 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3340 				 1,			/* alignment */
3341 #if MXGE_VIRT_JUMBOS
3342 				 4096,			/* boundary */
3343 #else
3344 				 0,			/* boundary */
3345 #endif
3346 				 BUS_SPACE_MAXADDR,	/* low */
3347 				 BUS_SPACE_MAXADDR,	/* high */
3348 				 NULL, NULL,		/* filter */
3349 				 3*4096,		/* maxsize */
3350 #if MXGE_VIRT_JUMBOS
3351 				 3,			/* num segs */
3352 				 4096,			/* maxsegsize*/
3353 #else
3354 				 1,			/* num segs */
3355 				 MJUM9BYTES,		/* maxsegsize*/
3356 #endif
3357 				 BUS_DMA_ALLOCNOW,	/* flags */
3358 				 NULL, NULL,		/* lock */
3359 				 &ss->rx_big.dmat);	/* tag */
3360 	if (err != 0) {
3361 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3362 			      err);
3363 		return err;
3364 	}
3365 	for (i = 0; i <= ss->rx_small.mask; i++) {
3366 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3367 					&ss->rx_small.info[i].map);
3368 		if (err != 0) {
3369 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3370 				      err);
3371 			return err;
3372 		}
3373 	}
3374 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3375 				&ss->rx_small.extra_map);
3376 	if (err != 0) {
3377 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3378 			      err);
3379 		return err;
3380 	}
3381 
3382 	for (i = 0; i <= ss->rx_big.mask; i++) {
3383 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3384 					&ss->rx_big.info[i].map);
3385 		if (err != 0) {
3386 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3387 				      err);
3388 			return err;
3389 		}
3390 	}
3391 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3392 				&ss->rx_big.extra_map);
3393 	if (err != 0) {
3394 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3395 			      err);
3396 		return err;
3397 	}
3398 
3399 	/* now allocate TX resources */
3400 
3401 #ifndef IFNET_BUF_RING
3402 	/* only use a single TX ring for now */
3403 	if (ss != ss->sc->ss)
3404 		return 0;
3405 #endif
3406 
3407 	ss->tx.mask = tx_ring_entries - 1;
3408 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3409 
3410 
3411 	/* allocate the tx request copy block */
3412 	bytes = 8 +
3413 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3414 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3415 	/* ensure req_list entries are aligned to 8 bytes */
3416 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3417 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
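	/* e.g. if req_bytes were handed back at an address ending in
	 * ...5, adding 7 gives ...c and masking with ~7UL rounds down to
	 * ...8, the first 8-byte boundary at or above the allocation */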
3418 
3419 	/* allocate the tx busdma segment list */
3420 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3421 	ss->tx.seg_list = (bus_dma_segment_t *)
3422 		malloc(bytes, M_DEVBUF, M_WAITOK);
3423 
3424 	/* allocate the tx host info ring */
3425 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3426 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3427 
3428 	/* allocate the tx busdma resources */
3429 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3430 				 1,			/* alignment */
3431 				 sc->tx_boundary,	/* boundary */
3432 				 BUS_SPACE_MAXADDR,	/* low */
3433 				 BUS_SPACE_MAXADDR,	/* high */
3434 				 NULL, NULL,		/* filter */
3435 				 65536 + 256,		/* maxsize */
3436 				 ss->tx.max_desc - 2,	/* num segs */
3437 				 sc->tx_boundary,	/* maxsegsz */
3438 				 BUS_DMA_ALLOCNOW,	/* flags */
3439 				 NULL, NULL,		/* lock */
3440 				 &ss->tx.dmat);		/* tag */
3441 
3442 	if (err != 0) {
3443 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3444 			      err);
3445 		return err;
3446 	}
3447 
3448 	/* now use these tags to setup dmamaps for each slot
3449 	   in the ring */
3450 	for (i = 0; i <= ss->tx.mask; i++) {
3451 		err = bus_dmamap_create(ss->tx.dmat, 0,
3452 					&ss->tx.info[i].map);
3453 		if (err != 0) {
3454 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3455 				      err);
3456 			return err;
3457 		}
3458 	}
3459 	return 0;
3460 
3461 }
3462 
3463 static int
3464 mxge_alloc_rings(mxge_softc_t *sc)
3465 {
3466 	mxge_cmd_t cmd;
3467 	int tx_ring_size;
3468 	int tx_ring_entries, rx_ring_entries;
3469 	int err, slice;
3470 
3471 	/* get ring sizes */
3472 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3473 	tx_ring_size = cmd.data0;
3474 	if (err != 0) {
3475 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3476 		goto abort;
3477 	}
3478 
3479 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3480 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3481 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3482 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3483 	IFQ_SET_READY(&sc->ifp->if_snd);
3484 
3485 	for (slice = 0; slice < sc->num_slices; slice++) {
3486 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3487 					     rx_ring_entries,
3488 					     tx_ring_entries);
3489 		if (err != 0)
3490 			goto abort;
3491 	}
3492 	return 0;
3493 
3494 abort:
3495 	mxge_free_rings(sc);
3496 	return err;
3497 
3498 }
3499 
3500 
3501 static void
3502 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3503 {
3504 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3505 
3506 	if (bufsize < MCLBYTES) {
3507 		/* easy, everything fits in a single buffer */
3508 		*big_buf_size = MCLBYTES;
3509 		*cl_size = MCLBYTES;
3510 		*nbufs = 1;
3511 		return;
3512 	}
3513 
3514 	if (bufsize < MJUMPAGESIZE) {
3515 		/* still easy, everything still fits in a single buffer */
3516 		*big_buf_size = MJUMPAGESIZE;
3517 		*cl_size = MJUMPAGESIZE;
3518 		*nbufs = 1;
3519 		return;
3520 	}
3521 #if MXGE_VIRT_JUMBOS
3522 	/* now we need to use virtually contiguous buffers */
3523 	*cl_size = MJUM9BYTES;
3524 	*big_buf_size = 4096;
3525 	*nbufs = mtu / 4096 + 1;
3526 	/* needs to be a power of two, so round up */
3527 	if (*nbufs == 3)
3528 		*nbufs = 4;
3529 #else
3530 	*cl_size = MJUM9BYTES;
3531 	*big_buf_size = MJUM9BYTES;
3532 	*nbufs = 1;
3533 #endif
3534 }
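
/*
 * Worked example (hypothetical MTU, assuming 4KB pages so that
 * MJUMPAGESIZE is 4096): at mtu = 9000, bufsize = 9000 + 14 (Ethernet)
 * + 4 (VLAN shim) + 2 (MXGEFW_PAD) = 9020, which exceeds both cluster
 * sizes handled above, so the 9KB path is taken: one MJUM9BYTES cluster
 * per frame, or with MXGE_VIRT_JUMBOS three 4096-byte pieces rounded up
 * to nbufs = 4 to keep the count a power of two (9000 / 4096 + 1 == 3).
 */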
3535 
3536 static int
3537 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3538 {
3539 	mxge_softc_t *sc;
3540 	mxge_cmd_t cmd;
3541 	bus_dmamap_t map;
3542 	int err, i, slice;
3543 
3544 
3545 	sc = ss->sc;
3546 	slice = ss - sc->ss;
3547 
3548 #if defined(INET) || defined(INET6)
3549 	(void)tcp_lro_init(&ss->lc);
3550 #endif
3551 	ss->lc.ifp = sc->ifp;
3552 
3553 	/* get the lanai pointers to the send and receive rings */
3554 
3555 	err = 0;
3556 #ifndef IFNET_BUF_RING
3557 	/* We currently only send from the first slice */
3558 	if (slice == 0) {
3559 #endif
3560 		cmd.data0 = slice;
3561 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3562 		ss->tx.lanai =
3563 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3564 		ss->tx.send_go = (volatile uint32_t *)
3565 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3566 		ss->tx.send_stop = (volatile uint32_t *)
3567 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3568 #ifndef IFNET_BUF_RING
3569 	}
3570 #endif
3571 	cmd.data0 = slice;
3572 	err |= mxge_send_cmd(sc,
3573 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3574 	ss->rx_small.lanai =
3575 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3576 	cmd.data0 = slice;
3577 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3578 	ss->rx_big.lanai =
3579 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3580 
3581 	if (err != 0) {
3582 		device_printf(sc->dev,
3583 			      "failed to get ring sizes or locations\n");
3584 		return EIO;
3585 	}
3586 
3587 	/* stock receive rings */
3588 	for (i = 0; i <= ss->rx_small.mask; i++) {
3589 		map = ss->rx_small.info[i].map;
3590 		err = mxge_get_buf_small(ss, map, i);
3591 		if (err) {
3592 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3593 				      i, ss->rx_small.mask + 1);
3594 			return ENOMEM;
3595 		}
3596 	}
3597 	for (i = 0; i <= ss->rx_big.mask; i++) {
3598 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3599 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3600 	}
3601 	ss->rx_big.nbufs = nbufs;
3602 	ss->rx_big.cl_size = cl_size;
3603 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3604 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3605 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3606 		map = ss->rx_big.info[i].map;
3607 		err = mxge_get_buf_big(ss, map, i);
3608 		if (err) {
3609 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3610 				      i, ss->rx_big.mask + 1);
3611 			return ENOMEM;
3612 		}
3613 	}
3614 	return 0;
3615 }
3616 
3617 static int
3618 mxge_open(mxge_softc_t *sc)
3619 {
3620 	mxge_cmd_t cmd;
3621 	int err, big_bytes, nbufs, slice, cl_size, i;
3622 	bus_addr_t bus;
3623 	volatile uint8_t *itable;
3624 	struct mxge_slice_state *ss;
3625 
3626 	/* Copy the MAC address in case it was overridden */
3627 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3628 
3629 	err = mxge_reset(sc, 1);
3630 	if (err != 0) {
3631 		device_printf(sc->dev, "failed to reset\n");
3632 		return EIO;
3633 	}
3634 
3635 	if (sc->num_slices > 1) {
3636 		/* setup the indirection table */
3637 		cmd.data0 = sc->num_slices;
3638 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3639 				    &cmd);
3640 
3641 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3642 				     &cmd);
3643 		if (err != 0) {
3644 			device_printf(sc->dev,
3645 				      "failed to setup rss tables\n");
3646 			return err;
3647 		}
3648 
3649 		/* just enable an identity mapping */
3650 		itable = sc->sram + cmd.data0;
3651 		for (i = 0; i < sc->num_slices; i++)
3652 			itable[i] = (uint8_t)i;
3653 
3654 		cmd.data0 = 1;
3655 		cmd.data1 = mxge_rss_hash_type;
3656 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3657 		if (err != 0) {
3658 			device_printf(sc->dev, "failed to enable slices\n");
3659 			return err;
3660 		}
3661 	}
3662 
3663 
3664 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3665 
3666 	cmd.data0 = nbufs;
3667 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3668 			    &cmd);
3669 	/* error is only meaningful if we're trying to set
3670 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3671 	if (err && nbufs > 1) {
3672 		device_printf(sc->dev,
3673 			      "Failed to set alway-use-n to %d\n",
3674 			      nbufs);
3675 		return EIO;
3676 	}
3677 	/* Give the firmware the mtu and the big and small buffer
3678 	   sizes.  The firmware wants the big buf size to be a power
3679 	   of two. Luckily, FreeBSD's clusters are powers of two */
3680 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3681 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3682 	cmd.data0 = MHLEN - MXGEFW_PAD;
3683 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3684 			     &cmd);
3685 	cmd.data0 = big_bytes;
3686 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3687 
3688 	if (err != 0) {
3689 		device_printf(sc->dev, "failed to setup params\n");
3690 		goto abort;
3691 	}
3692 
3693 	/* Now give the firmware the pointer to the stats block */
3694 	for (slice = 0;
3695 #ifdef IFNET_BUF_RING
3696 	     slice < sc->num_slices;
3697 #else
3698 	     slice < 1;
3699 #endif
3700 	     slice++) {
3701 		ss = &sc->ss[slice];
3702 		cmd.data0 =
3703 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3704 		cmd.data1 =
3705 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3706 		cmd.data2 = sizeof(struct mcp_irq_data);
3707 		cmd.data2 |= (slice << 16);
3708 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3709 	}
3710 
3711 	if (err != 0) {
3712 		bus = sc->ss->fw_stats_dma.bus_addr;
3713 		bus += offsetof(struct mcp_irq_data, send_done_count);
3714 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3715 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3716 		err = mxge_send_cmd(sc,
3717 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3718 				    &cmd);
3719 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3720 		sc->fw_multicast_support = 0;
3721 	} else {
3722 		sc->fw_multicast_support = 1;
3723 	}
3724 
3725 	if (err != 0) {
3726 		device_printf(sc->dev, "failed to setup params\n");
3727 		goto abort;
3728 	}
3729 
3730 	for (slice = 0; slice < sc->num_slices; slice++) {
3731 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3732 		if (err != 0) {
3733 			device_printf(sc->dev, "couldn't open slice %d\n",
3734 				      slice);
3735 			goto abort;
3736 		}
3737 	}
3738 
3739 	/* Finally, start the firmware running */
3740 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3741 	if (err) {
3742 		device_printf(sc->dev, "Couldn't bring up link\n");
3743 		goto abort;
3744 	}
3745 #ifdef IFNET_BUF_RING
3746 	for (slice = 0; slice < sc->num_slices; slice++) {
3747 		ss = &sc->ss[slice];
3748 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3749 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3750 	}
3751 #endif
3752 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3753 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3754 
3755 	return 0;
3756 
3758 abort:
3759 	mxge_free_mbufs(sc);
3760 
3761 	return err;
3762 }
3763 
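/*
 * Bring the NIC down.  A nonzero "down" argument means the NIC is
 * already quiesced (e.g. after a watchdog-detected reboot), so the
 * MXGEFW_CMD_ETHERNET_DOWN handshake is skipped; otherwise we wait
 * for the interrupt handler to advance sc->down_cnt in response to
 * the down command.
 */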
3764 static int
3765 mxge_close(mxge_softc_t *sc, int down)
3766 {
3767 	mxge_cmd_t cmd;
3768 	int err, old_down_cnt;
3769 #ifdef IFNET_BUF_RING
3770 	struct mxge_slice_state *ss;
3771 	int slice;
3772 #endif
3773 
3774 #ifdef IFNET_BUF_RING
3775 	for (slice = 0; slice < sc->num_slices; slice++) {
3776 		ss = &sc->ss[slice];
3777 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3778 	}
3779 #endif
3780 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3781 	if (!down) {
3782 		old_down_cnt = sc->down_cnt;
3783 		wmb();
3784 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3785 		if (err) {
3786 			device_printf(sc->dev,
3787 				      "Couldn't bring down link\n");
3788 		}
3789 		if (old_down_cnt == sc->down_cnt) {
3790 			/* wait for down irq */
3791 			DELAY(10 * sc->intr_coal_delay);
3792 		}
3793 		wmb();
3794 		if (old_down_cnt == sc->down_cnt) {
3795 			device_printf(sc->dev, "never got down irq\n");
3796 		}
3797 	}
3798 	mxge_free_mbufs(sc);
3799 
3800 	return 0;
3801 }
3802 
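/*
 * Tune PCIe config space.  Offsets are relative to the PCIe
 * capability: 0x12 is the Link Status register (negotiated link
 * width in bits 9:4) and 0x8 is the Device Control register, whose
 * bits 14:12 hold the max read request size (5 -> 4096 bytes).
 */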
3803 static void
3804 mxge_setup_cfg_space(mxge_softc_t *sc)
3805 {
3806 	device_t dev = sc->dev;
3807 	int reg;
3808 	uint16_t lnk, pectl;
3809 
3810 	/* find the PCIe link width and set max read request to 4KB*/
3811 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3812 		lnk = pci_read_config(dev, reg + 0x12, 2);
3813 		sc->link_width = (lnk >> 4) & 0x3f;
3814 
3815 		if (sc->pectl == 0) {
3816 			pectl = pci_read_config(dev, reg + 0x8, 2);
3817 			pectl = (pectl & ~0x7000) | (5 << 12);
3818 			pci_write_config(dev, reg + 0x8, pectl, 2);
3819 			sc->pectl = pectl;
3820 		} else {
3821 			/* restore saved pectl after watchdog reset */
3822 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3823 		}
3824 	}
3825 
3826 	/* Enable DMA and Memory space access */
3827 	pci_enable_busmaster(dev);
3828 }
3829 
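/*
 * Fetch the firmware reboot status through the vendor-specific
 * capability: enable 32-bit reads, point the window at the
 * reboot-status register (0xfffffff0), and read the data back.
 */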
3830 static uint32_t
3831 mxge_read_reboot(mxge_softc_t *sc)
3832 {
3833 	device_t dev = sc->dev;
3834 	uint32_t vs;
3835 
3836 	/* find the vendor specific offset */
3837 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3838 		device_printf(sc->dev,
3839 			      "could not find vendor specific offset\n");
3840 		return (uint32_t)-1;
3841 	}
3842 	/* enable read32 mode */
3843 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3844 	/* tell NIC which register to read */
3845 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3846 	return (pci_read_config(dev, vs + 0x14, 4));
3847 }
3848 
3849 static void
3850 mxge_watchdog_reset(mxge_softc_t *sc)
3851 {
3852 	struct pci_devinfo *dinfo;
3853 	struct mxge_slice_state *ss;
3854 	int err, running, s, num_tx_slices = 1;
3855 	uint32_t reboot;
3856 	uint16_t cmd;
3857 
3858 	err = ENXIO;
3859 
3860 	device_printf(sc->dev, "Watchdog reset!\n");
3861 
3862 	/*
3863 	 * check to see if the NIC rebooted.  If it did, then all of
3864 	 * PCI config space has been reset, and things like the
3865 	 * busmaster bit will be zero.  If this is the case, then we
3866 	 * must restore PCI config space before the NIC can be used
3867 	 * again
3868 	 */
3869 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3870 	if (cmd == 0xffff) {
3871 		/*
3872 		 * maybe the watchdog caught the NIC rebooting; wait
3873 		 * up to 100ms for it to finish.  If it does not come
3874 		 * back, then give up
3875 		 */
3876 		DELAY(1000*100);
3877 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3878 		if (cmd == 0xffff) {
3879 			device_printf(sc->dev, "NIC disappeared!\n");
3880 		}
3881 	}
3882 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3883 		/* print the reboot status */
3884 		reboot = mxge_read_reboot(sc);
3885 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3886 			      reboot);
3887 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3888 		if (running) {
3889 
3890 			/*
3891 			 * quiesce NIC so that TX routines will not try to
3892 			 * xmit after restoration of BAR
3893 			 */
3894 
3895 			/* Mark the link as down */
3896 			if (sc->link_state) {
3897 				sc->link_state = 0;
3898 				if_link_state_change(sc->ifp,
3899 						     LINK_STATE_DOWN);
3900 			}
3901 #ifdef IFNET_BUF_RING
3902 			num_tx_slices = sc->num_slices;
3903 #endif
3904 			/* grab all TX locks to ensure no tx is in progress */
3905 			for (s = 0; s < num_tx_slices; s++) {
3906 				ss = &sc->ss[s];
3907 				mtx_lock(&ss->tx.mtx);
3908 			}
3909 			mxge_close(sc, 1);
3910 		}
3911 		/* restore PCI configuration space */
3912 		dinfo = device_get_ivars(sc->dev);
3913 		pci_cfg_restore(sc->dev, dinfo);
3914 
3915 		/* and redo any changes we made to our config space */
3916 		mxge_setup_cfg_space(sc);
3917 
3918 		/* reload f/w */
3919 		err = mxge_load_firmware(sc, 0);
3920 		if (err) {
3921 			device_printf(sc->dev,
3922 				      "Unable to re-load f/w\n");
3923 		}
3924 		if (running) {
3925 			if (!err)
3926 				err = mxge_open(sc);
3927 			/* release all TX locks */
3928 			for (s = 0; s < num_tx_slices; s++) {
3929 				ss = &sc->ss[s];
3930 #ifdef IFNET_BUF_RING
3931 				mxge_start_locked(ss);
3932 #endif
3933 				mtx_unlock(&ss->tx.mtx);
3934 			}
3935 		}
3936 		sc->watchdog_resets++;
3937 	} else {
3938 		device_printf(sc->dev,
3939 			      "NIC did not reboot, not resetting\n");
3940 		err = 0;
3941 	}
3942 	if (err) {
3943 		device_printf(sc->dev, "watchdog reset failed\n");
3944 	} else {
3945 		if (sc->dying == 2)
3946 			sc->dying = 0;
3947 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3948 	}
3949 }
3950 
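/*
 * The reset is run from taskqueue context because it can busy-wait
 * for 100ms and reload firmware, which is too slow to do directly
 * from the tick callout or the watchdog check.
 */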
3951 static void
3952 mxge_watchdog_task(void *arg, int pending)
3953 {
3954 	mxge_softc_t *sc = arg;
3955 
3957 	mtx_lock(&sc->driver_mtx);
3958 	mxge_watchdog_reset(sc);
3959 	mtx_unlock(&sc->driver_mtx);
3960 }
3961 
3962 static void
3963 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3964 {
3965 	tx = &sc->ss[slice].tx;
3966 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3967 	device_printf(sc->dev,
3968 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3969 		      tx->req, tx->done, tx->queue_active);
3970 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3971 			      tx->activate, tx->deactivate);
3972 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3973 		      tx->pkt_done,
3974 		      be32toh(sc->ss->fw_stats->send_done_count));
3975 }
3976 
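/*
 * Per-tick TX hang check: a ring is suspect when requests are
 * outstanding but the done counter has not advanced since the last
 * snapshot.  If the dropped_pause counter advanced too, the link
 * partner is flow-controlling us, so warn instead of resetting.
 */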
3977 static int
3978 mxge_watchdog(mxge_softc_t *sc)
3979 {
3980 	mxge_tx_ring_t *tx;
3981 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3982 	int i, err = 0;
3983 
3984 	/* see if we have outstanding transmits, which
3985 	   have been pending for more than mxge_ticks */
3986 	for (i = 0;
3987 #ifdef IFNET_BUF_RING
3988 	     (i < sc->num_slices) && (err == 0);
3989 #else
3990 	     (i < 1) && (err == 0);
3991 #endif
3992 	     i++) {
3993 		tx = &sc->ss[i].tx;
3994 		if (tx->req != tx->done &&
3995 		    tx->watchdog_req != tx->watchdog_done &&
3996 		    tx->done == tx->watchdog_done) {
3997 			/* check for pause blocking before resetting */
3998 			if (tx->watchdog_rx_pause == rx_pause) {
3999 				mxge_warn_stuck(sc, tx, i);
4000 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4001 				return (ENXIO);
4002 			}
4003 			else
4004 				device_printf(sc->dev, "Flow control blocking "
4005 					      "xmits, check link partner\n");
4006 		}
4007 
4008 		tx->watchdog_req = tx->req;
4009 		tx->watchdog_done = tx->done;
4010 		tx->watchdog_rx_pause = rx_pause;
4011 	}
4012 
4013 	if (sc->need_media_probe)
4014 		mxge_media_probe(sc);
4015 	return (err);
4016 }
4017 
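/* Sum the per-slice counters to answer ifnet statistics queries. */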
4018 static uint64_t
4019 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4020 {
4021 	struct mxge_softc *sc;
4022 	uint64_t rv;
4023 
4024 	sc = if_getsoftc(ifp);
4025 	rv = 0;
4026 
4027 	switch (cnt) {
4028 	case IFCOUNTER_IPACKETS:
4029 		for (int s = 0; s < sc->num_slices; s++)
4030 			rv += sc->ss[s].ipackets;
4031 		return (rv);
4032 	case IFCOUNTER_OPACKETS:
4033 		for (int s = 0; s < sc->num_slices; s++)
4034 			rv += sc->ss[s].opackets;
4035 		return (rv);
4036 	case IFCOUNTER_OERRORS:
4037 		for (int s = 0; s < sc->num_slices; s++)
4038 			rv += sc->ss[s].oerrors;
4039 		return (rv);
4040 #ifdef IFNET_BUF_RING
4041 	case IFCOUNTER_OBYTES:
4042 		for (int s = 0; s < sc->num_slices; s++)
4043 			rv += sc->ss[s].obytes;
4044 		return (rv);
4045 	case IFCOUNTER_OMCASTS:
4046 		for (int s = 0; s < sc->num_slices; s++)
4047 			rv += sc->ss[s].omcasts;
4048 		return (rv);
4049 	case IFCOUNTER_OQDROPS:
4050 		for (int s = 0; s < sc->num_slices; s++)
4051 			rv += sc->ss[s].tx.br->br_drops;
4052 		return (rv);
4053 #endif
4054 	default:
4055 		return (if_get_counter_default(ifp, cnt));
4056 	}
4057 }
4058 
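/*
 * Periodic housekeeping: run the TX watchdog every 4th tick and make
 * sure an idle NIC has not lost its busmaster bit to a h/w fault.
 * Note that "pkts" is never updated below, so the idle checks and
 * the 4x slower re-arm always apply.
 */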
4059 static void
4060 mxge_tick(void *arg)
4061 {
4062 	mxge_softc_t *sc = arg;
4063 	u_long pkts = 0;
4064 	int err = 0;
4065 	int running, ticks;
4066 	uint16_t cmd;
4067 
4068 	ticks = mxge_ticks;
4069 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4070 	if (running) {
4071 		if (!sc->watchdog_countdown) {
4072 			err = mxge_watchdog(sc);
4073 			sc->watchdog_countdown = 4;
4074 		}
4075 		sc->watchdog_countdown--;
4076 	}
4077 	if (pkts == 0) {
4078 		/* ensure NIC did not suffer h/w fault while idle */
4079 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4080 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4081 			sc->dying = 2;
4082 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4083 			err = ENXIO;
4084 		}
4085 		/* look less often if NIC is idle */
4086 		ticks *= 4;
4087 	}
4088 
4089 	if (err == 0)
4090 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4091 
4092 }
4093 
4094 static int
4095 mxge_media_change(struct ifnet *ifp)
4096 {
4097 	return EINVAL;
4098 }
4099 
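/*
 * Change the MTU by closing and re-opening the running interface;
 * if the re-open fails, fall back to the old MTU and open with that.
 */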
4100 static int
4101 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4102 {
4103 	struct ifnet *ifp = sc->ifp;
4104 	int real_mtu, old_mtu;
4105 	int err = 0;
4106 
4108 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4109 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4110 		return EINVAL;
4111 	mtx_lock(&sc->driver_mtx);
4112 	old_mtu = ifp->if_mtu;
4113 	ifp->if_mtu = mtu;
4114 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4115 		mxge_close(sc, 0);
4116 		err = mxge_open(sc);
4117 		if (err != 0) {
4118 			ifp->if_mtu = old_mtu;
4119 			mxge_close(sc, 0);
4120 			(void) mxge_open(sc);
4121 		}
4122 	}
4123 	mtx_unlock(&sc->driver_mtx);
4124 	return err;
4125 }
4126 
4127 static void
4128 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4129 {
4130 	mxge_softc_t *sc = ifp->if_softc;
4131 
4133 	if (sc == NULL)
4134 		return;
4135 	ifmr->ifm_status = IFM_AVALID;
4136 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4137 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4138 	ifmr->ifm_active |= sc->current_media;
4139 }
4140 
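/*
 * Read SFP/XFP module EEPROM bytes via the firmware's I2C proxy.
 * 0xA0 and 0xA2 are the standard SFF-8472 module addresses.  Each
 * byte is started with MXGEFW_CMD_I2C_READ and then polled for up to
 * ~50ms with MXGEFW_CMD_I2C_BYTE until the firmware has it cached.
 */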
4141 static int
4142 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4143 {
4144 	mxge_cmd_t cmd;
4145 	uint32_t i2c_args;
4146 	int i, ms, err;
4147 
4149 	if (i2c->dev_addr != 0xA0 &&
4150 	    i2c->dev_addr != 0xA2)
4151 		return (EINVAL);
4152 	if (i2c->len > sizeof(i2c->data))
4153 		return (EINVAL);
4154 
4155 	for (i = 0; i < i2c->len; i++) {
4156 		i2c_args = i2c->dev_addr << 0x8;
4157 		i2c_args |= i2c->offset + i;
4158 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4159 		cmd.data1 = i2c_args;
4160 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4161 
4162 		if (err != MXGEFW_CMD_OK)
4163 			return (EIO);
4164 		/* now we wait for the data to be cached */
4165 		cmd.data0 = i2c_args & 0xff;
4166 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4167 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4168 			cmd.data0 = i2c_args & 0xff;
4169 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4170 			if (err == EBUSY)
4171 				DELAY(1000);
4172 		}
4173 		if (err != MXGEFW_CMD_OK)
4174 			return (EIO);
4175 		i2c->data[i] = cmd.data0;
4176 	}
4177 	return (0);
4178 }
4179 
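/*
 * ifnet ioctl entry point.  Driver state is protected by driver_mtx,
 * and most paths bail out with EINVAL once sc->dying is set.
 */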
4180 static int
4181 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4182 {
4183 	mxge_softc_t *sc = ifp->if_softc;
4184 	struct ifreq *ifr = (struct ifreq *)data;
4185 	struct ifi2creq i2c;
4186 	int err, mask;
4187 
4188 	err = 0;
4189 	switch (command) {
4190 	case SIOCSIFMTU:
4191 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4192 		break;
4193 
4194 	case SIOCSIFFLAGS:
4195 		mtx_lock(&sc->driver_mtx);
4196 		if (sc->dying) {
4197 			mtx_unlock(&sc->driver_mtx);
4198 			return EINVAL;
4199 		}
4200 		if (ifp->if_flags & IFF_UP) {
4201 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4202 				err = mxge_open(sc);
4203 			} else {
4204 				/* take care of promisc and allmulti
4205 				   flag changes */
4206 				mxge_change_promisc(sc,
4207 						    ifp->if_flags & IFF_PROMISC);
4208 				mxge_set_multicast_list(sc);
4209 			}
4210 		} else {
4211 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4212 				mxge_close(sc, 0);
4213 			}
4214 		}
4215 		mtx_unlock(&sc->driver_mtx);
4216 		break;
4217 
4218 	case SIOCADDMULTI:
4219 	case SIOCDELMULTI:
4220 		mtx_lock(&sc->driver_mtx);
4221 		if (sc->dying) {
4222 			mtx_unlock(&sc->driver_mtx);
4223 			return (EINVAL);
4224 		}
4225 		mxge_set_multicast_list(sc);
4226 		mtx_unlock(&sc->driver_mtx);
4227 		break;
4228 
4229 	case SIOCSIFCAP:
4230 		mtx_lock(&sc->driver_mtx);
4231 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4232 		if (mask & IFCAP_TXCSUM) {
4233 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4234 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4235 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4236 			} else {
4237 				ifp->if_capenable |= IFCAP_TXCSUM;
4238 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4239 			}
4240 		} else if (mask & IFCAP_RXCSUM) {
4241 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4242 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4243 			} else {
4244 				ifp->if_capenable |= IFCAP_RXCSUM;
4245 			}
4246 		}
4247 		if (mask & IFCAP_TSO4) {
4248 			if (IFCAP_TSO4 & ifp->if_capenable) {
4249 				ifp->if_capenable &= ~IFCAP_TSO4;
4250 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4251 				ifp->if_capenable |= IFCAP_TSO4;
4252 				ifp->if_hwassist |= CSUM_TSO;
4253 			} else {
4254 				printf("mxge requires tx checksum offload"
4255 				       " be enabled to use TSO\n");
4256 				err = EINVAL;
4257 			}
4258 		}
4259 #if IFCAP_TSO6
4260 		if (mask & IFCAP_TXCSUM_IPV6) {
4261 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4262 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4263 						       | IFCAP_TSO6);
4264 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4265 						      | CSUM_UDP);
4266 			} else {
4267 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4268 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4269 						     | CSUM_UDP_IPV6);
4270 			}
4271 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4272 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4273 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4274 			} else {
4275 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4276 			}
4277 		}
4278 		if (mask & IFCAP_TSO6) {
4279 			if (IFCAP_TSO6 & ifp->if_capenable) {
4280 				ifp->if_capenable &= ~IFCAP_TSO6;
4281 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4282 				ifp->if_capenable |= IFCAP_TSO6;
4283 				ifp->if_hwassist |= CSUM_TSO;
4284 			} else {
4285 				printf("mxge requires tx checksum offload"
4286 				       " be enabled to use TSO\n");
4287 				err = EINVAL;
4288 			}
4289 		}
4290 #endif /*IFCAP_TSO6 */
4291 
4292 		if (mask & IFCAP_LRO)
4293 			ifp->if_capenable ^= IFCAP_LRO;
4294 		if (mask & IFCAP_VLAN_HWTAGGING)
4295 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4296 		if (mask & IFCAP_VLAN_HWTSO)
4297 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4298 
4299 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4300 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4301 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4302 
4303 		mtx_unlock(&sc->driver_mtx);
4304 		VLAN_CAPABILITIES(ifp);
4305 
4306 		break;
4307 
4308 	case SIOCGIFMEDIA:
4309 		mtx_lock(&sc->driver_mtx);
4310 		if (sc->dying) {
4311 			mtx_unlock(&sc->driver_mtx);
4312 			return (EINVAL);
4313 		}
4314 		mxge_media_probe(sc);
4315 		mtx_unlock(&sc->driver_mtx);
4316 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4317 				    &sc->media, command);
4318 		break;
4319 
4320 	case SIOCGI2C:
4321 		if (sc->connector != MXGE_XFP &&
4322 		    sc->connector != MXGE_SFP) {
4323 			err = ENXIO;
4324 			break;
4325 		}
4326 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4327 		if (err != 0)
4328 			break;
4329 		mtx_lock(&sc->driver_mtx);
4330 		if (sc->dying) {
4331 			mtx_unlock(&sc->driver_mtx);
4332 			return (EINVAL);
4333 		}
4334 		err = mxge_fetch_i2c(sc, &i2c);
4335 		mtx_unlock(&sc->driver_mtx);
4336 		if (err == 0)
4337 			err = copyout(&i2c, ifr->ifr_ifru.ifru_data,
4338 			    sizeof(i2c));
4339 		break;
4340 	default:
4341 		err = ether_ioctl(ifp, command, data);
4342 		break;
4343 	}
4344 	return err;
4345 }
4346 
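/*
 * Pull driver tunables from the kernel environment and clamp them to
 * sane ranges.  These are normally set at boot time, e.g. in
 * /boot/loader.conf (illustrative values):
 *
 *	hw.mxge.max_slices="-1"
 *	hw.mxge.intr_coal_delay="30"
 */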
4347 static void
4348 mxge_fetch_tunables(mxge_softc_t *sc)
4349 {
4350 
4351 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4352 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4353 			  &mxge_flow_control);
4354 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4355 			  &mxge_intr_coal_delay);
4356 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4357 			  &mxge_nvidia_ecrc_enable);
4358 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4359 			  &mxge_force_firmware);
4360 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4361 			  &mxge_deassert_wait);
4362 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4363 			  &mxge_verbose);
4364 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4365 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4366 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4367 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4368 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4369 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4370 
4371 	if (bootverbose)
4372 		mxge_verbose = 1;
4373 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4374 		mxge_intr_coal_delay = 30;
4375 	if (mxge_ticks == 0)
4376 		mxge_ticks = hz / 2;
4377 	sc->pause = mxge_flow_control;
4378 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4379 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4380 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4381 	}
4382 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4383 	    mxge_initial_mtu < ETHER_MIN_LEN)
4384 		mxge_initial_mtu = ETHERMTU_JUMBO;
4385 
4386 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4387 		mxge_throttle = MXGE_MAX_THROTTLE;
4388 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4389 		mxge_throttle = MXGE_MIN_THROTTLE;
4390 	sc->throttle = mxge_throttle;
4391 }
4392 
4394 static void
4395 mxge_free_slices(mxge_softc_t *sc)
4396 {
4397 	struct mxge_slice_state *ss;
4398 	int i;
4399 
4401 	if (sc->ss == NULL)
4402 		return;
4403 
4404 	for (i = 0; i < sc->num_slices; i++) {
4405 		ss = &sc->ss[i];
4406 		if (ss->fw_stats != NULL) {
4407 			mxge_dma_free(&ss->fw_stats_dma);
4408 			ss->fw_stats = NULL;
4409 #ifdef IFNET_BUF_RING
4410 			if (ss->tx.br != NULL) {
4411 				drbr_free(ss->tx.br, M_DEVBUF);
4412 				ss->tx.br = NULL;
4413 			}
4414 #endif
4415 			mtx_destroy(&ss->tx.mtx);
4416 		}
4417 		if (ss->rx_done.entry != NULL) {
4418 			mxge_dma_free(&ss->rx_done.dma);
4419 			ss->rx_done.entry = NULL;
4420 		}
4421 	}
4422 	free(sc->ss, M_DEVBUF);
4423 	sc->ss = NULL;
4424 }
4425 
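/*
 * Allocate per-slice state.  The interrupt queue gets two slots per
 * receive descriptor, apparently to cover both the small and big rx
 * rings; firmware stats (including tx) are allocated only for slice
 * 0 unless IFNET_BUF_RING is enabled.
 */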
4426 static int
4427 mxge_alloc_slices(mxge_softc_t *sc)
4428 {
4429 	mxge_cmd_t cmd;
4430 	struct mxge_slice_state *ss;
4431 	size_t bytes;
4432 	int err, i, max_intr_slots;
4433 
4434 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4435 	if (err != 0) {
4436 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4437 		return err;
4438 	}
4439 	sc->rx_ring_size = cmd.data0;
4440 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4441 
4442 	bytes = sizeof (*sc->ss) * sc->num_slices;
4443 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4444 	if (sc->ss == NULL)
4445 		return (ENOMEM);
4446 	for (i = 0; i < sc->num_slices; i++) {
4447 		ss = &sc->ss[i];
4448 
4449 		ss->sc = sc;
4450 
4451 		/* allocate per-slice rx interrupt queues */
4452 
4453 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4454 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4455 		if (err != 0)
4456 			goto abort;
4457 		ss->rx_done.entry = ss->rx_done.dma.addr;
4458 		bzero(ss->rx_done.entry, bytes);
4459 
4460 		/*
4461 		 * allocate the per-slice firmware stats; stats
4462 		 * (including tx) are used only on the first
4463 		 * slice for now
4464 		 */
4465 #ifndef IFNET_BUF_RING
4466 		if (i > 0)
4467 			continue;
4468 #endif
4469 
4470 		bytes = sizeof (*ss->fw_stats);
4471 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4472 				     sizeof (*ss->fw_stats), 64);
4473 		if (err != 0)
4474 			goto abort;
4475 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4476 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4477 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4478 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4479 #ifdef IFNET_BUF_RING
4480 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4481 					   &ss->tx.mtx);
4482 #endif
4483 	}
4484 
4485 	return (0);
4486 
4487 abort:
4488 	mxge_free_slices(sc);
4489 	return (ENOMEM);
4490 }
4491 
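/*
 * Decide how many slices to use: this needs the RSS firmware, at
 * least two MSI-X vectors and an SMP system.  The count is clamped
 * to the MSI-X vector and CPU counts and rounded down to a power of
 * two; any failure falls back to a single slice.
 */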
4492 static void
4493 mxge_slice_probe(mxge_softc_t *sc)
4494 {
4495 	mxge_cmd_t cmd;
4496 	char *old_fw;
4497 	int msix_cnt, status, max_intr_slots;
4498 
4499 	sc->num_slices = 1;
4500 	/*
4501 	 *  don't enable multiple slices if the tunable has disabled them,
4502 	 *  or if this is not an SMP system
4503 	 */
4504 
4505 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4506 		return;
4507 
4508 	/* see how many MSI-X interrupts are available */
4509 	msix_cnt = pci_msix_count(sc->dev);
4510 	if (msix_cnt < 2)
4511 		return;
4512 
4513 	/* now load the slice-aware firmware and see what it supports */
4514 	old_fw = sc->fw_name;
4515 	if (old_fw == mxge_fw_aligned)
4516 		sc->fw_name = mxge_fw_rss_aligned;
4517 	else
4518 		sc->fw_name = mxge_fw_rss_unaligned;
4519 	status = mxge_load_firmware(sc, 0);
4520 	if (status != 0) {
4521 		device_printf(sc->dev, "Falling back to a single slice\n");
4522 		return;
4523 	}
4524 
4525 	/* try to send a reset command to the card to see if it
4526 	   is alive */
4527 	memset(&cmd, 0, sizeof (cmd));
4528 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4529 	if (status != 0) {
4530 		device_printf(sc->dev, "failed reset\n");
4531 		goto abort_with_fw;
4532 	}
4533 
4534 	/* get rx ring size */
4535 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4536 	if (status != 0) {
4537 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4538 		goto abort_with_fw;
4539 	}
4540 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4541 
4542 	/* tell it the size of the interrupt queues */
4543 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4544 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4545 	if (status != 0) {
4546 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4547 		goto abort_with_fw;
4548 	}
4549 
4550 	/* ask the maximum number of slices it supports */
4551 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4552 	if (status != 0) {
4553 		device_printf(sc->dev,
4554 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4555 		goto abort_with_fw;
4556 	}
4557 	sc->num_slices = cmd.data0;
4558 	if (sc->num_slices > msix_cnt)
4559 		sc->num_slices = msix_cnt;
4560 
4561 	if (mxge_max_slices == -1) {
4562 		/* cap to number of CPUs in system */
4563 		if (sc->num_slices > mp_ncpus)
4564 			sc->num_slices = mp_ncpus;
4565 	} else {
4566 		if (sc->num_slices > mxge_max_slices)
4567 			sc->num_slices = mxge_max_slices;
4568 	}
4569 	/* make sure it is a power of two */
4570 	while (sc->num_slices & (sc->num_slices - 1))
4571 		sc->num_slices--;
4572 
4573 	if (mxge_verbose)
4574 		device_printf(sc->dev, "using %d slices\n",
4575 			      sc->num_slices);
4576 
4577 	return;
4578 
4579 abort_with_fw:
4580 	sc->fw_name = old_fw;
4581 	(void) mxge_load_firmware(sc, 0);
4582 }
4583 
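/*
 * Set up one MSI-X vector per slice; the MSI-X table lives in
 * BAR 2.  Failures unwind through the abort_* labels below.
 */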
4584 static int
4585 mxge_add_msix_irqs(mxge_softc_t *sc)
4586 {
4587 	size_t bytes;
4588 	int count, err, i, rid;
4589 
4590 	rid = PCIR_BAR(2);
4591 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4592 						    &rid, RF_ACTIVE);
4593 
4594 	if (sc->msix_table_res == NULL) {
4595 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4596 		return ENXIO;
4597 	}
4598 
4599 	count = sc->num_slices;
4600 	err = pci_alloc_msix(sc->dev, &count);
4601 	if (err != 0) {
4602 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4603 			      "err = %d\n", sc->num_slices, err);
4604 		goto abort_with_msix_table;
4605 	}
4606 	if (count < sc->num_slices) {
4607 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4608 			      sc->num_slices, count);
4609 		device_printf(sc->dev,
4610 			      "Try setting hw.mxge.max_slices to %d\n",
4611 			      count);
4612 		err = ENOSPC;
4613 		goto abort_with_msix;
4614 	}
4615 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4616 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4617 	if (sc->msix_irq_res == NULL) {
4618 		err = ENOMEM;
4619 		goto abort_with_msix;
4620 	}
4621 
4622 	for (i = 0; i < sc->num_slices; i++) {
4623 		rid = i + 1;
4624 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4625 							  SYS_RES_IRQ,
4626 							  &rid, RF_ACTIVE);
4627 		if (sc->msix_irq_res[i] == NULL) {
4628 			device_printf(sc->dev, "couldn't allocate IRQ res"
4629 				      " for message %d\n", i);
4630 			err = ENXIO;
4631 			goto abort_with_res;
4632 		}
4633 	}
4634 
4635 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4636 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4637 
4638 	for (i = 0; i < sc->num_slices; i++) {
4639 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4640 				     INTR_TYPE_NET | INTR_MPSAFE,
4641 #if __FreeBSD_version > 700030
4642 				     NULL,
4643 #endif
4644 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4645 		if (err != 0) {
4646 			device_printf(sc->dev, "couldn't setup intr for "
4647 				      "message %d\n", i);
4648 			goto abort_with_intr;
4649 		}
4650 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4651 				  sc->msix_ih[i], "s%d", i);
4652 	}
4653 
4654 	if (mxge_verbose) {
4655 		device_printf(sc->dev, "using %d msix IRQs:",
4656 			      sc->num_slices);
4657 		for (i = 0; i < sc->num_slices; i++)
4658 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4659 		printf("\n");
4660 	}
4661 	return (0);
4662 
4663 abort_with_intr:
4664 	for (i = 0; i < sc->num_slices; i++) {
4665 		if (sc->msix_ih[i] != NULL) {
4666 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4667 					  sc->msix_ih[i]);
4668 			sc->msix_ih[i] = NULL;
4669 		}
4670 	}
4671 	free(sc->msix_ih, M_DEVBUF);
4672 
4674 abort_with_res:
4675 	for (i = 0; i < sc->num_slices; i++) {
4676 		rid = i + 1;
4677 		if (sc->msix_irq_res[i] != NULL)
4678 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4679 					     sc->msix_irq_res[i]);
4680 		sc->msix_irq_res[i] = NULL;
4681 	}
4682 	free(sc->msix_irq_res, M_DEVBUF);
4683 
4685 abort_with_msix:
4686 	pci_release_msi(sc->dev);
4687 
4688 abort_with_msix_table:
4689 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4690 			     sc->msix_table_res);
4691 
4692 	return err;
4693 }
4694 
4695 static int
4696 mxge_add_single_irq(mxge_softc_t *sc)
4697 {
4698 	int count, err, rid;
4699 
4700 	count = pci_msi_count(sc->dev);
4701 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4702 		rid = 1;
4703 	} else {
4704 		rid = 0;
4705 		sc->legacy_irq = 1;
4706 	}
4707 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4708 					     RF_SHAREABLE | RF_ACTIVE);
4709 	if (sc->irq_res == NULL) {
4710 		device_printf(sc->dev, "could not alloc interrupt\n");
4711 		return ENXIO;
4712 	}
4713 	if (mxge_verbose)
4714 		device_printf(sc->dev, "using %s irq %jd\n",
4715 			      sc->legacy_irq ? "INTx" : "MSI",
4716 			      rman_get_start(sc->irq_res));
4717 	err = bus_setup_intr(sc->dev, sc->irq_res,
4718 			     INTR_TYPE_NET | INTR_MPSAFE,
4719 #if __FreeBSD_version > 700030
4720 			     NULL,
4721 #endif
4722 			     mxge_intr, &sc->ss[0], &sc->ih);
4723 	if (err != 0) {
4724 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4725 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4726 		if (!sc->legacy_irq)
4727 			pci_release_msi(sc->dev);
4728 	}
4729 	return err;
4730 }
4731 
4732 static void
4733 mxge_rem_msix_irqs(mxge_softc_t *sc)
4734 {
4735 	int i, rid;
4736 
4737 	for (i = 0; i < sc->num_slices; i++) {
4738 		if (sc->msix_ih[i] != NULL) {
4739 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4740 					  sc->msix_ih[i]);
4741 			sc->msix_ih[i] = NULL;
4742 		}
4743 	}
4744 	free(sc->msix_ih, M_DEVBUF);
4745 
4746 	for (i = 0; i < sc->num_slices; i++) {
4747 		rid = i + 1;
4748 		if (sc->msix_irq_res[i] != NULL)
4749 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4750 					     sc->msix_irq_res[i]);
4751 		sc->msix_irq_res[i] = NULL;
4752 	}
4753 	free(sc->msix_irq_res, M_DEVBUF);
4754 
4755 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4756 			     sc->msix_table_res);
4757 
4758 	pci_release_msi(sc->dev);
4759 	return;
4760 }
4761 
4762 static void
4763 mxge_rem_single_irq(mxge_softc_t *sc)
4764 {
4765 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4766 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4767 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4768 	if (!sc->legacy_irq)
4769 		pci_release_msi(sc->dev);
4770 }
4771 
4772 static void
4773 mxge_rem_irq(mxge_softc_t *sc)
4774 {
4775 	if (sc->num_slices > 1)
4776 		mxge_rem_msix_irqs(sc);
4777 	else
4778 		mxge_rem_single_irq(sc);
4779 }
4780 
4781 static int
4782 mxge_add_irq(mxge_softc_t *sc)
4783 {
4784 	int err;
4785 
4786 	if (sc->num_slices > 1)
4787 		err = mxge_add_msix_irqs(sc);
4788 	else
4789 		err = mxge_add_single_irq(sc);
4790 
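	/*
	 * XXX: the "0 &&" below deliberately disables this
	 * release-and-retry path; it appears to have been left in
	 * for debugging.
	 */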
4791 	if (0 && err == 0 && sc->num_slices > 1) {
4792 		mxge_rem_msix_irqs(sc);
4793 		err = mxge_add_msix_irqs(sc);
4794 	}
4795 	return err;
4796 }
4797 
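/*
 * Attach: map the board, parse the EEPROM strings, load and reset
 * the firmware, size the slices, then allocate rings and interrupts
 * before attaching the ifnet.  Each failure point unwinds the steps
 * before it through the abort_with_* labels at the bottom.
 */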
4799 static int
4800 mxge_attach(device_t dev)
4801 {
4802 	mxge_cmd_t cmd;
4803 	mxge_softc_t *sc = device_get_softc(dev);
4804 	struct ifnet *ifp;
4805 	int err, rid;
4806 
4807 	sc->dev = dev;
4808 	mxge_fetch_tunables(sc);
4809 
4810 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4811 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4812 				  taskqueue_thread_enqueue, &sc->tq);
4813 	if (sc->tq == NULL) {
4814 		err = ENOMEM;
4815 		goto abort_with_nothing;
4816 	}
4817 
4818 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4819 				 1,			/* alignment */
4820 				 0,			/* boundary */
4821 				 BUS_SPACE_MAXADDR,	/* low */
4822 				 BUS_SPACE_MAXADDR,	/* high */
4823 				 NULL, NULL,		/* filter */
4824 				 65536 + 256,		/* maxsize */
4825 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4826 				 65536,			/* maxsegsize */
4827 				 0,			/* flags */
4828 				 NULL, NULL,		/* lock */
4829 				 &sc->parent_dmat);	/* tag */
4830 
4831 	if (err != 0) {
4832 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4833 			      err);
4834 		goto abort_with_tq;
4835 	}
4836 
4837 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4838 	if (ifp == NULL) {
4839 		device_printf(dev, "can not if_alloc()\n");
4840 		err = ENOSPC;
4841 		goto abort_with_parent_dmat;
4842 	}
4843 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4844 
4845 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4846 		 device_get_nameunit(dev));
4847 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4848 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4849 		 "%s:drv", device_get_nameunit(dev));
4850 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4851 		 MTX_NETWORK_LOCK, MTX_DEF);
4852 
4853 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4854 
4855 	mxge_setup_cfg_space(sc);
4856 
4857 	/* Map the board into the kernel */
4858 	rid = PCIR_BARS;
4859 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4860 					     RF_ACTIVE);
4861 	if (sc->mem_res == NULL) {
4862 		device_printf(dev, "could not map memory\n");
4863 		err = ENXIO;
4864 		goto abort_with_lock;
4865 	}
4866 	sc->sram = rman_get_virtual(sc->mem_res);
4867 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4868 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4869 		device_printf(dev, "impossible memory region size %jd\n",
4870 			      rman_get_size(sc->mem_res));
4871 		err = ENXIO;
4872 		goto abort_with_mem_res;
4873 	}
4874 
4875 	/* make NULL terminated copy of the EEPROM strings section of
4876 	   lanai SRAM */
4877 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4878 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4879 				rman_get_bushandle(sc->mem_res),
4880 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4881 				sc->eeprom_strings,
4882 				MXGE_EEPROM_STRINGS_SIZE - 2);
4883 	err = mxge_parse_strings(sc);
4884 	if (err != 0)
4885 		goto abort_with_mem_res;
4886 
4887 	/* Enable write combining for efficient use of PCIe bus */
4888 	mxge_enable_wc(sc);
4889 
4890 	/* Allocate the out of band dma memory */
4891 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4892 			     sizeof (mxge_cmd_t), 64);
4893 	if (err != 0)
4894 		goto abort_with_mem_res;
4895 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4896 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4897 	if (err != 0)
4898 		goto abort_with_cmd_dma;
4899 
4900 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4901 	if (err != 0)
4902 		goto abort_with_zeropad_dma;
4903 
4904 	/* select & load the firmware */
4905 	err = mxge_select_firmware(sc);
4906 	if (err != 0)
4907 		goto abort_with_dmabench;
4908 	sc->intr_coal_delay = mxge_intr_coal_delay;
4909 
4910 	mxge_slice_probe(sc);
4911 	err = mxge_alloc_slices(sc);
4912 	if (err != 0)
4913 		goto abort_with_dmabench;
4914 
4915 	err = mxge_reset(sc, 0);
4916 	if (err != 0)
4917 		goto abort_with_slices;
4918 
4919 	err = mxge_alloc_rings(sc);
4920 	if (err != 0) {
4921 		device_printf(sc->dev, "failed to allocate rings\n");
4922 		goto abort_with_slices;
4923 	}
4924 
4925 	err = mxge_add_irq(sc);
4926 	if (err != 0) {
4927 		device_printf(sc->dev, "failed to add irq\n");
4928 		goto abort_with_rings;
4929 	}
4930 
4931 	ifp->if_baudrate = IF_Gbps(10);
4932 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4933 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4934 		IFCAP_RXCSUM_IPV6;
4935 #if defined(INET) || defined(INET6)
4936 	ifp->if_capabilities |= IFCAP_LRO;
4937 #endif
4938 
4939 #ifdef MXGE_NEW_VLAN_API
4940 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4941 
4942 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4943 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4944 	    sc->fw_ver_tiny >= 32)
4945 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4946 #endif
4947 	sc->max_mtu = mxge_max_mtu(sc);
4948 	if (sc->max_mtu >= 9000)
4949 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4950 	else
4951 		device_printf(dev, "MTU limited to %d.  Install "
4952 			      "latest firmware for 9000 byte jumbo support\n",
4953 			      sc->max_mtu - ETHER_HDR_LEN);
4954 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4955 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4956 	/* check to see if f/w supports TSO for IPv6 */
4957 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4958 		if (CSUM_TCP_IPV6)
4959 			ifp->if_capabilities |= IFCAP_TSO6;
4960 		sc->max_tso6_hlen = min(cmd.data0,
4961 					sizeof (sc->ss[0].scratch));
4962 	}
4963 	ifp->if_capenable = ifp->if_capabilities;
4964 	if (sc->lro_cnt == 0)
4965 		ifp->if_capenable &= ~IFCAP_LRO;
4966 	ifp->if_init = mxge_init;
4967 	ifp->if_softc = sc;
4968 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4969 	ifp->if_ioctl = mxge_ioctl;
4970 	ifp->if_start = mxge_start;
4971 	ifp->if_get_counter = mxge_get_counter;
4972 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4973 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4974 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4975 	/* Initialise the ifmedia structure */
4976 	ifmedia_init(&sc->media, 0, mxge_media_change,
4977 		     mxge_media_status);
4978 	mxge_media_init(sc);
4979 	mxge_media_probe(sc);
4980 	sc->dying = 0;
4981 	ether_ifattach(ifp, sc->mac_addr);
4982 	/* ether_ifattach sets mtu to ETHERMTU */
4983 	if (mxge_initial_mtu != ETHERMTU)
4984 		mxge_change_mtu(sc, mxge_initial_mtu);
4985 
4986 	mxge_add_sysctls(sc);
4987 #ifdef IFNET_BUF_RING
4988 	ifp->if_transmit = mxge_transmit;
4989 	ifp->if_qflush = mxge_qflush;
4990 #endif
4991 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4992 				device_get_nameunit(sc->dev));
4993 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4994 	return 0;
4995 
4996 abort_with_rings:
4997 	mxge_free_rings(sc);
4998 abort_with_slices:
4999 	mxge_free_slices(sc);
5000 abort_with_dmabench:
5001 	mxge_dma_free(&sc->dmabench_dma);
5002 abort_with_zeropad_dma:
5003 	mxge_dma_free(&sc->zeropad_dma);
5004 abort_with_cmd_dma:
5005 	mxge_dma_free(&sc->cmd_dma);
5006 abort_with_mem_res:
5007 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5008 abort_with_lock:
5009 	pci_disable_busmaster(dev);
5010 	mtx_destroy(&sc->cmd_mtx);
5011 	mtx_destroy(&sc->driver_mtx);
5012 	if_free(ifp);
5013 abort_with_parent_dmat:
5014 	bus_dma_tag_destroy(sc->parent_dmat);
5015 abort_with_tq:
5016 	if (sc->tq != NULL) {
5017 		taskqueue_drain(sc->tq, &sc->watchdog_task);
5018 		taskqueue_free(sc->tq);
5019 		sc->tq = NULL;
5020 	}
5021 abort_with_nothing:
5022 	return err;
5023 }
5024 
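/*
 * Detach tears down in roughly the reverse order of attach; it is
 * refused while vlan interfaces are still stacked on this one.
 */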
5025 static int
5026 mxge_detach(device_t dev)
5027 {
5028 	mxge_softc_t *sc = device_get_softc(dev);
5029 
5030 	if (mxge_vlans_active(sc)) {
5031 		device_printf(sc->dev,
5032 			      "Detach vlans before removing module\n");
5033 		return EBUSY;
5034 	}
5035 	mtx_lock(&sc->driver_mtx);
5036 	sc->dying = 1;
5037 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
5038 		mxge_close(sc, 0);
5039 	mtx_unlock(&sc->driver_mtx);
5040 	ether_ifdetach(sc->ifp);
5041 	if (sc->tq != NULL) {
5042 		taskqueue_drain(sc->tq, &sc->watchdog_task);
5043 		taskqueue_free(sc->tq);
5044 		sc->tq = NULL;
5045 	}
5046 	callout_drain(&sc->co_hdl);
5047 	ifmedia_removeall(&sc->media);
5048 	mxge_dummy_rdma(sc, 0);
5049 	mxge_rem_sysctls(sc);
5050 	mxge_rem_irq(sc);
5051 	mxge_free_rings(sc);
5052 	mxge_free_slices(sc);
5053 	mxge_dma_free(&sc->dmabench_dma);
5054 	mxge_dma_free(&sc->zeropad_dma);
5055 	mxge_dma_free(&sc->cmd_dma);
5056 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5057 	pci_disable_busmaster(dev);
5058 	mtx_destroy(&sc->cmd_mtx);
5059 	mtx_destroy(&sc->driver_mtx);
5060 	if_free(sc->ifp);
5061 	bus_dma_tag_destroy(sc->parent_dmat);
5062 	return 0;
5063 }
5064 
5065 static int
5066 mxge_shutdown(device_t dev)
5067 {
5068 	return 0;
5069 }
5070 
5071 /*
5072   This file uses Myri10GE driver indentation.
5073 
5074   Local Variables:
5075   c-file-style:"linux"
5076   tab-width:8
5077   End:
5078 */
5079