/******************************************************************************
SPDX-License-Identifier: BSD-2-Clause-FreeBSD

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <contrib/zlib/zlib.h>
#include <dev/zlib/zcalloc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

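	/*
	 * Note on the segment settings below: a buffer that is larger
	 * than a page and 4KB-aligned can be described as one unbounded
	 * segment; anything else is constrained to 4KB segments that
	 * never cross a page boundary.
	 */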
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
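
/*
 * For illustration only (the values here are made up): the string block
 * is a sequence of NUL-terminated key=value strings, terminated by an
 * empty string, e.g.
 *
 *	SN=123456\0MAC=00:60:dd:00:00:01\0PC=SAMPLE-CODE\0\0
 *
 * mxge_parse_strings() below walks the block one string at a time.
 */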

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
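
	/*
	 * Worked example with made-up numbers: for len = 4096, a result
	 * of cmd.data0 = 0x00400100 means 0x0040 (64) transfers finished
	 * in 0x0100 (256) ticks of 0.5us, i.e. 64 * 4096 bytes in 128us.
	 * The expression below, (64 * 4096 * 2) / 256, then yields
	 * 2048 MB/s; the factor of 2 converts 0.5us ticks to us.
	 */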

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
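
/*
 * In short, firmware selection below converges on one of two setups:
 *
 *	aligned completions:	fw_name = "mxge_eth_z8e",  tx_boundary = 4096
 *	unaligned completions:	fw_name = "mxge_ethp_z8e", tx_boundary = 2048
 */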

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = zcalloc_nowait;
	zs.zfree = zcfree;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		(void)*sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

struct mxge_add_maddr_ctx {
	mxge_softc_t *sc;
	int error;
};

static u_int
mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
{
	struct mxge_add_maddr_ctx *ctx = arg;
	mxge_cmd_t cmd;

	if (ctx->error != 0)
		return (0);
	bcopy(LLADDR(sdl), &cmd.data0, 4);
	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
	cmd.data0 = htonl(cmd.data0);
	cmd.data1 = htonl(cmd.data1);

	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);

	return (1);
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	struct mxge_add_maddr_ctx ctx;
	struct ifnet *ifp = sc->ifp;
	mxge_cmd_t cmd;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */
	ctx.sc = sc;
	ctx.error = 0;
	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
	if (ctx.error != 0) {
		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
		    "error status: %d\n", ctx.error);
		/* abort, leaving multicast filtering off */
		return;
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */
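	/*
	 * The resulting command order used by this function is:
	 * MXGEFW_CMD_RESET, MXGEFW_CMD_SET_INTRQ_SIZE, then (when
	 * num_slices > 1) MXGEFW_CMD_GET_MAX_RSS_QUEUES and
	 * MXGEFW_CMD_ENABLE_RSS_QUEUES, and only then
	 * MXGEFW_CMD_SET_INTRQ_DMA for each slice.
	 */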

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    sc, 0, mxge_change_intr_coal, "I",
	    "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    mxge_change_throttle, "I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "flow_control_enabled",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    mxge_change_flow_control, "I",
	    "flow control (pause frames) enabled");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
	    "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
	    "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_link_error_or_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
	    "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_link_overflow",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
	    "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_multicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
	    "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_no_big_buffer",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
	    "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_no_small_buffer",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
	    "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_overrun",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
	    "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_unicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
	    "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
			       0, "number of bad csums preventing LRO");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
1626 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1627 			       "tx_stall",
1628 			       CTLFLAG_RD, &ss->tx.stall,
1629 			       0, "tx_stall");
1630 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1631 			       "tx_wake",
1632 			       CTLFLAG_RD, &ss->tx.wake,
1633 			       0, "tx_wake");
1634 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 			       "tx_defrag",
1636 			       CTLFLAG_RD, &ss->tx.defrag,
1637 			       0, "tx_defrag");
1638 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1639 			       "tx_queue_active",
1640 			       CTLFLAG_RD, &ss->tx.queue_active,
1641 			       0, "tx_queue_active");
1642 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1643 			       "tx_activate",
1644 			       CTLFLAG_RD, &ss->tx.activate,
1645 			       0, "tx_activate");
1646 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1647 			       "tx_deactivate",
1648 			       CTLFLAG_RD, &ss->tx.deactivate,
1649 			       0, "tx_deactivate");
1650 	}
1651 }
1652 
1653 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1654    backwards one at a time and handle ring wraps */
1655 
1656 static inline void
1657 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1658 			    mcp_kreq_ether_send_t *src, int cnt)
1659 {
1660 	int idx, starting_slot;
1661 	starting_slot = tx->req;
1662 	while (cnt > 1) {
1663 		cnt--;
1664 		idx = (starting_slot + cnt) & tx->mask;
1665 		mxge_pio_copy(&tx->lanai[idx],
1666 			      &src[cnt], sizeof(*src));
1667 		wmb();
1668 	}
1669 }
1670 
1671 /*
1672  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1673  * at most 32 bytes at a time, so as to avoid involving the software
1674  * pio handler in the nic.   We re-write the first segment's flags
1675  * to mark them valid only after writing the entire chain
1676  */
1677 
1678 static inline void
1679 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1680 		  int cnt)
1681 {
1682 	int idx, i;
1683 	uint32_t *src_ints;
1684 	volatile uint32_t *dst_ints;
1685 	mcp_kreq_ether_send_t *srcp;
1686 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1687 	uint8_t last_flags;
1688 
1689 	idx = tx->req & tx->mask;
1690 
1691 	last_flags = src->flags;
1692 	src->flags = 0;
1693 	wmb();
1694 	dst = dstp = &tx->lanai[idx];
1695 	srcp = src;
1696 
1697 	if ((idx + cnt) < tx->mask) {
1698 		for (i = 0; i < (cnt - 1); i += 2) {
1699 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1700 			wmb(); /* force write every 32 bytes */
1701 			srcp += 2;
1702 			dstp += 2;
1703 		}
1704 	} else {
1705 		/* submit all but the first request, and ensure
1706 		   that it is submitted below */
1707 		mxge_submit_req_backwards(tx, src, cnt);
1708 		i = 0;
1709 	}
1710 	if (i < cnt) {
1711 		/* submit the first request */
1712 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1713 		wmb(); /* barrier before setting valid flag */
1714 	}
1715 
1716 	/* re-write the last 32-bits with the valid flags */
1717 	src->flags = last_flags;
1718 	src_ints = (uint32_t *)src;
1719 	src_ints+=3;
1720 	dst_ints = (volatile uint32_t *)dst;
1721 	dst_ints+=3;
1722 	*dst_ints =  *src_ints;
1723 	tx->req += cnt;
1724 	wmb();
1725 }
1726 
1727 static int
1728 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1729     struct mxge_pkt_info *pi)
1730 {
1731 	struct ether_vlan_header *eh;
1732 	uint16_t etype;
1733 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1734 #if IFCAP_TSO6 && defined(INET6)
1735 	int nxt;
1736 #endif
1737 
1738 	eh = mtod(m, struct ether_vlan_header *);
1739 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1740 		etype = ntohs(eh->evl_proto);
1741 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1742 	} else {
1743 		etype = ntohs(eh->evl_encap_proto);
1744 		pi->ip_off = ETHER_HDR_LEN;
1745 	}
1746 
1747 	switch (etype) {
1748 	case ETHERTYPE_IP:
1749 		/*
1750 		 * ensure ip header is in first mbuf, copy it to a
1751 		 * scratch buffer if not
1752 		 */
1753 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1754 		pi->ip6 = NULL;
1755 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1756 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1757 			    ss->scratch);
1758 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1759 		}
1760 		pi->ip_hlen = pi->ip->ip_hl << 2;
1761 		if (!tso)
1762 			return 0;
1763 
1764 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1765 		    sizeof(struct tcphdr))) {
1766 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1767 			    sizeof(struct tcphdr), ss->scratch);
1768 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1769 		}
1770 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1771 		break;
1772 #if IFCAP_TSO6 && defined(INET6)
1773 	case ETHERTYPE_IPV6:
1774 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1775 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1776 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1777 			    ss->scratch);
1778 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1779 		}
1780 		nxt = 0;
1781 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1782 		pi->ip_hlen -= pi->ip_off;
1783 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1784 			return EINVAL;
1785 
1786 		if (!tso)
1787 			return 0;
1788 
1789 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1790 			return EINVAL;
1791 
1792 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1793 		    sizeof(struct tcphdr))) {
1794 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1795 			    sizeof(struct tcphdr), ss->scratch);
1796 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1797 		}
1798 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1799 		break;
1800 #endif
1801 	default:
1802 		return EINVAL;
1803 	}
1804 	return 0;
1805 }
1806 
1807 #if IFCAP_TSO4
1808 
1809 static void
1810 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1811 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1812 {
1813 	mxge_tx_ring_t *tx;
1814 	mcp_kreq_ether_send_t *req;
1815 	bus_dma_segment_t *seg;
1816 	uint32_t low, high_swapped;
1817 	int len, seglen, cum_len, cum_len_next;
1818 	int next_is_first, chop, cnt, rdma_count, small;
1819 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1820 	uint8_t flags, flags_next;
1821 	static int once;
1822 
1823 	mss = m->m_pkthdr.tso_segsz;
1824 
1825 	/* negative cum_len signifies to the
1826 	 * send loop that we are still in the
1827 	 * header portion of the TSO packet.
1828 	 */
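	/* e.g. for a plain TCP/IPv4 TSO packet with no options, the
	 * headers span 14 + 20 + 20 bytes, so cum_len starts at -54
	 * and crosses zero exactly where the TSO payload begins */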
1829 
1830 	cksum_offset = pi->ip_off + pi->ip_hlen;
1831 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1832 
1833 	/* TSO implies checksum offload on this hardware */
1834 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1835 		/*
1836 		 * If packet has full TCP csum, replace it with pseudo hdr
1837 		 * sum that the NIC expects, otherwise the NIC will emit
1838 		 * packets with bad TCP checksums.
1839 		 */
1840 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1841 		if (pi->ip6) {
1842 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1843 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1844 			sum = in6_cksum_pseudo(pi->ip6,
1845 			    m->m_pkthdr.len - cksum_offset,
1846 			    IPPROTO_TCP, 0);
1847 #endif
1848 		} else {
1849 #ifdef INET
1850 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1851 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1852 			    pi->ip->ip_dst.s_addr,
1853 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1854 				    cksum_offset)));
1855 #endif
1856 		}
1857 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1858 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1859 	}
1860 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1861 
1862 	/* for TSO, pseudo_hdr_offset holds mss.
1863 	 * The firmware figures out where to put
1864 	 * the checksum by parsing the header. */
1865 	pseudo_hdr_offset = htobe16(mss);
1866 
1867 	if (pi->ip6) {
1868 		/*
1869 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1870 		 * to store the TCP header len
1871 		 */
1872 		cksum_offset = (pi->tcp->th_off << 2);
1873 	}
1874 
1875 	tx = &ss->tx;
1876 	req = tx->req_list;
1877 	seg = tx->seg_list;
1878 	cnt = 0;
1879 	rdma_count = 0;
1880 	/* "rdma_count" is the number of RDMAs belonging to the
1881 	 * current packet BEFORE the current send request. For
1882 	 * non-TSO packets, this is equal to "count".
1883 	 * For TSO packets, rdma_count needs to be reset
1884 	 * to 0 after a segment cut.
1885 	 *
1886 	 * The rdma_count field of the send request is
1887 	 * the number of RDMAs of the packet starting at
1888 	 * that request. For TSO send requests with one or more cuts
1889 	 * in the middle, this is the number of RDMAs starting
1890 	 * after the last cut in the request. All previous
1891 	 * segments before the last cut implicitly have 1 RDMA.
1892 	 *
1893 	 * Since the number of RDMAs is not known beforehand,
1894 	 * it must be filled-in retroactively - after each
1895 	 * segmentation cut or at the end of the entire packet.
1896 	 */
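	/* Rough sketch: each pass of the loop below backfills
	 * (req - rdma_count)->rdma_count, i.e. the descriptor that
	 * starts the current run (the header, or the first descriptor
	 * after the most recent cut), so the count is correct once the
	 * run is closed by a cut or by the end of the packet. */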
1897 
1898 	while (busdma_seg_cnt) {
1899 		/* Break the busdma segment up into pieces */
1900 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1901 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1902 		len = seg->ds_len;
1903 
1904 		while (len) {
1905 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1906 			seglen = len;
1907 			cum_len_next = cum_len + seglen;
1908 			(req-rdma_count)->rdma_count = rdma_count + 1;
1909 			if (__predict_true(cum_len >= 0)) {
1910 				/* payload */
1911 				chop = (cum_len_next > mss);
1912 				cum_len_next = cum_len_next % mss;
1913 				next_is_first = (cum_len_next == 0);
1914 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1915 				flags_next |= next_is_first *
1916 					MXGEFW_FLAGS_FIRST;
1917 				rdma_count |= -(chop | next_is_first);
1918 				rdma_count += chop & !next_is_first;
1919 			} else if (cum_len_next >= 0) {
1920 				/* header ends */
1921 				rdma_count = -1;
1922 				cum_len_next = 0;
1923 				seglen = -cum_len;
1924 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1925 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1926 					MXGEFW_FLAGS_FIRST |
1927 					(small * MXGEFW_FLAGS_SMALL);
1928 			}
1929 
1930 			req->addr_high = high_swapped;
1931 			req->addr_low = htobe32(low);
1932 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1933 			req->pad = 0;
1934 			req->rdma_count = 1;
1935 			req->length = htobe16(seglen);
1936 			req->cksum_offset = cksum_offset;
1937 			req->flags = flags | ((cum_len & 1) *
1938 					      MXGEFW_FLAGS_ALIGN_ODD);
1939 			low += seglen;
1940 			len -= seglen;
1941 			cum_len = cum_len_next;
1942 			flags = flags_next;
1943 			req++;
1944 			cnt++;
1945 			rdma_count++;
1946 			if (cksum_offset != 0 && !pi->ip6) {
1947 				if (__predict_false(cksum_offset > seglen))
1948 					cksum_offset -= seglen;
1949 				else
1950 					cksum_offset = 0;
1951 			}
1952 			if (__predict_false(cnt > tx->max_desc))
1953 				goto drop;
1954 		}
1955 		busdma_seg_cnt--;
1956 		seg++;
1957 	}
1958 	(req-rdma_count)->rdma_count = rdma_count;
1959 
1960 	do {
1961 		req--;
1962 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1963 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1964 
1965 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1966 	mxge_submit_req(tx, tx->req_list, cnt);
1967 #ifdef IFNET_BUF_RING
1968 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1969 		/* tell the NIC to start polling this slice */
1970 		*tx->send_go = 1;
1971 		tx->queue_active = 1;
1972 		tx->activate++;
1973 		wmb();
1974 	}
1975 #endif
1976 	return;
1977 
1978 drop:
1979 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1980 	m_freem(m);
1981 	ss->oerrors++;
1982 	if (!once) {
1983 		printf("tx->max_desc exceeded via TSO!\n");
1984 		printf("mss = %d, %ld, %d!\n", mss,
1985 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1986 		once = 1;
1987 	}
1988 	return;
1990 }
1991 
1992 #endif /* IFCAP_TSO4 */
1993 
1994 #ifdef MXGE_NEW_VLAN_API
1995 /*
1996  * We reproduce the software vlan tag insertion from
1997  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1998  * vlan tag insertion. We need to advertise this in order to have the
1999  * vlan interface respect our csum offload flags.
2000  */
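/*
 * The copy below turns [dst(6)][src(6)][type(2)] into
 * [dst(6)][src(6)][0x8100(2)][tag(2)][type(2)]: M_PREPEND makes room
 * for the 4 encapsulation bytes, bcopy slides the MAC addresses to
 * the new front, and the original type field stays in place.
 */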
2001 static struct mbuf *
2002 mxge_vlan_tag_insert(struct mbuf *m)
2003 {
2004 	struct ether_vlan_header *evl;
2005 
2006 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2007 	if (__predict_false(m == NULL))
2008 		return NULL;
2009 	if (m->m_len < sizeof(*evl)) {
2010 		m = m_pullup(m, sizeof(*evl));
2011 		if (__predict_false(m == NULL))
2012 			return NULL;
2013 	}
2014 	/*
2015 	 * Transform the Ethernet header into an Ethernet header
2016 	 * with 802.1Q encapsulation.
2017 	 */
2018 	evl = mtod(m, struct ether_vlan_header *);
2019 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2020 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2021 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2022 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2023 	m->m_flags &= ~M_VLANTAG;
2024 	return m;
2025 }
2026 #endif /* MXGE_NEW_VLAN_API */
2027 
2028 static void
2029 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2030 {
2031 	struct mxge_pkt_info pi = {0,0,0,0};
2032 	mxge_softc_t *sc;
2033 	mcp_kreq_ether_send_t *req;
2034 	bus_dma_segment_t *seg;
2035 	struct mbuf *m_tmp;
2036 	mxge_tx_ring_t *tx;
2037 	int cnt, cum_len, err, i, idx, odd_flag;
2038 	uint16_t pseudo_hdr_offset;
2039 	uint8_t flags, cksum_offset;
2040 
2041 	sc = ss->sc;
2042 	tx = &ss->tx;
2043 
2044 #ifdef MXGE_NEW_VLAN_API
2045 	if (m->m_flags & M_VLANTAG) {
2046 		m = mxge_vlan_tag_insert(m);
2047 		if (__predict_false(m == NULL))
2048 			goto drop_without_m;
2049 	}
2050 #endif
2051 	if (m->m_pkthdr.csum_flags &
2052 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2053 		if (mxge_parse_tx(ss, m, &pi))
2054 			goto drop;
2055 	}
2056 
2057 	/* (try to) map the frame for DMA */
2058 	idx = tx->req & tx->mask;
2059 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2060 				      m, tx->seg_list, &cnt,
2061 				      BUS_DMA_NOWAIT);
2062 	if (__predict_false(err == EFBIG)) {
2063 		/* Too many segments in the chain.  Try
2064 		   to defrag */
2065 		m_tmp = m_defrag(m, M_NOWAIT);
2066 		if (m_tmp == NULL) {
2067 			goto drop;
2068 		}
2069 		ss->tx.defrag++;
2070 		m = m_tmp;
2071 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2072 					      tx->info[idx].map,
2073 					      m, tx->seg_list, &cnt,
2074 					      BUS_DMA_NOWAIT);
2075 	}
2076 	if (__predict_false(err != 0)) {
2077 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2078 			      " packet len = %d\n", err, m->m_pkthdr.len);
2079 		goto drop;
2080 	}
2081 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2082 			BUS_DMASYNC_PREWRITE);
2083 	tx->info[idx].m = m;
2084 
2085 #if IFCAP_TSO4
2086 	/* TSO is different enough, we handle it in another routine */
2087 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2088 		mxge_encap_tso(ss, m, cnt, &pi);
2089 		return;
2090 	}
2091 #endif
2092 
2093 	req = tx->req_list;
2094 	cksum_offset = 0;
2095 	pseudo_hdr_offset = 0;
2096 	flags = MXGEFW_FLAGS_NO_TSO;
2097 
2098 	/* checksum offloading? */
2099 	if (m->m_pkthdr.csum_flags &
2100 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2101 		/* tell the firmware where the checksummed region
2102 		   begins and where to store the computed checksum */
2103 		cksum_offset = pi.ip_off + pi.ip_hlen;
2104 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2105 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2106 		req->cksum_offset = cksum_offset;
2107 		flags |= MXGEFW_FLAGS_CKSUM;
2108 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2109 	} else {
2110 		odd_flag = 0;
2111 	}
2112 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2113 		flags |= MXGEFW_FLAGS_SMALL;
2114 
2115 	/* convert segments into a request list */
2116 	cum_len = 0;
2117 	seg = tx->seg_list;
2118 	req->flags = MXGEFW_FLAGS_FIRST;
2119 	for (i = 0; i < cnt; i++) {
2120 		req->addr_low =
2121 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2122 		req->addr_high =
2123 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2124 		req->length = htobe16(seg->ds_len);
2125 		req->cksum_offset = cksum_offset;
2126 		if (cksum_offset > seg->ds_len)
2127 			cksum_offset -= seg->ds_len;
2128 		else
2129 			cksum_offset = 0;
2130 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2131 		req->pad = 0; /* complete solid 16-byte block */
2132 		req->rdma_count = 1;
2133 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2134 		cum_len += seg->ds_len;
2135 		seg++;
2136 		req++;
2137 		req->flags = 0;
2138 	}
2139 	req--;
2140 	/* pad runts to 60 bytes */
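	/* (60 is ETHER_MIN_LEN minus the 4-byte CRC, which the
	 * hardware appends) */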
2141 	if (cum_len < 60) {
2142 		req++;
2143 		req->addr_low =
2144 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2145 		req->addr_high =
2146 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2147 		req->length = htobe16(60 - cum_len);
2148 		req->cksum_offset = 0;
2149 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2150 		req->pad = 0; /* complete solid 16-byte block */
2151 		req->rdma_count = 1;
2152 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2153 		cnt++;
2154 	}
2155 
2156 	tx->req_list[0].rdma_count = cnt;
2157 #if 0
2158 	/* print what the firmware will see */
2159 	for (i = 0; i < cnt; i++) {
2160 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2161 		    "cso:%d, flags:0x%x, rdma:%d\n",
2162 		    i, (int)ntohl(tx->req_list[i].addr_high),
2163 		    (int)ntohl(tx->req_list[i].addr_low),
2164 		    (int)ntohs(tx->req_list[i].length),
2165 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2166 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2167 		    tx->req_list[i].rdma_count);
2168 	}
2169 	printf("--------------\n");
2170 #endif
2171 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2172 	mxge_submit_req(tx, tx->req_list, cnt);
2173 #ifdef IFNET_BUF_RING
2174 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2175 		/* tell the NIC to start polling this slice */
2176 		*tx->send_go = 1;
2177 		tx->queue_active = 1;
2178 		tx->activate++;
2179 		wmb();
2180 	}
2181 #endif
2182 	return;
2183 
2184 drop:
2185 	m_freem(m);
2186 drop_without_m:
2187 	ss->oerrors++;
2188 	return;
2189 }
2190 
2191 #ifdef IFNET_BUF_RING
2192 static void
2193 mxge_qflush(struct ifnet *ifp)
2194 {
2195 	mxge_softc_t *sc = ifp->if_softc;
2196 	mxge_tx_ring_t *tx;
2197 	struct mbuf *m;
2198 	int slice;
2199 
2200 	for (slice = 0; slice < sc->num_slices; slice++) {
2201 		tx = &sc->ss[slice].tx;
2202 		mtx_lock(&tx->mtx);
2203 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2204 			m_freem(m);
2205 		mtx_unlock(&tx->mtx);
2206 	}
2207 	if_qflush(ifp);
2208 }
2209 
2210 static inline void
2211 mxge_start_locked(struct mxge_slice_state *ss)
2212 {
2213 	mxge_softc_t *sc;
2214 	struct mbuf *m;
2215 	struct ifnet *ifp;
2216 	mxge_tx_ring_t *tx;
2217 
2218 	sc = ss->sc;
2219 	ifp = sc->ifp;
2220 	tx = &ss->tx;
2221 
2222 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2223 		m = drbr_dequeue(ifp, tx->br);
2224 		if (m == NULL) {
2225 			return;
2226 		}
2227 		/* let BPF see it */
2228 		BPF_MTAP(ifp, m);
2229 
2230 		/* give it to the nic */
2231 		mxge_encap(ss, m);
2232 	}
2233 	/* ran out of transmit slots */
2234 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2235 	    && (!drbr_empty(ifp, tx->br))) {
2236 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2237 		tx->stall++;
2238 	}
2239 }
2240 
2241 static int
2242 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2243 {
2244 	mxge_softc_t *sc;
2245 	struct ifnet *ifp;
2246 	mxge_tx_ring_t *tx;
2247 	int err;
2248 
2249 	sc = ss->sc;
2250 	ifp = sc->ifp;
2251 	tx = &ss->tx;
2252 
2253 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2254 	    IFF_DRV_RUNNING) {
2255 		err = drbr_enqueue(ifp, tx->br, m);
2256 		return (err);
2257 	}
2258 
2259 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2260 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2261 		/* let BPF see it */
2262 		BPF_MTAP(ifp, m);
2263 		/* give it to the nic */
2264 		mxge_encap(ss, m);
2265 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2266 		return (err);
2267 	}
2268 	if (!drbr_empty(ifp, tx->br))
2269 		mxge_start_locked(ss);
2270 	return (0);
2271 }
2272 
2273 static int
2274 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2275 {
2276 	mxge_softc_t *sc = ifp->if_softc;
2277 	struct mxge_slice_state *ss;
2278 	mxge_tx_ring_t *tx;
2279 	int err = 0;
2280 	int slice;
2281 
2282 	slice = m->m_pkthdr.flowid;
2283 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2284 
2285 	ss = &sc->ss[slice];
2286 	tx = &ss->tx;
2287 
2288 	if (mtx_trylock(&tx->mtx)) {
2289 		err = mxge_transmit_locked(ss, m);
2290 		mtx_unlock(&tx->mtx);
2291 	} else {
2292 		err = drbr_enqueue(ifp, tx->br, m);
2293 	}
2294 
2295 	return (err);
2296 }
2297 
2298 #else
2299 
2300 static inline void
2301 mxge_start_locked(struct mxge_slice_state *ss)
2302 {
2303 	mxge_softc_t *sc;
2304 	struct mbuf *m;
2305 	struct ifnet *ifp;
2306 	mxge_tx_ring_t *tx;
2307 
2308 	sc = ss->sc;
2309 	ifp = sc->ifp;
2310 	tx = &ss->tx;
2311 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2312 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2313 		if (m == NULL) {
2314 			return;
2315 		}
2316 		/* let BPF see it */
2317 		BPF_MTAP(ifp, m);
2318 
2319 		/* give it to the nic */
2320 		mxge_encap(ss, m);
2321 	}
2322 	/* ran out of transmit slots */
2323 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2324 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2325 		tx->stall++;
2326 	}
2327 }
2328 #endif
2329 static void
2330 mxge_start(struct ifnet *ifp)
2331 {
2332 	mxge_softc_t *sc = ifp->if_softc;
2333 	struct mxge_slice_state *ss;
2334 
2335 	/* only use the first slice for now */
2336 	ss = &sc->ss[0];
2337 	mtx_lock(&ss->tx.mtx);
2338 	mxge_start_locked(ss);
2339 	mtx_unlock(&ss->tx.mtx);
2340 }
2341 
2342 /*
2343  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2344  * at most 32 bytes at a time, so as to avoid involving the software
2345  * pio handler in the nic.  We re-write the first segment's low
2346  * DMA address to mark it valid only after we write the entire chunk
2347  * in a burst
2348  */
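/*
 * Note the trick below: the first entry's addr_low is temporarily set
 * to 0xffffffff so the NIC ignores the chunk until the final store of
 * the real address marks all 8 entries valid at once.
 */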
2349 static inline void
2350 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2351 		mcp_kreq_ether_recv_t *src)
2352 {
2353 	uint32_t low;
2354 
2355 	low = src->addr_low;
2356 	src->addr_low = 0xffffffff;
2357 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2358 	wmb();
2359 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2360 	wmb();
2361 	src->addr_low = low;
2362 	dst->addr_low = low;
2363 	wmb();
2364 }
2365 
2366 static int
2367 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2368 {
2369 	bus_dma_segment_t seg;
2370 	struct mbuf *m;
2371 	mxge_rx_ring_t *rx = &ss->rx_small;
2372 	int cnt, err;
2373 
2374 	m = m_gethdr(M_NOWAIT, MT_DATA);
2375 	if (m == NULL) {
2376 		rx->alloc_fail++;
2377 		err = ENOBUFS;
2378 		goto done;
2379 	}
2380 	m->m_len = MHLEN;
2381 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2382 				      &seg, &cnt, BUS_DMA_NOWAIT);
2383 	if (err != 0) {
2384 		m_free(m);
2385 		goto done;
2386 	}
2387 	rx->info[idx].m = m;
2388 	rx->shadow[idx].addr_low =
2389 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2390 	rx->shadow[idx].addr_high =
2391 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2392 
2393 done:
2394 	if ((idx & 7) == 7)
2395 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2396 	return err;
2397 }
2398 
2399 static int
2400 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2401 {
2402 	bus_dma_segment_t seg[3];
2403 	struct mbuf *m;
2404 	mxge_rx_ring_t *rx = &ss->rx_big;
2405 	int cnt, err, i;
2406 
2407 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2408 	if (m == NULL) {
2409 		rx->alloc_fail++;
2410 		err = ENOBUFS;
2411 		goto done;
2412 	}
2413 	m->m_len = rx->mlen;
2414 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2415 				      seg, &cnt, BUS_DMA_NOWAIT);
2416 	if (err != 0) {
2417 		m_free(m);
2418 		goto done;
2419 	}
2420 	rx->info[idx].m = m;
2421 	rx->shadow[idx].addr_low =
2422 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2423 	rx->shadow[idx].addr_high =
2424 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2425 
2426 #if MXGE_VIRT_JUMBOS
2427 	for (i = 1; i < cnt; i++) {
2428 		rx->shadow[idx + i].addr_low =
2429 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2430 		rx->shadow[idx + i].addr_high =
2431 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2432 	}
2433 #endif
2434 
2435 done:
2436 	for (i = 0; i < rx->nbufs; i++) {
2437 		if ((idx & 7) == 7) {
2438 			mxge_submit_8rx(&rx->lanai[idx - 7],
2439 					&rx->shadow[idx - 7]);
2440 		}
2441 		idx++;
2442 	}
2443 	return err;
2444 }
2445 
2446 #ifdef INET6
2447 
2448 static uint16_t
2449 mxge_csum_generic(uint16_t *raw, int len)
2450 {
2451 	uint32_t csum;
2452 
2453 	csum = 0;
2454 	while (len > 0) {
2455 		csum += *raw;
2456 		raw++;
2457 		len -= 2;
2458 	}
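	/* fold the 32-bit accumulator down to 16 bits; folding twice
	 * absorbs any carry produced by the first fold */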
2459 	csum = (csum >> 16) + (csum & 0xffff);
2460 	csum = (csum >> 16) + (csum & 0xffff);
2461 	return (uint16_t)csum;
2462 }
2463 
2464 static inline uint16_t
2465 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2466 {
2467 	uint32_t partial;
2468 	int nxt, cksum_offset;
2469 	struct ip6_hdr *ip6 = p;
2470 	uint16_t c;
2471 
2472 	nxt = ip6->ip6_nxt;
2473 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2474 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2475 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2476 					   IPPROTO_IPV6, &nxt);
2477 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2478 			return (1);
2479 	}
2480 
2481 	/*
2482 	 * IPv6 headers do not contain a checksum, and hence
2483 	 * do not checksum to zero, so they don't "fall out"
2484 	 * of the partial checksum calculation like IPv4
2485 	 * headers do.  We need to fix the partial checksum by
2486 	 * subtracting the checksum of the IPv6 header.
2487 	 */
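	/* adding ~partial (plus the end-around carry) below is the
	 * one's complement way of subtracting the header sum from the
	 * firmware's whole-frame checksum */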
2488 
2489 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2490 				    ETHER_HDR_LEN);
2491 	csum += ~partial;
2492 	csum += (csum < ~partial);
2493 	csum = (csum >> 16) + (csum & 0xFFFF);
2494 	csum = (csum >> 16) + (csum & 0xFFFF);
2495 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2496 			     csum);
2497 	c ^= 0xffff;
2498 	return (c);
2499 }
2500 #endif /* INET6 */
2501 /*
2502  *  Myri10GE hardware checksums are not valid if the sender
2503  *  padded the frame with non-zero padding.  This is because
2504  *  the firmware just does a simple 16-bit 1s complement
2505  *  checksum across the entire frame, excluding the first 14
2506 	 *  bytes.  It is best to simply check the checksum and
2507  *  tell the stack about it only if the checksum is good
2508  */
2509 
2510 static inline uint16_t
2511 mxge_rx_csum(struct mbuf *m, int csum)
2512 {
2513 	struct ether_header *eh;
2514 #ifdef INET
2515 	struct ip *ip;
2516 #endif
2517 #if defined(INET) || defined(INET6)
2518 	int cap = m->m_pkthdr.rcvif->if_capenable;
2519 #endif
2520 	uint16_t c, etype;
2521 
2522 	eh = mtod(m, struct ether_header *);
2523 	etype = ntohs(eh->ether_type);
2524 	switch (etype) {
2525 #ifdef INET
2526 	case ETHERTYPE_IP:
2527 		if ((cap & IFCAP_RXCSUM) == 0)
2528 			return (1);
2529 		ip = (struct ip *)(eh + 1);
2530 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2531 			return (1);
2532 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2533 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2534 				    (ip->ip_hl << 2) + ip->ip_p));
2535 		c ^= 0xffff;
2536 		break;
2537 #endif
2538 #ifdef INET6
2539 	case ETHERTYPE_IPV6:
2540 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2541 			return (1);
2542 		c = mxge_rx_csum6((eh + 1), m, csum);
2543 		break;
2544 #endif
2545 	default:
2546 		c = 1;
2547 	}
2548 	return (c);
2549 }
2550 
2551 static void
2552 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2553 {
2554 	struct ether_vlan_header *evl;
2555 	uint32_t partial;
2556 
2557 	evl = mtod(m, struct ether_vlan_header *);
2558 
2559 	/*
2560 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2561 	 * after what the firmware thought was the end of the ethernet
2562 	 * header.
2563 	 */
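	/* as in the IPv6 case, adding the complement of the 4
	 * encapsulation bytes performs a one's complement subtraction */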
2564 
2565 	/* put checksum into host byte order */
2566 	*csum = ntohs(*csum);
2567 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2568 	(*csum) += ~partial;
2569 	(*csum) +=  ((*csum) < ~partial);
2570 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2571 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2572 
2573 	/* restore checksum to network byte order;
2574 	   later consumers expect this */
2575 	*csum = htons(*csum);
2576 
2577 	/* save the tag */
2578 #ifdef MXGE_NEW_VLAN_API
2579 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2580 #else
2581 	{
2582 		struct m_tag *mtag;
2583 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2584 				   M_NOWAIT);
2585 		if (mtag == NULL)
2586 			return;
2587 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2588 		m_tag_prepend(m, mtag);
2589 	}
2590 
2591 #endif
2592 	m->m_flags |= M_VLANTAG;
2593 
2594 	/*
2595 	 * Remove the 802.1q header by copying the Ethernet
2596 	 * addresses over it and adjusting the beginning of
2597 	 * the data in the mbuf.  The encapsulated Ethernet
2598 	 * type field is already in place.
2599 	 */
2600 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2601 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2602 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2603 }
2604 
2605 static inline void
2606 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2607 		 uint32_t csum, int lro)
2608 {
2609 	mxge_softc_t *sc;
2610 	struct ifnet *ifp;
2611 	struct mbuf *m;
2612 	struct ether_header *eh;
2613 	mxge_rx_ring_t *rx;
2614 	bus_dmamap_t old_map;
2615 	int idx;
2616 
2617 	sc = ss->sc;
2618 	ifp = sc->ifp;
2619 	rx = &ss->rx_big;
2620 	idx = rx->cnt & rx->mask;
2621 	rx->cnt += rx->nbufs;
2622 	/* save a pointer to the received mbuf */
2623 	m = rx->info[idx].m;
2624 	/* try to replace the received mbuf */
2625 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2626 		/* drop the frame -- the old mbuf is re-cycled */
2627 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2628 		return;
2629 	}
2630 
2631 	/* unmap the received buffer */
2632 	old_map = rx->info[idx].map;
2633 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2634 	bus_dmamap_unload(rx->dmat, old_map);
2635 
2636 	/* swap the bus_dmamap_t's */
2637 	rx->info[idx].map = rx->extra_map;
2638 	rx->extra_map = old_map;
2639 
2640 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2641 	 * aligned */
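	/* (with the 2-byte pad, the 14-byte Ethernet header ends on a
	 * 4-byte boundary, leaving the IP header 32-bit aligned) */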
2642 	m->m_data += MXGEFW_PAD;
2643 
2644 	m->m_pkthdr.rcvif = ifp;
2645 	m->m_len = m->m_pkthdr.len = len;
2646 	ss->ipackets++;
2647 	eh = mtod(m, struct ether_header *);
2648 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2649 		mxge_vlan_tag_remove(m, &csum);
2650 	}
2651 	/* flowid only valid if RSS hashing is enabled */
2652 	if (sc->num_slices > 1) {
2653 		m->m_pkthdr.flowid = (ss - sc->ss);
2654 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2655 	}
2656 	/* if the checksum is valid, mark it in the mbuf header */
2657 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2658 	    (0 == mxge_rx_csum(m, csum))) {
2659 		/* Tell the stack that the checksum is good */
2660 		m->m_pkthdr.csum_data = 0xffff;
2661 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2662 			CSUM_DATA_VALID;
2663 
2664 #if defined(INET) || defined (INET6)
2665 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2666 			return;
2667 #endif
2668 	}
2669 	/* pass the frame up the stack */
2670 	(*ifp->if_input)(ifp, m);
2671 }
2672 
2673 static inline void
2674 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2675 		   uint32_t csum, int lro)
2676 {
2677 	mxge_softc_t *sc;
2678 	struct ifnet *ifp;
2679 	struct ether_header *eh;
2680 	struct mbuf *m;
2681 	mxge_rx_ring_t *rx;
2682 	bus_dmamap_t old_map;
2683 	int idx;
2684 
2685 	sc = ss->sc;
2686 	ifp = sc->ifp;
2687 	rx = &ss->rx_small;
2688 	idx = rx->cnt & rx->mask;
2689 	rx->cnt++;
2690 	/* save a pointer to the received mbuf */
2691 	m = rx->info[idx].m;
2692 	/* try to replace the received mbuf */
2693 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2694 		/* drop the frame -- the old mbuf is re-cycled */
2695 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2696 		return;
2697 	}
2698 
2699 	/* unmap the received buffer */
2700 	old_map = rx->info[idx].map;
2701 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2702 	bus_dmamap_unload(rx->dmat, old_map);
2703 
2704 	/* swap the bus_dmamap_t's */
2705 	rx->info[idx].map = rx->extra_map;
2706 	rx->extra_map = old_map;
2707 
2708 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2709 	 * aligned */
2710 	m->m_data += MXGEFW_PAD;
2711 
2712 	m->m_pkthdr.rcvif = ifp;
2713 	m->m_len = m->m_pkthdr.len = len;
2714 	ss->ipackets++;
2715 	eh = mtod(m, struct ether_header *);
2716 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2717 		mxge_vlan_tag_remove(m, &csum);
2718 	}
2719 	/* flowid only valid if RSS hashing is enabled */
2720 	if (sc->num_slices > 1) {
2721 		m->m_pkthdr.flowid = (ss - sc->ss);
2722 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2723 	}
2724 	/* if the checksum is valid, mark it in the mbuf header */
2725 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2726 	    (0 == mxge_rx_csum(m, csum))) {
2727 		/* Tell the stack that the checksum is good */
2728 		m->m_pkthdr.csum_data = 0xffff;
2729 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2730 			CSUM_DATA_VALID;
2731 
2732 #if defined(INET) || defined (INET6)
2733 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2734 			return;
2735 #endif
2736 	}
2737 	/* pass the frame up the stack */
2738 	(*ifp->if_input)(ifp, m);
2739 }
2740 
2741 static inline void
2742 mxge_clean_rx_done(struct mxge_slice_state *ss)
2743 {
2744 	mxge_rx_done_t *rx_done = &ss->rx_done;
2745 	int limit = 0;
2746 	uint16_t length;
2747 	uint16_t checksum;
2748 	int lro;
2749 
2750 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2751 	while (rx_done->entry[rx_done->idx].length != 0) {
2752 		length = ntohs(rx_done->entry[rx_done->idx].length);
2753 		rx_done->entry[rx_done->idx].length = 0;
2754 		checksum = rx_done->entry[rx_done->idx].checksum;
2755 		if (length <= (MHLEN - MXGEFW_PAD))
2756 			mxge_rx_done_small(ss, length, checksum, lro);
2757 		else
2758 			mxge_rx_done_big(ss, length, checksum, lro);
2759 		rx_done->cnt++;
2760 		rx_done->idx = rx_done->cnt & rx_done->mask;
2761 
2762 		/* limit potential for livelock */
2763 		if (__predict_false(++limit > rx_done->mask / 2))
2764 			break;
2765 	}
2766 #if defined(INET)  || defined (INET6)
2767 	tcp_lro_flush_all(&ss->lc);
2768 #endif
2769 }
2770 
2771 static inline void
2772 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2773 {
2774 	struct ifnet *ifp __unused;
2775 	mxge_tx_ring_t *tx;
2776 	struct mbuf *m;
2777 	bus_dmamap_t map;
2778 	int idx;
2779 	int *flags;
2780 
2781 	tx = &ss->tx;
2782 	ifp = ss->sc->ifp;
2783 	while (tx->pkt_done != mcp_idx) {
2784 		idx = tx->done & tx->mask;
2785 		tx->done++;
2786 		m = tx->info[idx].m;
2787 		/* the mbuf and DMA map are attached only to the
2788 		   first descriptor of each packet */
2789 		if (m != NULL) {
2790 			ss->obytes += m->m_pkthdr.len;
2791 			if (m->m_flags & M_MCAST)
2792 				ss->omcasts++;
2793 			ss->opackets++;
2794 			tx->info[idx].m = NULL;
2795 			map = tx->info[idx].map;
2796 			bus_dmamap_unload(tx->dmat, map);
2797 			m_freem(m);
2798 		}
2799 		if (tx->info[idx].flag) {
2800 			tx->info[idx].flag = 0;
2801 			tx->pkt_done++;
2802 		}
2803 	}
2804 
2805 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2806 	   it's OK to send packets */
2807 #ifdef IFNET_BUF_RING
2808 	flags = &ss->if_drv_flags;
2809 #else
2810 	flags = &ifp->if_drv_flags;
2811 #endif
2812 	mtx_lock(&ss->tx.mtx);
2813 	if ((*flags) & IFF_DRV_OACTIVE &&
2814 	    tx->req - tx->done < (tx->mask + 1)/4) {
2815 		*(flags) &= ~IFF_DRV_OACTIVE;
2816 		ss->tx.wake++;
2817 		mxge_start_locked(ss);
2818 	}
2819 #ifdef IFNET_BUF_RING
2820 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2821 		/* let the NIC stop polling this queue, since there
2822 		 * are no more transmits pending */
2824 		*tx->send_stop = 1;
2825 		tx->queue_active = 0;
2826 		tx->deactivate++;
2827 		wmb();
2829 	}
2829 	}
2830 #endif
2831 	mtx_unlock(&ss->tx.mtx);
2833 }
2834 
2835 static struct mxge_media_type mxge_xfp_media_types[] =
2836 {
2837 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2838 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2839 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2840 	{0,		(1 << 5),	"10GBASE-ER"},
2841 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2842 	{0,		(1 << 3),	"10GBASE-SW"},
2843 	{0,		(1 << 2),	"10GBASE-LW"},
2844 	{0,		(1 << 1),	"10GBASE-EW"},
2845 	{0,		(1 << 0),	"Reserved"}
2846 };
2847 static struct mxge_media_type mxge_sfp_media_types[] =
2848 {
2849 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2850 	{0,		(1 << 7),	"Reserved"},
2851 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2852 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2853 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2854 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2855 };
2856 
2857 static void
2858 mxge_media_set(mxge_softc_t *sc, int media_type)
2859 {
2860 
2861 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2862 		    0, NULL);
2863 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2864 	sc->current_media = media_type;
2865 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2866 }
2867 
2868 static void
2869 mxge_media_init(mxge_softc_t *sc)
2870 {
2871 	char *ptr;
2872 	int i;
2873 
2874 	ifmedia_removeall(&sc->media);
2875 	mxge_media_set(sc, IFM_AUTO);
2876 
2877 	/*
2878 	 * parse the product code to determine the interface type
2879 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2880 	 * after the 3rd dash in the driver's cached copy of the
2881 	 * EEPROM's product code string.
2882 	 */
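	/* e.g. in a hypothetical product code "10G-PCIE-8B-S", the
	 * character after the 3rd dash is 'S', indicating SFP+ */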
2883 	ptr = sc->product_code_string;
2884 	if (ptr == NULL) {
2885 		device_printf(sc->dev, "Missing product code\n");
2886 		return;
2887 	}
2888 
2889 	for (i = 0; i < 3; i++, ptr++) {
2890 		ptr = strchr(ptr, '-');
2891 		if (ptr == NULL) {
2892 			device_printf(sc->dev,
2893 				      "only %d dashes in PC?!?\n", i);
2894 			return;
2895 		}
2896 	}
2897 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2898 		/* -C is CX4 */
2899 		sc->connector = MXGE_CX4;
2900 		mxge_media_set(sc, IFM_10G_CX4);
2901 	} else if (*ptr == 'Q') {
2902 		/* -Q is Quad Ribbon Fiber */
2903 		sc->connector = MXGE_QRF;
2904 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2905 		/* FreeBSD has no media type for Quad ribbon fiber */
2906 	} else if (*ptr == 'R') {
2907 		/* -R is XFP */
2908 		sc->connector = MXGE_XFP;
2909 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2910 		/* -S or -2S is SFP+ */
2911 		sc->connector = MXGE_SFP;
2912 	} else {
2913 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2914 	}
2915 }
2916 
2917 /*
2918  * Determine the media type for a NIC.  Some XFPs will identify
2919  * themselves only when their link is up, so this is initiated via a
2920  * link up interrupt.  However, this can potentially take up to
2921  * several milliseconds, so it is run via the watchdog routine, rather
2922  * than in the interrupt handler itself.
2923  */
2924 static void
2925 mxge_media_probe(mxge_softc_t *sc)
2926 {
2927 	mxge_cmd_t cmd;
2928 	char *cage_type;
2929 
2930 	struct mxge_media_type *mxge_media_types = NULL;
2931 	int i, err, ms, mxge_media_type_entries;
2932 	uint32_t byte;
2933 
2934 	sc->need_media_probe = 0;
2935 
2936 	if (sc->connector == MXGE_XFP) {
2937 		/* -R is XFP */
2938 		mxge_media_types = mxge_xfp_media_types;
2939 		mxge_media_type_entries =
2940 			nitems(mxge_xfp_media_types);
2941 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2942 		cage_type = "XFP";
2943 	} else 	if (sc->connector == MXGE_SFP) {
2944 		/* -S or -2S is SFP+ */
2945 		mxge_media_types = mxge_sfp_media_types;
2946 		mxge_media_type_entries =
2947 			nitems(mxge_sfp_media_types);
2948 		cage_type = "SFP+";
2949 		byte = 3;
2950 	} else {
2951 		/* nothing to do; media type cannot change */
2952 		return;
2953 	}
2954 
2955 	/*
2956 	 * At this point we know the NIC has an XFP cage, so now we
2957 	 * try to determine what is in the cage by using the
2958 	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2959 	 * register.  We read just one byte, which may take over
2960 	 * a millisecond.
2961 	 */
2962 
2963 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2964 	cmd.data1 = byte;
2965 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2966 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2967 		device_printf(sc->dev, "failed to read XFP\n");
2968 	}
2969 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2970 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2971 	}
2972 	if (err != MXGEFW_CMD_OK) {
2973 		return;
2974 	}
2975 
2976 	/* now we wait for the data to be cached */
2977 	cmd.data0 = byte;
2978 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2979 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2980 		DELAY(1000);
2981 		cmd.data0 = byte;
2982 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2983 	}
2984 	if (err != MXGEFW_CMD_OK) {
2985 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2986 			      cage_type, err, ms);
2987 		return;
2988 	}
2989 
2990 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2991 		if (mxge_verbose)
2992 			device_printf(sc->dev, "%s:%s\n", cage_type,
2993 				      mxge_media_types[0].name);
2994 		if (sc->current_media != mxge_media_types[0].flag) {
2995 			mxge_media_init(sc);
2996 			mxge_media_set(sc, mxge_media_types[0].flag);
2997 		}
2998 		return;
2999 	}
3000 	for (i = 1; i < mxge_media_type_entries; i++) {
3001 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3002 			if (mxge_verbose)
3003 				device_printf(sc->dev, "%s:%s\n",
3004 					      cage_type,
3005 					      mxge_media_types[i].name);
3006 
3007 			if (sc->current_media != mxge_media_types[i].flag) {
3008 				mxge_media_init(sc);
3009 				mxge_media_set(sc, mxge_media_types[i].flag);
3010 			}
3011 			return;
3012 		}
3013 	}
3014 	if (mxge_verbose)
3015 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3016 			      cage_type, cmd.data0);
3017 
3018 	return;
3019 }
3020 
3021 static void
3022 mxge_intr(void *arg)
3023 {
3024 	struct mxge_slice_state *ss = arg;
3025 	mxge_softc_t *sc = ss->sc;
3026 	mcp_irq_data_t *stats = ss->fw_stats;
3027 	mxge_tx_ring_t *tx = &ss->tx;
3028 	mxge_rx_done_t *rx_done = &ss->rx_done;
3029 	uint32_t send_done_count;
3030 	uint8_t valid;
3031 
3032 #ifndef IFNET_BUF_RING
3033 	/* an interrupt on a non-zero slice is implicitly valid
3034 	   since MSI-X irqs are not shared */
3035 	if (ss != sc->ss) {
3036 		mxge_clean_rx_done(ss);
3037 		*ss->irq_claim = be32toh(3);
3038 		return;
3039 	}
3040 #endif
3041 
3042 	/* make sure the DMA has finished */
3043 	if (!stats->valid) {
3044 		return;
3045 	}
3046 	valid = stats->valid;
3047 
3048 	if (sc->legacy_irq) {
3049 		/* lower legacy IRQ  */
3050 		*sc->irq_deassert = 0;
3051 		if (!mxge_deassert_wait)
3052 			/* don't wait for confirmation that the irq is low */
3053 			stats->valid = 0;
3054 	} else {
3055 		stats->valid = 0;
3056 	}
3057 
3058 	/* loop while waiting for legacy irq deassertion */
3059 	do {
3060 		/* check for transmit completes and receives */
3061 		send_done_count = be32toh(stats->send_done_count);
3062 		while ((send_done_count != tx->pkt_done) ||
3063 		       (rx_done->entry[rx_done->idx].length != 0)) {
3064 			if (send_done_count != tx->pkt_done)
3065 				mxge_tx_done(ss, (int)send_done_count);
3066 			mxge_clean_rx_done(ss);
3067 			send_done_count = be32toh(stats->send_done_count);
3068 		}
3069 		if (sc->legacy_irq && mxge_deassert_wait)
3070 			wmb();
3071 	} while (*((volatile uint8_t *) &stats->valid));
3072 
3073 	/* fw link & error stats meaningful only on the first slice */
3074 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3075 		if (sc->link_state != stats->link_up) {
3076 			sc->link_state = stats->link_up;
3077 			if (sc->link_state) {
3078 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3079 				if (mxge_verbose)
3080 					device_printf(sc->dev, "link up\n");
3081 			} else {
3082 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3083 				if (mxge_verbose)
3084 					device_printf(sc->dev, "link down\n");
3085 			}
3086 			sc->need_media_probe = 1;
3087 		}
3088 		if (sc->rdma_tags_available !=
3089 		    be32toh(stats->rdma_tags_available)) {
3090 			sc->rdma_tags_available =
3091 				be32toh(stats->rdma_tags_available);
3092 			device_printf(sc->dev, "RDMA timed out! %d tags "
3093 				      "left\n", sc->rdma_tags_available);
3094 		}
3095 
3096 		if (stats->link_down) {
3097 			sc->down_cnt += stats->link_down;
3098 			sc->link_state = 0;
3099 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3100 		}
3101 	}
3102 
3103 	/* check to see if we have an rx token to pass back */
3104 	if (valid & 0x1)
3105 		*ss->irq_claim = be32toh(3);
3106 	*(ss->irq_claim + 1) = be32toh(3);
3107 }
3108 
3109 static void
3110 mxge_init(void *arg)
3111 {
3112 	mxge_softc_t *sc = arg;
3113 	struct ifnet *ifp = sc->ifp;
3114 
3115 	mtx_lock(&sc->driver_mtx);
3116 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3117 		(void) mxge_open(sc);
3118 	mtx_unlock(&sc->driver_mtx);
3119 }
3120 
3121 static void
3122 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3123 {
3124 	int i;
3125 
3126 #if defined(INET) || defined(INET6)
3127 	tcp_lro_free(&ss->lc);
3128 #endif
3129 	for (i = 0; i <= ss->rx_big.mask; i++) {
3130 		if (ss->rx_big.info[i].m == NULL)
3131 			continue;
3132 		bus_dmamap_unload(ss->rx_big.dmat,
3133 				  ss->rx_big.info[i].map);
3134 		m_freem(ss->rx_big.info[i].m);
3135 		ss->rx_big.info[i].m = NULL;
3136 	}
3137 
3138 	for (i = 0; i <= ss->rx_small.mask; i++) {
3139 		if (ss->rx_small.info[i].m == NULL)
3140 			continue;
3141 		bus_dmamap_unload(ss->rx_small.dmat,
3142 				  ss->rx_small.info[i].map);
3143 		m_freem(ss->rx_small.info[i].m);
3144 		ss->rx_small.info[i].m = NULL;
3145 	}
3146 
3147 	/* transmit ring used only on the first slice */
3148 	if (ss->tx.info == NULL)
3149 		return;
3150 
3151 	for (i = 0; i <= ss->tx.mask; i++) {
3152 		ss->tx.info[i].flag = 0;
3153 		if (ss->tx.info[i].m == NULL)
3154 			continue;
3155 		bus_dmamap_unload(ss->tx.dmat,
3156 				  ss->tx.info[i].map);
3157 		m_freem(ss->tx.info[i].m);
3158 		ss->tx.info[i].m = NULL;
3159 	}
3160 }
3161 
3162 static void
3163 mxge_free_mbufs(mxge_softc_t *sc)
3164 {
3165 	int slice;
3166 
3167 	for (slice = 0; slice < sc->num_slices; slice++)
3168 		mxge_free_slice_mbufs(&sc->ss[slice]);
3169 }
3170 
3171 static void
3172 mxge_free_slice_rings(struct mxge_slice_state *ss)
3173 {
3174 	int i;
3175 
3176 	if (ss->rx_done.entry != NULL)
3177 		mxge_dma_free(&ss->rx_done.dma);
3178 	ss->rx_done.entry = NULL;
3179 
3180 	if (ss->tx.req_bytes != NULL)
3181 		free(ss->tx.req_bytes, M_DEVBUF);
3182 	ss->tx.req_bytes = NULL;
3183 
3184 	if (ss->tx.seg_list != NULL)
3185 		free(ss->tx.seg_list, M_DEVBUF);
3186 	ss->tx.seg_list = NULL;
3187 
3188 	if (ss->rx_small.shadow != NULL)
3189 		free(ss->rx_small.shadow, M_DEVBUF);
3190 	ss->rx_small.shadow = NULL;
3191 
3192 	if (ss->rx_big.shadow != NULL)
3193 		free(ss->rx_big.shadow, M_DEVBUF);
3194 	ss->rx_big.shadow = NULL;
3195 
3196 	if (ss->tx.info != NULL) {
3197 		if (ss->tx.dmat != NULL) {
3198 			for (i = 0; i <= ss->tx.mask; i++) {
3199 				bus_dmamap_destroy(ss->tx.dmat,
3200 						   ss->tx.info[i].map);
3201 			}
3202 			bus_dma_tag_destroy(ss->tx.dmat);
3203 		}
3204 		free(ss->tx.info, M_DEVBUF);
3205 	}
3206 	ss->tx.info = NULL;
3207 
3208 	if (ss->rx_small.info != NULL) {
3209 		if (ss->rx_small.dmat != NULL) {
3210 			for (i = 0; i <= ss->rx_small.mask; i++) {
3211 				bus_dmamap_destroy(ss->rx_small.dmat,
3212 						   ss->rx_small.info[i].map);
3213 			}
3214 			bus_dmamap_destroy(ss->rx_small.dmat,
3215 					   ss->rx_small.extra_map);
3216 			bus_dma_tag_destroy(ss->rx_small.dmat);
3217 		}
3218 		free(ss->rx_small.info, M_DEVBUF);
3219 	}
3220 	ss->rx_small.info = NULL;
3221 
3222 	if (ss->rx_big.info != NULL) {
3223 		if (ss->rx_big.dmat != NULL) {
3224 			for (i = 0; i <= ss->rx_big.mask; i++) {
3225 				bus_dmamap_destroy(ss->rx_big.dmat,
3226 						   ss->rx_big.info[i].map);
3227 			}
3228 			bus_dmamap_destroy(ss->rx_big.dmat,
3229 					   ss->rx_big.extra_map);
3230 			bus_dma_tag_destroy(ss->rx_big.dmat);
3231 		}
3232 		free(ss->rx_big.info, M_DEVBUF);
3233 	}
3234 	ss->rx_big.info = NULL;
3235 }
3236 
3237 static void
3238 mxge_free_rings(mxge_softc_t *sc)
3239 {
3240 	int slice;
3241 
3242 	for (slice = 0; slice < sc->num_slices; slice++)
3243 		mxge_free_slice_rings(&sc->ss[slice]);
3244 }
3245 
3246 static int
3247 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3248 		       int tx_ring_entries)
3249 {
3250 	mxge_softc_t *sc = ss->sc;
3251 	size_t bytes;
3252 	int err, i;
3253 
3254 	/* allocate per-slice receive resources */
3255 
3256 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3257 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3258 
3259 	/* allocate the rx shadow rings */
3260 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3261 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3262 
3263 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3264 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3265 
3266 	/* allocate the rx host info rings */
3267 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3268 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3269 
3270 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3271 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3272 
3273 	/* allocate the rx busdma resources */
3274 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3275 				 1,			/* alignment */
3276 				 4096,			/* boundary */
3277 				 BUS_SPACE_MAXADDR,	/* low */
3278 				 BUS_SPACE_MAXADDR,	/* high */
3279 				 NULL, NULL,		/* filter */
3280 				 MHLEN,			/* maxsize */
3281 				 1,			/* num segs */
3282 				 MHLEN,			/* maxsegsize */
3283 				 BUS_DMA_ALLOCNOW,	/* flags */
3284 				 NULL, NULL,		/* lock */
3285 				 &ss->rx_small.dmat);	/* tag */
3286 	if (err != 0) {
3287 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3288 			      err);
3289 		return err;
3290 	}
3291 
3292 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3293 				 1,			/* alignment */
3294 #if MXGE_VIRT_JUMBOS
3295 				 4096,			/* boundary */
3296 #else
3297 				 0,			/* boundary */
3298 #endif
3299 				 BUS_SPACE_MAXADDR,	/* low */
3300 				 BUS_SPACE_MAXADDR,	/* high */
3301 				 NULL, NULL,		/* filter */
3302 				 3*4096,		/* maxsize */
3303 #if MXGE_VIRT_JUMBOS
3304 				 3,			/* num segs */
3305 				 4096,			/* maxsegsize*/
3306 #else
3307 				 1,			/* num segs */
3308 				 MJUM9BYTES,		/* maxsegsize*/
3309 #endif
3310 				 BUS_DMA_ALLOCNOW,	/* flags */
3311 				 NULL, NULL,		/* lock */
3312 				 &ss->rx_big.dmat);	/* tag */
3313 	if (err != 0) {
3314 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3315 			      err);
3316 		return err;
3317 	}
3318 	for (i = 0; i <= ss->rx_small.mask; i++) {
3319 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3320 					&ss->rx_small.info[i].map);
3321 		if (err != 0) {
3322 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3323 				      err);
3324 			return err;
3325 		}
3326 	}
3327 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3328 				&ss->rx_small.extra_map);
3329 	if (err != 0) {
3330 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3331 			      err);
3332 		return err;
3333 	}
3334 
3335 	for (i = 0; i <= ss->rx_big.mask; i++) {
3336 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3337 					&ss->rx_big.info[i].map);
3338 		if (err != 0) {
3339 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3340 				      err);
3341 			return err;
3342 		}
3343 	}
3344 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3345 				&ss->rx_big.extra_map);
3346 	if (err != 0) {
3347 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3348 			      err);
3349 		return err;
3350 	}
3351 
3352 	/* now allocate TX resources */
3353 
3354 #ifndef IFNET_BUF_RING
3355 	/* only use a single TX ring for now */
3356 	if (ss != ss->sc->ss)
3357 		return 0;
3358 #endif
3359 
3360 	ss->tx.mask = tx_ring_entries - 1;
3361 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3362 
3363 	/* allocate the tx request copy block */
3364 	bytes = 8 +
3365 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3366 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3367 	/* ensure req_list entries are aligned to 8 bytes */
3368 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3369 		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
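	/* (adding 7 and clearing the low 3 bits rounds the pointer up
	 * to the next 8-byte boundary) */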
3370 
3371 	/* allocate the tx busdma segment list */
3372 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3373 	ss->tx.seg_list = (bus_dma_segment_t *)
3374 		malloc(bytes, M_DEVBUF, M_WAITOK);
3375 
3376 	/* allocate the tx host info ring */
3377 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3378 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3379 
3380 	/* allocate the tx busdma resources */
3381 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3382 				 1,			/* alignment */
3383 				 sc->tx_boundary,	/* boundary */
3384 				 BUS_SPACE_MAXADDR,	/* low */
3385 				 BUS_SPACE_MAXADDR,	/* high */
3386 				 NULL, NULL,		/* filter */
3387 				 65536 + 256,		/* maxsize */
3388 				 ss->tx.max_desc - 2,	/* num segs */
3389 				 sc->tx_boundary,	/* maxsegsz */
3390 				 BUS_DMA_ALLOCNOW,	/* flags */
3391 				 NULL, NULL,		/* lock */
3392 				 &ss->tx.dmat);		/* tag */
3393 
3394 	if (err != 0) {
3395 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3396 			      err);
3397 		return err;
3398 	}
3399 
3400 	/* now use these tags to set up dmamaps for each slot
3401 	   in the ring */
3402 	for (i = 0; i <= ss->tx.mask; i++) {
3403 		err = bus_dmamap_create(ss->tx.dmat, 0,
3404 					&ss->tx.info[i].map);
3405 		if (err != 0) {
3406 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3407 				      err);
3408 			return err;
3409 		}
3410 	}
3411 	return 0;
3413 }
3414 
3415 static int
3416 mxge_alloc_rings(mxge_softc_t *sc)
3417 {
3418 	mxge_cmd_t cmd;
3419 	int tx_ring_size;
3420 	int tx_ring_entries, rx_ring_entries;
3421 	int err, slice;
3422 
3423 	/* get ring sizes */
3424 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3425 	tx_ring_size = cmd.data0;
3426 	if (err != 0) {
3427 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3428 		goto abort;
3429 	}
3430 
3431 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3432 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3433 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3434 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3435 	IFQ_SET_READY(&sc->ifp->if_snd);
3436 
3437 	for (slice = 0; slice < sc->num_slices; slice++) {
3438 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3439 					     rx_ring_entries,
3440 					     tx_ring_entries);
3441 		if (err != 0)
3442 			goto abort;
3443 	}
3444 	return 0;
3445 
3446 abort:
3447 	mxge_free_rings(sc);
3448 	return err;
3450 }
3451 
3452 static void
3453 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3454 {
3455 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3456 
3457 	if (bufsize < MCLBYTES) {
3458 		/* easy, everything fits in a single buffer */
3459 		*big_buf_size = MCLBYTES;
3460 		*cl_size = MCLBYTES;
3461 		*nbufs = 1;
3462 		return;
3463 	}
3464 
3465 	if (bufsize < MJUMPAGESIZE) {
3466 		/* still easy, everything still fits in a single buffer */
3467 		*big_buf_size = MJUMPAGESIZE;
3468 		*cl_size = MJUMPAGESIZE;
3469 		*nbufs = 1;
3470 		return;
3471 	}
3472 #if MXGE_VIRT_JUMBOS
3473 	/* now we need to use virtually contiguous buffers */
3474 	*cl_size = MJUM9BYTES;
3475 	*big_buf_size = 4096;
3476 	*nbufs = mtu / 4096 + 1;
3477 	/* needs to be a power of two, so round up */
3478 	if (*nbufs == 3)
3479 		*nbufs = 4;
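	/* e.g. a 9000-byte MTU gives 9000 / 4096 + 1 = 3 buffers,
	 * rounded up to 4 here */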
3480 #else
3481 	*cl_size = MJUM9BYTES;
3482 	*big_buf_size = MJUM9BYTES;
3483 	*nbufs = 1;
3484 #endif
3485 }
3486 
3487 static int
3488 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3489 {
3490 	mxge_softc_t *sc;
3491 	mxge_cmd_t cmd;
3492 	bus_dmamap_t map;
3493 	int err, i, slice;
3494 
3495 	sc = ss->sc;
3496 	slice = ss - sc->ss;
3497 
3498 #if defined(INET) || defined(INET6)
3499 	(void)tcp_lro_init(&ss->lc);
3500 #endif
3501 	ss->lc.ifp = sc->ifp;
3502 
3503 	/* get the lanai pointers to the send and receive rings */
3504 
3505 	err = 0;
3506 #ifndef IFNET_BUF_RING
3507 	/* We currently only send from the first slice */
3508 	if (slice == 0) {
3509 #endif
3510 		cmd.data0 = slice;
3511 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3512 		ss->tx.lanai =
3513 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3514 		ss->tx.send_go = (volatile uint32_t *)
3515 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3516 		ss->tx.send_stop = (volatile uint32_t *)
3517 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3518 #ifndef IFNET_BUF_RING
3519 	}
3520 #endif
3521 	cmd.data0 = slice;
3522 	err |= mxge_send_cmd(sc,
3523 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3524 	ss->rx_small.lanai =
3525 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3526 	cmd.data0 = slice;
3527 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3528 	ss->rx_big.lanai =
3529 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3530 
3531 	if (err != 0) {
3532 		device_printf(sc->dev,
3533 			      "failed to get ring sizes or locations\n");
3534 		return EIO;
3535 	}
3536 
3537 	/* stock receive rings */
3538 	for (i = 0; i <= ss->rx_small.mask; i++) {
3539 		map = ss->rx_small.info[i].map;
3540 		err = mxge_get_buf_small(ss, map, i);
3541 		if (err) {
3542 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3543 				      i, ss->rx_small.mask + 1);
3544 			return ENOMEM;
3545 		}
3546 	}
3547 	for (i = 0; i <= ss->rx_big.mask; i++) {
3548 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3549 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3550 	}
3551 	ss->rx_big.nbufs = nbufs;
3552 	ss->rx_big.cl_size = cl_size;
3553 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3554 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3555 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3556 		map = ss->rx_big.info[i].map;
3557 		err = mxge_get_buf_big(ss, map, i);
3558 		if (err) {
3559 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3560 				      i, ss->rx_big.mask + 1);
3561 			return ENOMEM;
3562 		}
3563 	}
3564 	return 0;
3565 }
3566 
3567 static int
3568 mxge_open(mxge_softc_t *sc)
3569 {
3570 	mxge_cmd_t cmd;
3571 	int err, big_bytes, nbufs, slice, cl_size, i;
3572 	bus_addr_t bus;
3573 	volatile uint8_t *itable;
3574 	struct mxge_slice_state *ss;
3575 
3576 	/* Copy the MAC address in case it was overridden */
3577 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3578 
3579 	err = mxge_reset(sc, 1);
3580 	if (err != 0) {
3581 		device_printf(sc->dev, "failed to reset\n");
3582 		return EIO;
3583 	}
3584 
3585 	if (sc->num_slices > 1) {
3586 		/* setup the indirection table */
3587 		cmd.data0 = sc->num_slices;
3588 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3589 				    &cmd);
3590 
3591 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3592 				     &cmd);
3593 		if (err != 0) {
3594 			device_printf(sc->dev,
3595 				      "failed to setup rss tables\n");
3596 			return err;
3597 		}
3598 
3599 		/* just enable an identity mapping */
3600 		itable = sc->sram + cmd.data0;
3601 		for (i = 0; i < sc->num_slices; i++)
3602 			itable[i] = (uint8_t)i;
3603 
3604 		cmd.data0 = 1;
3605 		cmd.data1 = mxge_rss_hash_type;
3606 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3607 		if (err != 0) {
3608 			device_printf(sc->dev, "failed to enable slices\n");
3609 			return err;
3610 		}
3611 	}
3612 
3613 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3614 
3615 	cmd.data0 = nbufs;
3616 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3617 			    &cmd);
3618 	/* error is only meaningful if we're trying to set
3619 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3620 	if (err && nbufs > 1) {
3621 		device_printf(sc->dev,
3622 			      "Failed to set alway-use-n to %d\n",
3623 			      nbufs);
3624 		return EIO;
3625 	}
3626 	/* Give the firmware the mtu and the big and small buffer
3627 	   sizes.  The firmware wants the big buf size to be a power
3628 	   of two. Luckily, FreeBSD's clusters are powers of two */
3629 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3630 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3631 	cmd.data0 = MHLEN - MXGEFW_PAD;
3632 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3633 			     &cmd);
3634 	cmd.data0 = big_bytes;
3635 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3636 
3637 	if (err != 0) {
3638 		device_printf(sc->dev, "failed to setup params\n");
3639 		goto abort;
3640 	}
3641 
3642 	/* Now give the firmware the pointer to the stats block */
3643 	for (slice = 0;
3644 #ifdef IFNET_BUF_RING
3645 	     slice < sc->num_slices;
3646 #else
3647 	     slice < 1;
3648 #endif
3649 	     slice++) {
3650 		ss = &sc->ss[slice];
3651 		cmd.data0 =
3652 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3653 		cmd.data1 =
3654 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3655 		cmd.data2 = sizeof(struct mcp_irq_data);
3656 		cmd.data2 |= (slice << 16);
3657 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3658 	}
3659 
3660 	if (err != 0) {
3661 		bus = sc->ss->fw_stats_dma.bus_addr;
3662 		bus += offsetof(struct mcp_irq_data, send_done_count);
3663 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3664 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3665 		err = mxge_send_cmd(sc,
3666 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3667 				    &cmd);
3668 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3669 		sc->fw_multicast_support = 0;
3670 	} else {
3671 		sc->fw_multicast_support = 1;
3672 	}
3673 
3674 	if (err != 0) {
3675 		device_printf(sc->dev, "failed to setup params\n");
3676 		goto abort;
3677 	}
3678 
3679 	for (slice = 0; slice < sc->num_slices; slice++) {
3680 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3681 		if (err != 0) {
3682 			device_printf(sc->dev, "couldn't open slice %d\n",
3683 				      slice);
3684 			goto abort;
3685 		}
3686 	}
3687 
3688 	/* Finally, start the firmware running */
3689 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3690 	if (err) {
3691 		device_printf(sc->dev, "Couldn't bring up link\n");
3692 		goto abort;
3693 	}
3694 #ifdef IFNET_BUF_RING
3695 	for (slice = 0; slice < sc->num_slices; slice++) {
3696 		ss = &sc->ss[slice];
3697 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3698 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3699 	}
3700 #endif
3701 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3702 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3703 
3704 	return 0;
3705 
3706 abort:
3707 	mxge_free_mbufs(sc);
3708 
3709 	return err;
3710 }
3711 
3712 static int
3713 mxge_close(mxge_softc_t *sc, int down)
3714 {
3715 	mxge_cmd_t cmd;
3716 	int err, old_down_cnt;
3717 #ifdef IFNET_BUF_RING
3718 	struct mxge_slice_state *ss;
3719 	int slice;
3720 #endif
3721 
3722 #ifdef IFNET_BUF_RING
3723 	for (slice = 0; slice < sc->num_slices; slice++) {
3724 		ss = &sc->ss[slice];
3725 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3726 	}
3727 #endif
3728 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
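	/* when down is set, the NIC is already known to be down (e.g.
	   after a watchdog reset), so skip the shutdown handshake */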
3729 	if (!down) {
3730 		old_down_cnt = sc->down_cnt;
3731 		wmb();
3732 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3733 		if (err) {
3734 			device_printf(sc->dev,
3735 				      "Couldn't bring down link\n");
3736 		}
3737 		if (old_down_cnt == sc->down_cnt) {
3738 			/* wait for down irq */
3739 			DELAY(10 * sc->intr_coal_delay);
3740 		}
3741 		wmb();
3742 		if (old_down_cnt == sc->down_cnt) {
3743 			device_printf(sc->dev, "never got down irq\n");
3744 		}
3745 	}
3746 	mxge_free_mbufs(sc);
3747 
3748 	return 0;
3749 }
3750 
3751 static void
3752 mxge_setup_cfg_space(mxge_softc_t *sc)
3753 {
3754 	device_t dev = sc->dev;
3755 	int reg;
3756 	uint16_t lnk, pectl;
3757 
3758 	/* find the PCIe link width and set max read request to 4KB */
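	/* in the PCIe capability, offset 0x8 is the Device Control
	   register (bits 14:12 select the max read request size;
	   5 means 4096 bytes) and offset 0x12 is Link Status */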
3759 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3760 		lnk = pci_read_config(dev, reg + 0x12, 2);
3761 		sc->link_width = (lnk >> 4) & 0x3f;
3762 
3763 		if (sc->pectl == 0) {
3764 			pectl = pci_read_config(dev, reg + 0x8, 2);
3765 			pectl = (pectl & ~0x7000) | (5 << 12);
3766 			pci_write_config(dev, reg + 0x8, pectl, 2);
3767 			sc->pectl = pectl;
3768 		} else {
3769 			/* restore saved pectl after watchdog reset */
3770 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3771 		}
3772 	}
3773 
3774 	/* Enable DMA and Memory space access */
3775 	pci_enable_busmaster(dev);
3776 }
3777 
3778 static uint32_t
3779 mxge_read_reboot(mxge_softc_t *sc)
3780 {
3781 	device_t dev = sc->dev;
3782 	uint32_t vs;
3783 
3784 	/* find the vendor specific offset */
3785 	if (pci_find_cap(dev, PCIY_VENDOR, (int *)&vs) != 0) {
3786 		device_printf(sc->dev,
3787 			      "could not find vendor specific offset\n");
3788 		return (uint32_t)-1;
3789 	}
3790 	/* enable read32 mode */
3791 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3792 	/* tell NIC which register to read */
3793 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3794 	return (pci_read_config(dev, vs + 0x14, 4));
3795 }
3796 
3797 static void
3798 mxge_watchdog_reset(mxge_softc_t *sc)
3799 {
3800 	struct pci_devinfo *dinfo;
3801 	struct mxge_slice_state *ss;
3802 	int err, running, s, num_tx_slices = 1;
3803 	uint32_t reboot;
3804 	uint16_t cmd;
3805 
3806 	err = ENXIO;
3807 
3808 	device_printf(sc->dev, "Watchdog reset!\n");
3809 
3810 	/*
3811 	 * check to see if the NIC rebooted.  If it did, then all of
3812 	 * PCI config space has been reset, and things like the
3813 	 * busmaster bit will be zero.  If this is the case, then we
3814 	 * must restore PCI config space before the NIC can be used
3815 	 * again
3816 	 */
3817 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3818 	if (cmd == 0xffff) {
3819 		/*
3820 		 * maybe the watchdog caught the NIC rebooting; wait
3821 		 * up to 100ms for it to finish.  If it does not come
3822 		 * back, then give up
3823 		 */
3824 		DELAY(1000*100);
3825 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3826 		if (cmd == 0xffff) {
3827 			device_printf(sc->dev, "NIC disappeared!\n");
3828 		}
3829 	}
3830 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3831 		/* print the reboot status */
3832 		reboot = mxge_read_reboot(sc);
3833 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3834 			      reboot);
3835 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3836 		if (running) {
3837 			/*
3838 			 * quiesce NIC so that TX routines will not try to
3839 			 * xmit after restoration of BAR
3840 			 */
3841 
3842 			/* Mark the link as down */
3843 			if (sc->link_state) {
3844 				sc->link_state = 0;
3845 				if_link_state_change(sc->ifp,
3846 						     LINK_STATE_DOWN);
3847 			}
3848 #ifdef IFNET_BUF_RING
3849 			num_tx_slices = sc->num_slices;
3850 #endif
3851 			/* grab all TX locks to ensure no transmits occur */
3852 			for (s = 0; s < num_tx_slices; s++) {
3853 				ss = &sc->ss[s];
3854 				mtx_lock(&ss->tx.mtx);
3855 			}
3856 			mxge_close(sc, 1);
3857 		}
3858 		/* restore PCI configuration space */
3859 		dinfo = device_get_ivars(sc->dev);
3860 		pci_cfg_restore(sc->dev, dinfo);
3861 
3862 		/* and redo any changes we made to our config space */
3863 		mxge_setup_cfg_space(sc);
3864 
3865 		/* reload f/w */
3866 		err = mxge_load_firmware(sc, 0);
3867 		if (err) {
3868 			device_printf(sc->dev,
3869 				      "Unable to re-load f/w\n");
3870 		}
3871 		if (running) {
3872 			if (!err)
3873 				err = mxge_open(sc);
3874 			/* release all TX locks */
3875 			for (s = 0; s < num_tx_slices; s++) {
3876 				ss = &sc->ss[s];
3877 #ifdef IFNET_BUF_RING
3878 				mxge_start_locked(ss);
3879 #endif
3880 				mtx_unlock(&ss->tx.mtx);
3881 			}
3882 		}
3883 		sc->watchdog_resets++;
3884 	} else {
3885 		device_printf(sc->dev,
3886 			      "NIC did not reboot, not resetting\n");
3887 		err = 0;
3888 	}
3889 	if (err) {
3890 		device_printf(sc->dev, "watchdog reset failed\n");
3891 	} else {
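		/* dying == 2 marks a h/w fault noticed by mxge_tick;
		   clear it now that the reset has recovered the NIC */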
3892 		if (sc->dying == 2)
3893 			sc->dying = 0;
3894 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3895 	}
3896 }
3897 
3898 static void
3899 mxge_watchdog_task(void *arg, int pending)
3900 {
3901 	mxge_softc_t *sc = arg;
3902 
3903 	mtx_lock(&sc->driver_mtx);
3904 	mxge_watchdog_reset(sc);
3905 	mtx_unlock(&sc->driver_mtx);
3906 }
3907 
3908 static void
3909 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3910 {
3911 	tx = &sc->ss[slice].tx;
3912 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3913 	device_printf(sc->dev,
3914 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3915 		      tx->req, tx->done, tx->queue_active);
3916 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3917 		      tx->activate, tx->deactivate);
3918 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3919 		      tx->pkt_done,
3920 		      be32toh(sc->ss->fw_stats->send_done_count));
3921 }
3922 
3923 static int
3924 mxge_watchdog(mxge_softc_t *sc)
3925 {
3926 	mxge_tx_ring_t *tx;
3927 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3928 	int i, err = 0;
3929 
3930 	/* see if we have outstanding transmits, which
3931 	   have been pending for more than mxge_ticks */
3932 	for (i = 0;
3933 #ifdef IFNET_BUF_RING
3934 	     (i < sc->num_slices) && (err == 0);
3935 #else
3936 	     (i < 1) && (err == 0);
3937 #endif
3938 	     i++) {
3939 		tx = &sc->ss[i].tx;
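		/* a slice is wedged if transmits were already pending
		   at the last tick (watchdog_req != watchdog_done) and
		   no completions have arrived since (done unchanged) */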
3940 		if (tx->req != tx->done &&
3941 		    tx->watchdog_req != tx->watchdog_done &&
3942 		    tx->done == tx->watchdog_done) {
3943 			/* check for pause blocking before resetting */
3944 			if (tx->watchdog_rx_pause == rx_pause) {
3945 				mxge_warn_stuck(sc, tx, i);
3946 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3947 				return (ENXIO);
3948 			} else {
3949 				device_printf(sc->dev, "Flow control blocking "
3950 					      "xmits, check link partner\n");
3951 			}
3952 		}
3953 
3954 		tx->watchdog_req = tx->req;
3955 		tx->watchdog_done = tx->done;
3956 		tx->watchdog_rx_pause = rx_pause;
3957 	}
3958 
3959 	if (sc->need_media_probe)
3960 		mxge_media_probe(sc);
3961 	return (err);
3962 }
3963 
3964 static uint64_t
3965 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
3966 {
3967 	struct mxge_softc *sc;
3968 	uint64_t rv;
3969 
3970 	sc = if_getsoftc(ifp);
3971 	rv = 0;
3972 
3973 	switch (cnt) {
3974 	case IFCOUNTER_IPACKETS:
3975 		for (int s = 0; s < sc->num_slices; s++)
3976 			rv += sc->ss[s].ipackets;
3977 		return (rv);
3978 	case IFCOUNTER_OPACKETS:
3979 		for (int s = 0; s < sc->num_slices; s++)
3980 			rv += sc->ss[s].opackets;
3981 		return (rv);
3982 	case IFCOUNTER_OERRORS:
3983 		for (int s = 0; s < sc->num_slices; s++)
3984 			rv += sc->ss[s].oerrors;
3985 		return (rv);
3986 #ifdef IFNET_BUF_RING
3987 	case IFCOUNTER_OBYTES:
3988 		for (int s = 0; s < sc->num_slices; s++)
3989 			rv += sc->ss[s].obytes;
3990 		return (rv);
3991 	case IFCOUNTER_OMCASTS:
3992 		for (int s = 0; s < sc->num_slices; s++)
3993 			rv += sc->ss[s].omcasts;
3994 		return (rv);
3995 	case IFCOUNTER_OQDROPS:
3996 		for (int s = 0; s < sc->num_slices; s++)
3997 			rv += sc->ss[s].tx.br->br_drops;
3998 		return (rv);
3999 #endif
4000 	default:
4001 		return (if_get_counter_default(ifp, cnt));
4002 	}
4003 }
4004 
4005 static void
4006 mxge_tick(void *arg)
4007 {
4008 	mxge_softc_t *sc = arg;
4009 	u_long pkts = 0;
4010 	int err = 0;
4011 	int running, ticks;
4012 	uint16_t cmd;
4013 
4014 	ticks = mxge_ticks;
4015 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4016 	if (running) {
4017 		if (!sc->watchdog_countdown) {
4018 			err = mxge_watchdog(sc);
4019 			sc->watchdog_countdown = 4;
4020 		}
4021 		sc->watchdog_countdown--;
4022 	}
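	/* XXX pkts is never updated, so this idle check always fires
	   and the interval is always stretched by 4x */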
4023 	if (pkts == 0) {
4024 		/* ensure NIC did not suffer h/w fault while idle */
4025 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4026 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4027 			sc->dying = 2;
4028 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4029 			err = ENXIO;
4030 		}
4031 		/* look less often if NIC is idle */
4032 		ticks *= 4;
4033 	}
4034 
4035 	if (err == 0)
4036 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4037 
4038 }
4039 
4040 static int
4041 mxge_media_change(struct ifnet *ifp)
4042 {
4043 	return EINVAL;
4044 }
4045 
4046 static int
4047 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4048 {
4049 	struct ifnet *ifp = sc->ifp;
4050 	int real_mtu, old_mtu;
4051 	int err = 0;
4052 
4053 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4054 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4055 		return EINVAL;
4056 	mtx_lock(&sc->driver_mtx);
4057 	old_mtu = ifp->if_mtu;
4058 	ifp->if_mtu = mtu;
4059 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4060 		mxge_close(sc, 0);
4061 		err = mxge_open(sc);
4062 		if (err != 0) {
4063 			ifp->if_mtu = old_mtu;
4064 			mxge_close(sc, 0);
4065 			(void) mxge_open(sc);
4066 		}
4067 	}
4068 	mtx_unlock(&sc->driver_mtx);
4069 	return err;
4070 }
4071 
4072 static void
4073 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4074 {
4075 	mxge_softc_t *sc = ifp->if_softc;
4076 
4077 	if (sc == NULL)
4078 		return;
4079 	ifmr->ifm_status = IFM_AVALID;
4080 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4081 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4082 	ifmr->ifm_active |= sc->current_media;
4083 }
4084 
4085 static int
4086 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4087 {
4088 	mxge_cmd_t cmd;
4089 	uint32_t i2c_args;
4090 	int i, ms, err;
4091 
4092 	if (i2c->dev_addr != 0xA0 &&
4093 	    i2c->dev_addr != 0xA2)
4094 		return (EINVAL);
4095 	if (i2c->len > sizeof(i2c->data))
4096 		return (EINVAL);
4097 
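	/* read the module EEPROM one byte at a time: I2C_READ asks the
	   firmware to fetch a byte, then I2C_BYTE is polled (for up to
	   ~50ms) until the cached byte is available */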
4098 	for (i = 0; i < i2c->len; i++) {
4099 		i2c_args = i2c->dev_addr << 0x8;
4100 		i2c_args |= i2c->offset + i;
4101 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4102 		cmd.data1 = i2c_args;
4103 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4104 
4105 		if (err != MXGEFW_CMD_OK)
4106 			return (EIO);
4107 		/* now we wait for the data to be cached */
4108 		cmd.data0 = i2c_args & 0xff;
4109 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4110 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4111 			cmd.data0 = i2c_args & 0xff;
4112 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4113 			if (err == EBUSY)
4114 				DELAY(1000);
4115 		}
4116 		if (err != MXGEFW_CMD_OK)
4117 			return (EIO);
4118 		i2c->data[i] = cmd.data0;
4119 	}
4120 	return (0);
4121 }
4122 
4123 static int
4124 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4125 {
4126 	mxge_softc_t *sc = ifp->if_softc;
4127 	struct ifreq *ifr = (struct ifreq *)data;
4128 	struct ifi2creq i2c;
4129 	int err, mask;
4130 
4131 	err = 0;
4132 	switch (command) {
4133 	case SIOCSIFMTU:
4134 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4135 		break;
4136 
4137 	case SIOCSIFFLAGS:
4138 		mtx_lock(&sc->driver_mtx);
4139 		if (sc->dying) {
4140 			mtx_unlock(&sc->driver_mtx);
4141 			return EINVAL;
4142 		}
4143 		if (ifp->if_flags & IFF_UP) {
4144 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4145 				err = mxge_open(sc);
4146 			} else {
4147 				/* take care of promisc and allmulti
4148 				   flag changes */
4149 				mxge_change_promisc(sc,
4150 						    ifp->if_flags & IFF_PROMISC);
4151 				mxge_set_multicast_list(sc);
4152 			}
4153 		} else {
4154 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4155 				mxge_close(sc, 0);
4156 			}
4157 		}
4158 		mtx_unlock(&sc->driver_mtx);
4159 		break;
4160 
4161 	case SIOCADDMULTI:
4162 	case SIOCDELMULTI:
4163 		mtx_lock(&sc->driver_mtx);
4164 		if (sc->dying) {
4165 			mtx_unlock(&sc->driver_mtx);
4166 			return (EINVAL);
4167 		}
4168 		mxge_set_multicast_list(sc);
4169 		mtx_unlock(&sc->driver_mtx);
4170 		break;
4171 
4172 	case SIOCSIFCAP:
4173 		mtx_lock(&sc->driver_mtx);
4174 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4175 		if (mask & IFCAP_TXCSUM) {
4176 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4177 				mask &= ~IFCAP_TSO4;
4178 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4179 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4180 			} else {
4181 				ifp->if_capenable |= IFCAP_TXCSUM;
4182 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4183 			}
4184 		}
4185 		if (mask & IFCAP_RXCSUM) {
4186 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4187 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4188 			} else {
4189 				ifp->if_capenable |= IFCAP_RXCSUM;
4190 			}
4191 		}
4192 		if (mask & IFCAP_TSO4) {
4193 			if (IFCAP_TSO4 & ifp->if_capenable) {
4194 				ifp->if_capenable &= ~IFCAP_TSO4;
4195 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4196 				ifp->if_capenable |= IFCAP_TSO4;
4197 				ifp->if_hwassist |= CSUM_TSO;
4198 			} else {
4199 				printf("mxge requires tx checksum offload"
4200 				       " be enabled to use TSO\n");
4201 				err = EINVAL;
4202 			}
4203 		}
4204 #if IFCAP_TSO6
4205 		if (mask & IFCAP_TXCSUM_IPV6) {
4206 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4207 				mask &= ~IFCAP_TSO6;
4208 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4209 						       | IFCAP_TSO6);
4210 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4211 						      | CSUM_UDP);
4212 			} else {
4213 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4214 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4215 						     | CSUM_UDP_IPV6);
4216 			}
4217 		}
4218 		if (mask & IFCAP_RXCSUM_IPV6) {
4219 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4220 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4221 			} else {
4222 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4223 			}
4224 		}
4225 		if (mask & IFCAP_TSO6) {
4226 			if (IFCAP_TSO6 & ifp->if_capenable) {
4227 				ifp->if_capenable &= ~IFCAP_TSO6;
4228 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4229 				ifp->if_capenable |= IFCAP_TSO6;
4230 				ifp->if_hwassist |= CSUM_TSO;
4231 			} else {
4232 				printf("mxge requires tx checksum offload"
4233 				       " be enabled to use TSO\n");
4234 				err = EINVAL;
4235 			}
4236 		}
4237 #endif /*IFCAP_TSO6 */
4238 
4239 		if (mask & IFCAP_LRO)
4240 			ifp->if_capenable ^= IFCAP_LRO;
4241 		if (mask & IFCAP_VLAN_HWTAGGING)
4242 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4243 		if (mask & IFCAP_VLAN_HWTSO)
4244 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4245 
4246 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4247 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4248 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4249 
4250 		mtx_unlock(&sc->driver_mtx);
4251 		VLAN_CAPABILITIES(ifp);
4252 
4253 		break;
4254 
4255 	case SIOCGIFMEDIA:
4256 		mtx_lock(&sc->driver_mtx);
4257 		if (sc->dying) {
4258 			mtx_unlock(&sc->driver_mtx);
4259 			return (EINVAL);
4260 		}
4261 		mxge_media_probe(sc);
4262 		mtx_unlock(&sc->driver_mtx);
4263 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4264 				    &sc->media, command);
4265 		break;
4266 
4267 	case SIOCGI2C:
4268 		if (sc->connector != MXGE_XFP &&
4269 		    sc->connector != MXGE_SFP) {
4270 			err = ENXIO;
4271 			break;
4272 		}
4273 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4274 		if (err != 0)
4275 			break;
4276 		mtx_lock(&sc->driver_mtx);
4277 		if (sc->dying) {
4278 			mtx_unlock(&sc->driver_mtx);
4279 			return (EINVAL);
4280 		}
4281 		err = mxge_fetch_i2c(sc, &i2c);
4282 		mtx_unlock(&sc->driver_mtx);
4283 		if (err == 0)
4284 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4285 			    sizeof(i2c));
4286 		break;
4287 	default:
4288 		err = ether_ioctl(ifp, command, data);
4289 		break;
4290 	}
4291 	return err;
4292 }
4293 
4294 static void
4295 mxge_fetch_tunables(mxge_softc_t *sc)
4296 {
4297 
4298 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4299 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4300 			  &mxge_flow_control);
4301 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4302 			  &mxge_intr_coal_delay);
4303 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4304 			  &mxge_nvidia_ecrc_enable);
4305 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4306 			  &mxge_force_firmware);
4307 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4308 			  &mxge_deassert_wait);
4309 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4310 			  &mxge_verbose);
4311 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4312 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
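	/* both spellings of the RSS hash type tunable are accepted,
	   presumably for compatibility with older configurations */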
4313 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4314 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4315 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4316 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4317 
4318 	if (bootverbose)
4319 		mxge_verbose = 1;
4320 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4321 		mxge_intr_coal_delay = 30;
4322 	if (mxge_ticks == 0)
4323 		mxge_ticks = hz / 2;
4324 	sc->pause = mxge_flow_control;
4325 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4326 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4327 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4328 	}
4329 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4330 	    mxge_initial_mtu < ETHER_MIN_LEN)
4331 		mxge_initial_mtu = ETHERMTU_JUMBO;
4332 
4333 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4334 		mxge_throttle = MXGE_MAX_THROTTLE;
4335 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4336 		mxge_throttle = MXGE_MIN_THROTTLE;
4337 	sc->throttle = mxge_throttle;
4338 }
4339 
4340 static void
4341 mxge_free_slices(mxge_softc_t *sc)
4342 {
4343 	struct mxge_slice_state *ss;
4344 	int i;
4345 
4346 	if (sc->ss == NULL)
4347 		return;
4348 
4349 	for (i = 0; i < sc->num_slices; i++) {
4350 		ss = &sc->ss[i];
4351 		if (ss->fw_stats != NULL) {
4352 			mxge_dma_free(&ss->fw_stats_dma);
4353 			ss->fw_stats = NULL;
4354 #ifdef IFNET_BUF_RING
4355 			if (ss->tx.br != NULL) {
4356 				drbr_free(ss->tx.br, M_DEVBUF);
4357 				ss->tx.br = NULL;
4358 			}
4359 #endif
4360 			mtx_destroy(&ss->tx.mtx);
4361 		}
4362 		if (ss->rx_done.entry != NULL) {
4363 			mxge_dma_free(&ss->rx_done.dma);
4364 			ss->rx_done.entry = NULL;
4365 		}
4366 	}
4367 	free(sc->ss, M_DEVBUF);
4368 	sc->ss = NULL;
4369 }
4370 
4371 static int
4372 mxge_alloc_slices(mxge_softc_t *sc)
4373 {
4374 	mxge_cmd_t cmd;
4375 	struct mxge_slice_state *ss;
4376 	size_t bytes;
4377 	int err, i, max_intr_slots;
4378 
4379 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4380 	if (err != 0) {
4381 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4382 		return err;
4383 	}
4384 	sc->rx_ring_size = cmd.data0;
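	/* size the interrupt queue to hold a completion for every
	   receive descriptor in both the small and big rx rings */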
4385 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4386 
4387 	bytes = sizeof (*sc->ss) * sc->num_slices;
4388 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4389 	if (sc->ss == NULL)
4390 		return (ENOMEM);
4391 	for (i = 0; i < sc->num_slices; i++) {
4392 		ss = &sc->ss[i];
4393 
4394 		ss->sc = sc;
4395 
4396 		/* allocate per-slice rx interrupt queues */
4397 
4398 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4399 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4400 		if (err != 0)
4401 			goto abort;
4402 		ss->rx_done.entry = ss->rx_done.dma.addr;
4403 		bzero(ss->rx_done.entry, bytes);
4404 
4405 		/*
4406 		 * allocate the per-slice firmware stats; stats
4407 		 * (including tx) are used only on the first
4408 		 * slice for now
4409 		 */
4410 #ifndef IFNET_BUF_RING
4411 		if (i > 0)
4412 			continue;
4413 #endif
4414 
4415 		bytes = sizeof (*ss->fw_stats);
4416 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4417 				     sizeof (*ss->fw_stats), 64);
4418 		if (err != 0)
4419 			goto abort;
4420 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4421 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4422 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4423 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4424 #ifdef IFNET_BUF_RING
4425 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4426 					   &ss->tx.mtx);
4427 #endif
4428 	}
4429 
4430 	return (0);
4431 
4432 abort:
4433 	mxge_free_slices(sc);
4434 	return (ENOMEM);
4435 }
4436 
4437 static void
4438 mxge_slice_probe(mxge_softc_t *sc)
4439 {
4440 	mxge_cmd_t cmd;
4441 	char *old_fw;
4442 	int msix_cnt, status, max_intr_slots;
4443 
4444 	sc->num_slices = 1;
4445 	/*
4446 	 *  don't enable multiple slices if the tunable has disabled
4447 	 *  them, or if this is not an SMP system
4448 	 */
4449 
4450 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4451 		return;
4452 
4453 	/* see how many MSI-X interrupts are available */
4454 	msix_cnt = pci_msix_count(sc->dev);
4455 	if (msix_cnt < 2)
4456 		return;
4457 
4458 	/* now load the slice-aware firmware and see what it supports */
4459 	old_fw = sc->fw_name;
4460 	if (old_fw == mxge_fw_aligned)
4461 		sc->fw_name = mxge_fw_rss_aligned;
4462 	else
4463 		sc->fw_name = mxge_fw_rss_unaligned;
4464 	status = mxge_load_firmware(sc, 0);
4465 	if (status != 0) {
4466 		device_printf(sc->dev, "Falling back to a single slice\n");
4467 		return;
4468 	}
4469 
4470 	/* try to send a reset command to the card to see if it
4471 	   is alive */
4472 	memset(&cmd, 0, sizeof (cmd));
4473 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4474 	if (status != 0) {
4475 		device_printf(sc->dev, "failed reset\n");
4476 		goto abort_with_fw;
4477 	}
4478 
4479 	/* get rx ring size */
4480 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4481 	if (status != 0) {
4482 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4483 		goto abort_with_fw;
4484 	}
4485 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4486 
4487 	/* tell it the size of the interrupt queues */
4488 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4489 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4490 	if (status != 0) {
4491 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4492 		goto abort_with_fw;
4493 	}
4494 
4495 	/* ask the maximum number of slices it supports */
4496 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4497 	if (status != 0) {
4498 		device_printf(sc->dev,
4499 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4500 		goto abort_with_fw;
4501 	}
4502 	sc->num_slices = cmd.data0;
4503 	if (sc->num_slices > msix_cnt)
4504 		sc->num_slices = msix_cnt;
4505 
4506 	if (mxge_max_slices == -1) {
4507 		/* cap to number of CPUs in system */
4508 		if (sc->num_slices > mp_ncpus)
4509 			sc->num_slices = mp_ncpus;
4510 	} else {
4511 		if (sc->num_slices > mxge_max_slices)
4512 			sc->num_slices = mxge_max_slices;
4513 	}
4514 	/* round the slice count down to a power of two */
4515 	while (sc->num_slices & (sc->num_slices - 1))
4516 		sc->num_slices--;
4517 
4518 	if (mxge_verbose)
4519 		device_printf(sc->dev, "using %d slices\n",
4520 			      sc->num_slices);
4521 
4522 	return;
4523 
4524 abort_with_fw:
4525 	sc->fw_name = old_fw;
4526 	(void) mxge_load_firmware(sc, 0);
4527 }
4528 
4529 static int
4530 mxge_add_msix_irqs(mxge_softc_t *sc)
4531 {
4532 	size_t bytes;
4533 	int count, err, i, rid;
4534 
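	/* the MSI-X table lives in BAR2 on these NICs */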
4535 	rid = PCIR_BAR(2);
4536 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4537 						    &rid, RF_ACTIVE);
4538 
4539 	if (sc->msix_table_res == NULL) {
4540 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4541 		return ENXIO;
4542 	}
4543 
4544 	count = sc->num_slices;
4545 	err = pci_alloc_msix(sc->dev, &count);
4546 	if (err != 0) {
4547 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4548 			      "err = %d\n", sc->num_slices, err);
4549 		goto abort_with_msix_table;
4550 	}
4551 	if (count < sc->num_slices) {
4552 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4553 			      sc->num_slices, count);
4554 		device_printf(sc->dev,
4555 			      "Try setting hw.mxge.max_slices to %d\n",
4556 			      count);
4557 		err = ENOSPC;
4558 		goto abort_with_msix;
4559 	}
4560 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4561 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4562 	if (sc->msix_irq_res == NULL) {
4563 		err = ENOMEM;
4564 		goto abort_with_msix;
4565 	}
4566 
4567 	for (i = 0; i < sc->num_slices; i++) {
4568 		rid = i + 1;
4569 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4570 							  SYS_RES_IRQ,
4571 							  &rid, RF_ACTIVE);
4572 		if (sc->msix_irq_res[i] == NULL) {
4573 			device_printf(sc->dev, "couldn't allocate IRQ res"
4574 				      " for message %d\n", i);
4575 			err = ENXIO;
4576 			goto abort_with_res;
4577 		}
4578 	}
4579 
4580 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4581 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4582 
4583 	for (i = 0; i < sc->num_slices; i++) {
4584 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4585 				     INTR_TYPE_NET | INTR_MPSAFE,
4586 #if __FreeBSD_version > 700030
4587 				     NULL,
4588 #endif
4589 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4590 		if (err != 0) {
4591 			device_printf(sc->dev, "couldn't setup intr for "
4592 				      "message %d\n", i);
4593 			goto abort_with_intr;
4594 		}
4595 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4596 				  sc->msix_ih[i], "s%d", i);
4597 	}
4598 
4599 	if (mxge_verbose) {
4600 		device_printf(sc->dev, "using %d msix IRQs:",
4601 			      sc->num_slices);
4602 		for (i = 0; i < sc->num_slices; i++)
4603 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4604 		printf("\n");
4605 	}
4606 	return (0);
4607 
4608 abort_with_intr:
4609 	for (i = 0; i < sc->num_slices; i++) {
4610 		if (sc->msix_ih[i] != NULL) {
4611 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4612 					  sc->msix_ih[i]);
4613 			sc->msix_ih[i] = NULL;
4614 		}
4615 	}
4616 	free(sc->msix_ih, M_DEVBUF);
4617 
4618 abort_with_res:
4619 	for (i = 0; i < sc->num_slices; i++) {
4620 		rid = i + 1;
4621 		if (sc->msix_irq_res[i] != NULL)
4622 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4623 					     sc->msix_irq_res[i]);
4624 		sc->msix_irq_res[i] = NULL;
4625 	}
4626 	free(sc->msix_irq_res, M_DEVBUF);
4627 
4628 abort_with_msix:
4629 	pci_release_msi(sc->dev);
4630 
4631 abort_with_msix_table:
4632 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4633 			     sc->msix_table_res);
4634 
4635 	return err;
4636 }
4637 
4638 static int
4639 mxge_add_single_irq(mxge_softc_t *sc)
4640 {
4641 	int count, err, rid;
4642 
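	/* prefer a single MSI vector; fall back to a shared legacy
	   INTx interrupt if MSI cannot be allocated */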
4643 	count = pci_msi_count(sc->dev);
4644 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4645 		rid = 1;
4646 	} else {
4647 		rid = 0;
4648 		sc->legacy_irq = 1;
4649 	}
4650 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4651 					     RF_SHAREABLE | RF_ACTIVE);
4652 	if (sc->irq_res == NULL) {
4653 		device_printf(sc->dev, "could not alloc interrupt\n");
4654 		return ENXIO;
4655 	}
4656 	if (mxge_verbose)
4657 		device_printf(sc->dev, "using %s irq %jd\n",
4658 			      sc->legacy_irq ? "INTx" : "MSI",
4659 			      rman_get_start(sc->irq_res));
4660 	err = bus_setup_intr(sc->dev, sc->irq_res,
4661 			     INTR_TYPE_NET | INTR_MPSAFE,
4662 #if __FreeBSD_version > 700030
4663 			     NULL,
4664 #endif
4665 			     mxge_intr, &sc->ss[0], &sc->ih);
4666 	if (err != 0) {
4667 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4668 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4669 		if (!sc->legacy_irq)
4670 			pci_release_msi(sc->dev);
4671 	}
4672 	return err;
4673 }
4674 
4675 static void
4676 mxge_rem_msix_irqs(mxge_softc_t *sc)
4677 {
4678 	int i, rid;
4679 
4680 	for (i = 0; i < sc->num_slices; i++) {
4681 		if (sc->msix_ih[i] != NULL) {
4682 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4683 					  sc->msix_ih[i]);
4684 			sc->msix_ih[i] = NULL;
4685 		}
4686 	}
4687 	free(sc->msix_ih, M_DEVBUF);
4688 
4689 	for (i = 0; i < sc->num_slices; i++) {
4690 		rid = i + 1;
4691 		if (sc->msix_irq_res[i] != NULL)
4692 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4693 					     sc->msix_irq_res[i]);
4694 		sc->msix_irq_res[i] = NULL;
4695 	}
4696 	free(sc->msix_irq_res, M_DEVBUF);
4697 
4698 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4699 			     sc->msix_table_res);
4700 
4701 	pci_release_msi(sc->dev);
4702 	return;
4703 }
4704 
4705 static void
4706 mxge_rem_single_irq(mxge_softc_t *sc)
4707 {
4708 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4709 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4710 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4711 	if (!sc->legacy_irq)
4712 		pci_release_msi(sc->dev);
4713 }
4714 
4715 static void
4716 mxge_rem_irq(mxge_softc_t *sc)
4717 {
4718 	if (sc->num_slices > 1)
4719 		mxge_rem_msix_irqs(sc);
4720 	else
4721 		mxge_rem_single_irq(sc);
4722 }
4723 
4724 static int
4725 mxge_add_irq(mxge_softc_t *sc)
4726 {
4727 	int err;
4728 
4729 	if (sc->num_slices > 1)
4730 		err = mxge_add_msix_irqs(sc);
4731 	else
4732 		err = mxge_add_single_irq(sc);
4733 
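	/* note: this branch is intentionally disabled (0 &&); it looks
	   like a leftover aid for testing MSI-X teardown and re-add */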
4734 	if (0 && err == 0 && sc->num_slices > 1) {
4735 		mxge_rem_msix_irqs(sc);
4736 		err = mxge_add_msix_irqs(sc);
4737 	}
4738 	return err;
4739 }
4740 
4741 static int
4742 mxge_attach(device_t dev)
4743 {
4744 	mxge_cmd_t cmd;
4745 	mxge_softc_t *sc = device_get_softc(dev);
4746 	struct ifnet *ifp;
4747 	int err, rid;
4748 
4749 	sc->dev = dev;
4750 	mxge_fetch_tunables(sc);
4751 
4752 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4753 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4754 				  taskqueue_thread_enqueue, &sc->tq);
4755 	if (sc->tq == NULL) {
4756 		err = ENOMEM;
4757 		goto abort_with_nothing;
4758 	}
4759 
4760 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4761 				 1,			/* alignment */
4762 				 0,			/* boundary */
4763 				 BUS_SPACE_MAXADDR,	/* low */
4764 				 BUS_SPACE_MAXADDR,	/* high */
4765 				 NULL, NULL,		/* filter */
4766 				 65536 + 256,		/* maxsize */
4767 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4768 				 65536,			/* maxsegsize */
4769 				 0,			/* flags */
4770 				 NULL, NULL,		/* lock */
4771 				 &sc->parent_dmat);	/* tag */
4772 
4773 	if (err != 0) {
4774 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4775 			      err);
4776 		goto abort_with_tq;
4777 	}
4778 
4779 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4780 	if (ifp == NULL) {
4781 		device_printf(dev, "can not if_alloc()\n");
4782 		err = ENOSPC;
4783 		goto abort_with_parent_dmat;
4784 	}
4785 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4786 
4787 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4788 		 device_get_nameunit(dev));
4789 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4790 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4791 		 "%s:drv", device_get_nameunit(dev));
4792 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4793 		 MTX_NETWORK_LOCK, MTX_DEF);
4794 
4795 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4796 
4797 	mxge_setup_cfg_space(sc);
4798 
4799 	/* Map the board into the kernel */
4800 	rid = PCIR_BARS;
4801 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4802 					     RF_ACTIVE);
4803 	if (sc->mem_res == NULL) {
4804 		device_printf(dev, "could not map memory\n");
4805 		err = ENXIO;
4806 		goto abort_with_lock;
4807 	}
4808 	sc->sram = rman_get_virtual(sc->mem_res);
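	/* usable SRAM is the 2MB board total less space reserved for
	   firmware use (the exact carve-outs are implied only by the
	   constants below) */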
4809 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4810 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4811 		device_printf(dev, "impossible memory region size %jd\n",
4812 			      rman_get_size(sc->mem_res));
4813 		err = ENXIO;
4814 		goto abort_with_mem_res;
4815 	}
4816 
4817 	/* make a NUL-terminated copy of the EEPROM strings section of
4818 	   LANai SRAM */
4819 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4820 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4821 				rman_get_bushandle(sc->mem_res),
4822 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4823 				sc->eeprom_strings,
4824 				MXGE_EEPROM_STRINGS_SIZE - 2);
4825 	err = mxge_parse_strings(sc);
4826 	if (err != 0)
4827 		goto abort_with_mem_res;
4828 
4829 	/* Enable write combining for efficient use of PCIe bus */
4830 	mxge_enable_wc(sc);
4831 
4832 	/* Allocate the out of band dma memory */
4833 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4834 			     sizeof (mxge_cmd_t), 64);
4835 	if (err != 0)
4836 		goto abort_with_mem_res;
4837 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4838 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4839 	if (err != 0)
4840 		goto abort_with_cmd_dma;
4841 
4842 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4843 	if (err != 0)
4844 		goto abort_with_zeropad_dma;
4845 
4846 	/* select & load the firmware */
4847 	err = mxge_select_firmware(sc);
4848 	if (err != 0)
4849 		goto abort_with_dmabench;
4850 	sc->intr_coal_delay = mxge_intr_coal_delay;
4851 
4852 	mxge_slice_probe(sc);
4853 	err = mxge_alloc_slices(sc);
4854 	if (err != 0)
4855 		goto abort_with_dmabench;
4856 
4857 	err = mxge_reset(sc, 0);
4858 	if (err != 0)
4859 		goto abort_with_slices;
4860 
4861 	err = mxge_alloc_rings(sc);
4862 	if (err != 0) {
4863 		device_printf(sc->dev, "failed to allocate rings\n");
4864 		goto abort_with_slices;
4865 	}
4866 
4867 	err = mxge_add_irq(sc);
4868 	if (err != 0) {
4869 		device_printf(sc->dev, "failed to add irq\n");
4870 		goto abort_with_rings;
4871 	}
4872 
4873 	ifp->if_baudrate = IF_Gbps(10);
4874 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4875 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4876 		IFCAP_RXCSUM_IPV6;
4877 #if defined(INET) || defined(INET6)
4878 	ifp->if_capabilities |= IFCAP_LRO;
4879 #endif
4880 
4881 #ifdef MXGE_NEW_VLAN_API
4882 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4883 
4884 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4885 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4886 	    sc->fw_ver_tiny >= 32)
4887 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4888 #endif
4889 	sc->max_mtu = mxge_max_mtu(sc);
4890 	if (sc->max_mtu >= 9000)
4891 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4892 	else
4893 		device_printf(dev, "MTU limited to %d.  Install "
4894 			      "latest firmware for 9000 byte jumbo support\n",
4895 			      sc->max_mtu - ETHER_HDR_LEN);
4896 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4897 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4898 	/* check to see if f/w supports TSO for IPv6 */
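	/* (CSUM_TCP_IPV6 is a compile-time constant; the test below
	   merely confirms this kernel defines IPv6 checksum offload) */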
4899 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4900 		if (CSUM_TCP_IPV6)
4901 			ifp->if_capabilities |= IFCAP_TSO6;
4902 		sc->max_tso6_hlen = min(cmd.data0,
4903 					sizeof (sc->ss[0].scratch));
4904 	}
4905 	ifp->if_capenable = ifp->if_capabilities;
4906 	if (sc->lro_cnt == 0)
4907 		ifp->if_capenable &= ~IFCAP_LRO;
4908 	ifp->if_init = mxge_init;
4909 	ifp->if_softc = sc;
4910 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4911 	ifp->if_ioctl = mxge_ioctl;
4912 	ifp->if_start = mxge_start;
4913 	ifp->if_get_counter = mxge_get_counter;
4914 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4915 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4916 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4917 	/* Initialise the ifmedia structure */
4918 	ifmedia_init(&sc->media, 0, mxge_media_change,
4919 		     mxge_media_status);
4920 	mxge_media_init(sc);
4921 	mxge_media_probe(sc);
4922 	sc->dying = 0;
4923 	ether_ifattach(ifp, sc->mac_addr);
4924 	/* ether_ifattach sets mtu to ETHERMTU */
4925 	if (mxge_initial_mtu != ETHERMTU)
4926 		mxge_change_mtu(sc, mxge_initial_mtu);
4927 
4928 	mxge_add_sysctls(sc);
4929 #ifdef IFNET_BUF_RING
4930 	ifp->if_transmit = mxge_transmit;
4931 	ifp->if_qflush = mxge_qflush;
4932 #endif
4933 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4934 				device_get_nameunit(sc->dev));
4935 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4936 	return 0;
4937 
4938 abort_with_rings:
4939 	mxge_free_rings(sc);
4940 abort_with_slices:
4941 	mxge_free_slices(sc);
4942 abort_with_dmabench:
4943 	mxge_dma_free(&sc->dmabench_dma);
4944 abort_with_zeropad_dma:
4945 	mxge_dma_free(&sc->zeropad_dma);
4946 abort_with_cmd_dma:
4947 	mxge_dma_free(&sc->cmd_dma);
4948 abort_with_mem_res:
4949 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4950 abort_with_lock:
4951 	pci_disable_busmaster(dev);
4952 	mtx_destroy(&sc->cmd_mtx);
4953 	mtx_destroy(&sc->driver_mtx);
4954 	if_free(ifp);
4955 abort_with_parent_dmat:
4956 	bus_dma_tag_destroy(sc->parent_dmat);
4957 abort_with_tq:
4958 	if (sc->tq != NULL) {
4959 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4960 		taskqueue_free(sc->tq);
4961 		sc->tq = NULL;
4962 	}
4963 abort_with_nothing:
4964 	return err;
4965 }
4966 
4967 static int
4968 mxge_detach(device_t dev)
4969 {
4970 	mxge_softc_t *sc = device_get_softc(dev);
4971 
4972 	if (mxge_vlans_active(sc)) {
4973 		device_printf(sc->dev,
4974 			      "Detach vlans before removing module\n");
4975 		return EBUSY;
4976 	}
4977 	mtx_lock(&sc->driver_mtx);
4978 	sc->dying = 1;
4979 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4980 		mxge_close(sc, 0);
4981 	mtx_unlock(&sc->driver_mtx);
4982 	ether_ifdetach(sc->ifp);
4983 	if (sc->tq != NULL) {
4984 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4985 		taskqueue_free(sc->tq);
4986 		sc->tq = NULL;
4987 	}
4988 	callout_drain(&sc->co_hdl);
4989 	ifmedia_removeall(&sc->media);
4990 	mxge_dummy_rdma(sc, 0);
4991 	mxge_rem_sysctls(sc);
4992 	mxge_rem_irq(sc);
4993 	mxge_free_rings(sc);
4994 	mxge_free_slices(sc);
4995 	mxge_dma_free(&sc->dmabench_dma);
4996 	mxge_dma_free(&sc->zeropad_dma);
4997 	mxge_dma_free(&sc->cmd_dma);
4998 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4999 	pci_disable_busmaster(dev);
5000 	mtx_destroy(&sc->cmd_mtx);
5001 	mtx_destroy(&sc->driver_mtx);
5002 	if_free(sc->ifp);
5003 	bus_dma_tag_destroy(sc->parent_dmat);
5004 	return 0;
5005 }
5006 
5007 static int
5008 mxge_shutdown(device_t dev)
5009 {
5010 	return 0;
5011 }
5012 
5013 /*
5014   This file uses Myri10GE driver indentation.
5015 
5016   Local Variables:
5017   c-file-style:"linux"
5018   tab-width:8
5019   End:
5020 */
5021