xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 84dfba8d183d31e3412639ecb4b8ad4433cf7e80)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/if_arp.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56 
57 #include <net/bpf.h>
58 
59 #include <net/if_types.h>
60 #include <net/if_vlan_var.h>
61 #include <net/zlib.h>
62 
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip6.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_lro.h>
69 #include <netinet6/ip6_var.h>
70 
71 #include <machine/bus.h>
72 #include <machine/in_cksum.h>
73 #include <machine/resource.h>
74 #include <sys/bus.h>
75 #include <sys/rman.h>
76 #include <sys/smp.h>
77 
78 #include <dev/pci/pcireg.h>
79 #include <dev/pci/pcivar.h>
80 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
81 
82 #include <vm/vm.h>		/* for pmap_mapdev() */
83 #include <vm/pmap.h>
84 
85 #if defined(__i386) || defined(__amd64)
86 #include <machine/specialreg.h>
87 #endif
88 
89 #include <dev/mxge/mxge_mcp.h>
90 #include <dev/mxge/mcp_gen_header.h>
91 /*#define MXGE_FAKE_IFP*/
92 #include <dev/mxge/if_mxge_var.h>
93 #ifdef IFNET_BUF_RING
94 #include <sys/buf_ring.h>
95 #endif
96 
97 #include "opt_inet.h"
98 #include "opt_inet6.h"
99 
100 /* tunable params */
101 static int mxge_nvidia_ecrc_enable = 1;
102 static int mxge_force_firmware = 0;
103 static int mxge_intr_coal_delay = 30;
104 static int mxge_deassert_wait = 1;
105 static int mxge_flow_control = 1;
106 static int mxge_verbose = 0;
107 static int mxge_ticks;
108 static int mxge_max_slices = 1;
109 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
110 static int mxge_always_promisc = 0;
111 static int mxge_initial_mtu = ETHERMTU_JUMBO;
112 static int mxge_throttle = 0;
113 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
114 static char *mxge_fw_aligned = "mxge_eth_z8e";
115 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
116 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
117 
118 static int mxge_probe(device_t dev);
119 static int mxge_attach(device_t dev);
120 static int mxge_detach(device_t dev);
121 static int mxge_shutdown(device_t dev);
122 static void mxge_intr(void *arg);
123 
124 static device_method_t mxge_methods[] =
125 {
126   /* Device interface */
127   DEVMETHOD(device_probe, mxge_probe),
128   DEVMETHOD(device_attach, mxge_attach),
129   DEVMETHOD(device_detach, mxge_detach),
130   DEVMETHOD(device_shutdown, mxge_shutdown),
131 
132   DEVMETHOD_END
133 };
134 
135 static driver_t mxge_driver =
136 {
137   "mxge",
138   mxge_methods,
139   sizeof(mxge_softc_t),
140 };
141 
142 static devclass_t mxge_devclass;
143 
144 /* Declare ourselves to be a child of the PCI bus.*/
145 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
146 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
147 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
148 
149 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
150 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
151 static int mxge_close(mxge_softc_t *sc, int down);
152 static int mxge_open(mxge_softc_t *sc);
153 static void mxge_tick(void *arg);
154 
155 static int
156 mxge_probe(device_t dev)
157 {
158 	int rev;
159 
160 
161 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
162 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
163 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
164 		rev = pci_get_revid(dev);
165 		switch (rev) {
166 		case MXGE_PCI_REV_Z8E:
167 			device_set_desc(dev, "Myri10G-PCIE-8A");
168 			break;
169 		case MXGE_PCI_REV_Z8ES:
170 			device_set_desc(dev, "Myri10G-PCIE-8B");
171 			break;
172 		default:
173 			device_set_desc(dev, "Myri10G-PCIE-8??");
174 			device_printf(dev, "Unrecognized rev %d NIC\n",
175 				      rev);
176 			break;
177 		}
178 		return 0;
179 	}
180 	return ENXIO;
181 }
182 
/*
 * Mark the NIC SRAM aperture write-combining on x86/amd64 via
 * pmap_change_attr() so bursts of PIO writes can be merged.  Sets
 * sc->wc to 1 on success, 0 on failure; a no-op (sc->wc untouched)
 * on other architectures.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	/* Assume success; cleared below if the attribute change fails. */
	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}
201 
202 
203 /* callback to get our DMA address */
204 static void
205 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
206 			 int error)
207 {
208 	if (error == 0) {
209 		*(bus_addr_t *) arg = segs->ds_addr;
210 	}
211 }
212 
/*
 * Allocate a DMA-able buffer of 'bytes' bytes with the requested
 * alignment, map it, and record its bus address in dma->bus_addr.
 * Returns 0 on success or a bus_dma(9) errno; on failure all
 * partially created resources are torn down again.
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	/*
	 * A page-aligned allocation bigger than a page cannot satisfy
	 * a 4KB boundary, so lift the boundary restriction for that
	 * case; otherwise keep each segment within a 4KB region.
	 */
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; bus address comes back via the callback */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
272 
273 
/*
 * Release a buffer created by mxge_dma_alloc(): unload the map,
 * free the memory, then destroy the tag — strictly in that order.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
281 
282 /*
283  * The eeprom strings on the lanaiX have the format
284  * SN=x\0
285  * MAC=x:x:x:x:x:x\0
286  * PC=text\0
287  */
288 
/*
 * Parse the NUL-separated EEPROM strings (format documented above)
 * into sc->mac_addr, sc->product_code_string and
 * sc->serial_number_string.  Returns 0 when a MAC address was found,
 * ENXIO on a malformed MAC or when none is present.
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			/* six colon-separated 2-digit hex octets */
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			/* only honor SN if SN2 hasn't been seen yet */
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		/* advance past the terminating NUL to the next string */
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
339 
340 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (CK804/MCP55) PCIe
 * bridge so the completions it forwards are aligned.  The bridge's
 * extended config register at 0x178 is beyond the 0xff limit of the
 * normal config accessors here, so its memory-mapped config window
 * is mapped directly with pmap_mapdev() (see comments below).
 * Gated by the mxge_nvidia_ecrc_enable tunable; silently does
 * nothing for non-Nvidia or unrecognized bridges.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* the bridge of interest is the grandparent of the NIC */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* only Nvidia bridges get this treatment */
	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* fetch the bridge's bus/slot/function and IDs for the
	   window-offset computation and the sanity check below */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* offset of this device's config space inside the window */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	/* extended config register 0x178; bit 0x40 enables ECRC */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
464 #else
465 static void
466 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
467 {
468 	device_printf(sc->dev,
469 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
470 	return;
471 }
472 #endif
473 
474 
/*
 * Ask the firmware to benchmark read, write and read+write DMA and
 * record the resulting MB/s figures in sc->read_dma, sc->write_dma
 * and sc->read_write_dma.  With test_type ==
 * MXGEFW_CMD_UNALIGNED_TEST the firmware instead aborts on the first
 * unaligned PCIe completion it sees, which is how
 * mxge_firmware_probe() detects unaligned chipsets.  Returns the
 * status of the first failing sub-test, or 0.
 */
static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	/* read test: len * 0x10000 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	/* transfers * len * 2 bytes per 0.5us tick == MB/s */
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	/* write test: len * 0x1 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	/* read+write test: len * 0x10001; each transfer moves 2x len */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	/* failure of the unaligned probe is expected on some chipsets */
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
535 
536 /*
537  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
538  * when the PCI-E Completion packets are aligned on an 8-byte
539  * boundary.  Some PCI-E chip sets always align Completion packets; on
540  * the ones that do not, the alignment can be enforced by enabling
541  * ECRC generation (if supported).
542  *
543  * When PCI-E Completion packets are not aligned, it is actually more
544  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
545  *
546  * If the driver can neither enable ECRC nor verify that it has
547  * already been enabled, then it must use a firmware image which works
548  * around unaligned completion packets (ethp_z8e.dat), and it should
549  * also ensure that it never gives the device a Read-DMA which is
550  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
551  * enabled, then the driver should use the aligned (eth_z8e.dat)
552  * firmware image, and set tx_boundary to 4KB.
553  */
554 
555 static int
556 mxge_firmware_probe(mxge_softc_t *sc)
557 {
558 	device_t dev = sc->dev;
559 	int reg, status;
560 	uint16_t pectl;
561 
562 	sc->tx_boundary = 4096;
563 	/*
564 	 * Verify the max read request size was set to 4KB
565 	 * before trying the test with 4KB.
566 	 */
567 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
568 		pectl = pci_read_config(dev, reg + 0x8, 2);
569 		if ((pectl & (5 << 12)) != (5 << 12)) {
570 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
571 				      pectl);
572 			sc->tx_boundary = 2048;
573 		}
574 	}
575 
576 	/*
577 	 * load the optimized firmware (which assumes aligned PCIe
578 	 * completions) in order to see if it works on this host.
579 	 */
580 	sc->fw_name = mxge_fw_aligned;
581 	status = mxge_load_firmware(sc, 1);
582 	if (status != 0) {
583 		return status;
584 	}
585 
586 	/*
587 	 * Enable ECRC if possible
588 	 */
589 	mxge_enable_nvidia_ecrc(sc);
590 
591 	/*
592 	 * Run a DMA test which watches for unaligned completions and
593 	 * aborts on the first one seen.  Not required on Z8ES or newer.
594 	 */
595 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
596 		return 0;
597 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
598 	if (status == 0)
599 		return 0; /* keep the aligned firmware */
600 
601 	if (status != E2BIG)
602 		device_printf(dev, "DMA test failed: %d\n", status);
603 	if (status == ENOSYS)
604 		device_printf(dev, "Falling back to ethp! "
605 			      "Please install up to date fw\n");
606 	return status;
607 }
608 
609 static int
610 mxge_select_firmware(mxge_softc_t *sc)
611 {
612 	int aligned = 0;
613 	int force_firmware = mxge_force_firmware;
614 
615 	if (sc->throttle)
616 		force_firmware = sc->throttle;
617 
618 	if (force_firmware != 0) {
619 		if (force_firmware == 1)
620 			aligned = 1;
621 		else
622 			aligned = 0;
623 		if (mxge_verbose)
624 			device_printf(sc->dev,
625 				      "Assuming %s completions (forced)\n",
626 				      aligned ? "aligned" : "unaligned");
627 		goto abort;
628 	}
629 
630 	/* if the PCIe link width is 4 or less, we can use the aligned
631 	   firmware and skip any checks */
632 	if (sc->link_width != 0 && sc->link_width <= 4) {
633 		device_printf(sc->dev,
634 			      "PCIe x%d Link, expect reduced performance\n",
635 			      sc->link_width);
636 		aligned = 1;
637 		goto abort;
638 	}
639 
640 	if (0 == mxge_firmware_probe(sc))
641 		return 0;
642 
643 abort:
644 	if (aligned) {
645 		sc->fw_name = mxge_fw_aligned;
646 		sc->tx_boundary = 4096;
647 	} else {
648 		sc->fw_name = mxge_fw_unaligned;
649 		sc->tx_boundary = 2048;
650 	}
651 	return (mxge_load_firmware(sc, 0));
652 }
653 
654 static int
655 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
656 {
657 
658 
659 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
660 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
661 			      be32toh(hdr->mcp_type));
662 		return EIO;
663 	}
664 
665 	/* save firmware version for sysctl */
666 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
667 	if (mxge_verbose)
668 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
669 
670 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
671 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
672 
673 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
674 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
675 		device_printf(sc->dev, "Found firmware version %s\n",
676 			      sc->fw_version);
677 		device_printf(sc->dev, "Driver needs %d.%d\n",
678 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
679 		return EINVAL;
680 	}
681 	return 0;
682 
683 }
684 
685 static void *
686 z_alloc(void *nil, u_int items, u_int size)
687 {
688         void *ptr;
689 
690         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
691         return ptr;
692 }
693 
/* zlib deallocator shim matching z_alloc(). */
static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}
699 
700 
701 static int
702 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
703 {
704 	z_stream zs;
705 	char *inflate_buffer;
706 	const struct firmware *fw;
707 	const mcp_gen_header_t *hdr;
708 	unsigned hdr_offset;
709 	int status;
710 	unsigned int i;
711 	char dummy;
712 	size_t fw_len;
713 
714 	fw = firmware_get(sc->fw_name);
715 	if (fw == NULL) {
716 		device_printf(sc->dev, "Could not find firmware image %s\n",
717 			      sc->fw_name);
718 		return ENOENT;
719 	}
720 
721 
722 
723 	/* setup zlib and decompress f/w */
724 	bzero(&zs, sizeof (zs));
725 	zs.zalloc = z_alloc;
726 	zs.zfree = z_free;
727 	status = inflateInit(&zs);
728 	if (status != Z_OK) {
729 		status = EIO;
730 		goto abort_with_fw;
731 	}
732 
733 	/* the uncompressed size is stored as the firmware version,
734 	   which would otherwise go unused */
735 	fw_len = (size_t) fw->version;
736 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
737 	if (inflate_buffer == NULL)
738 		goto abort_with_zs;
739 	zs.avail_in = fw->datasize;
740 	zs.next_in = __DECONST(char *, fw->data);
741 	zs.avail_out = fw_len;
742 	zs.next_out = inflate_buffer;
743 	status = inflate(&zs, Z_FINISH);
744 	if (status != Z_STREAM_END) {
745 		device_printf(sc->dev, "zlib %d\n", status);
746 		status = EIO;
747 		goto abort_with_buffer;
748 	}
749 
750 	/* check id */
751 	hdr_offset = htobe32(*(const uint32_t *)
752 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
753 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
754 		device_printf(sc->dev, "Bad firmware file");
755 		status = EIO;
756 		goto abort_with_buffer;
757 	}
758 	hdr = (const void*)(inflate_buffer + hdr_offset);
759 
760 	status = mxge_validate_firmware(sc, hdr);
761 	if (status != 0)
762 		goto abort_with_buffer;
763 
764 	/* Copy the inflated firmware to NIC SRAM. */
765 	for (i = 0; i < fw_len; i += 256) {
766 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
767 			      inflate_buffer + i,
768 			      min(256U, (unsigned)(fw_len - i)));
769 		wmb();
770 		dummy = *sc->sram;
771 		wmb();
772 	}
773 
774 	*limit = fw_len;
775 	status = 0;
776 abort_with_buffer:
777 	free(inflate_buffer, M_TEMP);
778 abort_with_zs:
779 	inflateEnd(&zs);
780 abort_with_fw:
781 	firmware_put(fw, FIRMWARE_UNLOAD);
782 	return status;
783 }
784 
785 /*
786  * Enable or disable periodic RDMAs from the host to make certain
787  * chipsets resend dropped PCIe messages
788  */
789 
/*
 * Start (enable != 0) or stop the firmware's periodic dummy RDMA
 * reads (see block comment above).  Builds an 8-byte-aligned command
 * buffer on the stack, PIOs it to the boot-time dummy-RDMA mailbox,
 * and polls the confirmation word (the start of sc->cmd DMA memory)
 * for up to ~21ms waiting for the firmware to write 0xffffffff.
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align the command buffer to 8 bytes within buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll the confirmation word, 1ms per iteration, 20 tries */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
841 
842 static int
843 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
844 {
845 	mcp_cmd_t *buf;
846 	char buf_bytes[sizeof(*buf) + 8];
847 	volatile mcp_cmd_response_t *response = sc->cmd;
848 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
849 	uint32_t dma_low, dma_high;
850 	int err, sleep_total = 0;
851 
852 	/* ensure buf is aligned to 8 bytes */
853 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
854 
855 	buf->data0 = htobe32(data->data0);
856 	buf->data1 = htobe32(data->data1);
857 	buf->data2 = htobe32(data->data2);
858 	buf->cmd = htobe32(cmd);
859 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
860 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
861 
862 	buf->response_addr.low = htobe32(dma_low);
863 	buf->response_addr.high = htobe32(dma_high);
864 	mtx_lock(&sc->cmd_mtx);
865 	response->result = 0xffffffff;
866 	wmb();
867 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
868 
869 	/* wait up to 20ms */
870 	err = EAGAIN;
871 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
872 		bus_dmamap_sync(sc->cmd_dma.dmat,
873 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
874 		wmb();
875 		switch (be32toh(response->result)) {
876 		case 0:
877 			data->data0 = be32toh(response->data);
878 			err = 0;
879 			break;
880 		case 0xffffffff:
881 			DELAY(1000);
882 			break;
883 		case MXGEFW_CMD_UNKNOWN:
884 			err = ENOSYS;
885 			break;
886 		case MXGEFW_CMD_ERROR_UNALIGNED:
887 			err = E2BIG;
888 			break;
889 		case MXGEFW_CMD_ERROR_BUSY:
890 			err = EBUSY;
891 			break;
892 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
893 			err = ENXIO;
894 			break;
895 		default:
896 			device_printf(sc->dev,
897 				      "mxge: command %d "
898 				      "failed, result = %d\n",
899 				      cmd, be32toh(response->result));
900 			err = ENXIO;
901 			break;
902 		}
903 		if (err != EAGAIN)
904 			break;
905 	}
906 	if (err == EAGAIN)
907 		device_printf(sc->dev, "mxge: command %d timed out"
908 			      "result = %d\n",
909 			      cmd, be32toh(response->result));
910 	mtx_unlock(&sc->cmd_mtx);
911 	return err;
912 }
913 
/*
 * Validate (and adopt) the firmware already running on the NIC, used
 * when we cannot load our own image.  Copies the running image's
 * header out of SRAM and runs it through mxge_validate_firmware().
 * Also flags the known 1.4.4–1.4.11 rx-filter bug so the NIC can be
 * kept in ALLMULTI mode.  Returns the validation status.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	/* NOTE(review): htobe32 acts as the byte-swap here; on
	   little-endian hosts it is identical to the semantically
	   intended be32toh — confirm before changing */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
962 
963 
/*
 * Load (or, if that fails and 'adopt' is set, adopt) firmware on the
 * NIC.  After mxge_load_firmware_helper() has copied the image into
 * SRAM, hand off to it via the bootstrap MCP mailbox and poll the
 * confirmation word for up to ~200ms.  Adoption falls back to the
 * unaligned parameters (tx_boundary 2048).  Returns 0 or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align the handoff command buffer to 8 bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll the confirmation word: 10ms per iteration, 20 tries */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
1048 
1049 static int
1050 mxge_update_mac_address(mxge_softc_t *sc)
1051 {
1052 	mxge_cmd_t cmd;
1053 	uint8_t *addr = sc->mac_addr;
1054 	int status;
1055 
1056 
1057 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1058 		     | (addr[2] << 8) | addr[3]);
1059 
1060 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1061 
1062 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1063 	return status;
1064 }
1065 
1066 static int
1067 mxge_change_pause(mxge_softc_t *sc, int pause)
1068 {
1069 	mxge_cmd_t cmd;
1070 	int status;
1071 
1072 	if (pause)
1073 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1074 				       &cmd);
1075 	else
1076 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1077 				       &cmd);
1078 
1079 	if (status) {
1080 		device_printf(sc->dev, "Failed to set flow control mode\n");
1081 		return ENXIO;
1082 	}
1083 	sc->pause = pause;
1084 	return 0;
1085 }
1086 
1087 static void
1088 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1089 {
1090 	mxge_cmd_t cmd;
1091 	int status;
1092 
1093 	if (mxge_always_promisc)
1094 		promisc = 1;
1095 
1096 	if (promisc)
1097 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1098 				       &cmd);
1099 	else
1100 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1101 				       &cmd);
1102 
1103 	if (status) {
1104 		device_printf(sc->dev, "Failed to set promisc mode\n");
1105 	}
1106 }
1107 
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	/* Firmware with the adopted rx filter bug stays in allmulti mode. */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* Split the 6-byte link-level address across data0 (first
		   4 bytes) and data1 (last 2 bytes), converted to network
		   byte order for the firmware. */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		/* NOTE(review): only 2 bytes are copied into cmd.data1, so
		   its upper bytes keep whatever stack garbage was there —
		   presumably the firmware ignores them; verify. */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
1175 
1176 static int
1177 mxge_max_mtu(mxge_softc_t *sc)
1178 {
1179 	mxge_cmd_t cmd;
1180 	int status;
1181 
1182 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1183 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1184 
1185 	/* try to set nbufs to see if it we can
1186 	   use virtually contiguous jumbos */
1187 	cmd.data0 = 0;
1188 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1189 			       &cmd);
1190 	if (status == 0)
1191 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1192 
1193 	/* otherwise, we're limited to MJUMPAGESIZE */
1194 	return MJUMPAGESIZE - MXGEFW_PAD;
1195 }
1196 
/*
 * Reset the NIC firmware and re-establish driver/firmware shared state:
 * interrupt queue DMA addresses, coalescing/ack/deassert offsets, per-slice
 * counters, MAC address, promiscuous/pause/multicast settings, and the
 * optional transmit throttle.  Returns 0 on success or a firmware/driver
 * error status.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	/* NOTE(review): this status is not checked here, and it is
	   overwritten below when num_slices > 1 — an error from
	   SET_INTRQ_SIZE can be silently lost; confirm intent. */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		/* Give the firmware the bus address of each slice's
		   interrupt (rx_done) queue. */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	/* Fetch the SRAM offsets of the coalescing delay, irq ack and
	   irq deassert registers; errors are OR-ed into status and
	   checked once below. */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	/* Reset the per-slice driver/firmware shared counters to match
	   the just-reset firmware state. */
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	/* Re-apply the settings the reset wiped out. */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}
1340 
1341 static int
1342 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1343 {
1344 	mxge_cmd_t cmd;
1345 	mxge_softc_t *sc;
1346 	int err;
1347 	unsigned int throttle;
1348 
1349 	sc = arg1;
1350 	throttle = sc->throttle;
1351 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1352         if (err != 0) {
1353                 return err;
1354         }
1355 
1356 	if (throttle == sc->throttle)
1357 		return 0;
1358 
1359         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1360                 return EINVAL;
1361 
1362 	mtx_lock(&sc->driver_mtx);
1363 	cmd.data0 = throttle;
1364 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1365 	if (err == 0)
1366 		sc->throttle = throttle;
1367 	mtx_unlock(&sc->driver_mtx);
1368 	return err;
1369 }
1370 
1371 static int
1372 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1373 {
1374         mxge_softc_t *sc;
1375         unsigned int intr_coal_delay;
1376         int err;
1377 
1378         sc = arg1;
1379         intr_coal_delay = sc->intr_coal_delay;
1380         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1381         if (err != 0) {
1382                 return err;
1383         }
1384         if (intr_coal_delay == sc->intr_coal_delay)
1385                 return 0;
1386 
1387         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1388                 return EINVAL;
1389 
1390 	mtx_lock(&sc->driver_mtx);
1391 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1392 	sc->intr_coal_delay = intr_coal_delay;
1393 
1394 	mtx_unlock(&sc->driver_mtx);
1395         return err;
1396 }
1397 
1398 static int
1399 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1400 {
1401         mxge_softc_t *sc;
1402         unsigned int enabled;
1403         int err;
1404 
1405         sc = arg1;
1406         enabled = sc->pause;
1407         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1408         if (err != 0) {
1409                 return err;
1410         }
1411         if (enabled == sc->pause)
1412                 return 0;
1413 
1414 	mtx_lock(&sc->driver_mtx);
1415 	err = mxge_change_pause(sc, enabled);
1416 	mtx_unlock(&sc->driver_mtx);
1417         return err;
1418 }
1419 
1420 static int
1421 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1422 {
1423         int err;
1424 
1425         if (arg1 == NULL)
1426                 return EFAULT;
1427         arg2 = be32toh(*(int *)arg1);
1428         arg1 = NULL;
1429         err = sysctl_handle_int(oidp, arg1, arg2, req);
1430 
1431         return err;
1432 }
1433 
1434 static void
1435 mxge_rem_sysctls(mxge_softc_t *sc)
1436 {
1437 	struct mxge_slice_state *ss;
1438 	int slice;
1439 
1440 	if (sc->slice_sysctl_tree == NULL)
1441 		return;
1442 
1443 	for (slice = 0; slice < sc->num_slices; slice++) {
1444 		ss = &sc->ss[slice];
1445 		if (ss == NULL || ss->sysctl_tree == NULL)
1446 			continue;
1447 		sysctl_ctx_free(&ss->sysctl_ctx);
1448 		ss->sysctl_tree = NULL;
1449 	}
1450 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1451 	sc->slice_sysctl_tree = NULL;
1452 }
1453 
1454 static void
1455 mxge_add_sysctls(mxge_softc_t *sc)
1456 {
1457 	struct sysctl_ctx_list *ctx;
1458 	struct sysctl_oid_list *children;
1459 	mcp_irq_data_t *fw;
1460 	struct mxge_slice_state *ss;
1461 	int slice;
1462 	char slice_num[8];
1463 
1464 	ctx = device_get_sysctl_ctx(sc->dev);
1465 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1466 	fw = sc->ss[0].fw_stats;
1467 
1468 	/* random information */
1469 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1470 		       "firmware_version",
1471 		       CTLFLAG_RD, &sc->fw_version,
1472 		       0, "firmware version");
1473 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1474 		       "serial_number",
1475 		       CTLFLAG_RD, &sc->serial_number_string,
1476 		       0, "serial number");
1477 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1478 		       "product_code",
1479 		       CTLFLAG_RD, &sc->product_code_string,
1480 		       0, "product_code");
1481 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1482 		       "pcie_link_width",
1483 		       CTLFLAG_RD, &sc->link_width,
1484 		       0, "tx_boundary");
1485 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486 		       "tx_boundary",
1487 		       CTLFLAG_RD, &sc->tx_boundary,
1488 		       0, "tx_boundary");
1489 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490 		       "write_combine",
1491 		       CTLFLAG_RD, &sc->wc,
1492 		       0, "write combining PIO?");
1493 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1494 		       "read_dma_MBs",
1495 		       CTLFLAG_RD, &sc->read_dma,
1496 		       0, "DMA Read speed in MB/s");
1497 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1498 		       "write_dma_MBs",
1499 		       CTLFLAG_RD, &sc->write_dma,
1500 		       0, "DMA Write speed in MB/s");
1501 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1502 		       "read_write_dma_MBs",
1503 		       CTLFLAG_RD, &sc->read_write_dma,
1504 		       0, "DMA concurrent Read/Write speed in MB/s");
1505 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1506 		       "watchdog_resets",
1507 		       CTLFLAG_RD, &sc->watchdog_resets,
1508 		       0, "Number of times NIC was reset");
1509 
1510 
1511 	/* performance related tunables */
1512 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513 			"intr_coal_delay",
1514 			CTLTYPE_INT|CTLFLAG_RW, sc,
1515 			0, mxge_change_intr_coal,
1516 			"I", "interrupt coalescing delay in usecs");
1517 
1518 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1519 			"throttle",
1520 			CTLTYPE_INT|CTLFLAG_RW, sc,
1521 			0, mxge_change_throttle,
1522 			"I", "transmit throttling");
1523 
1524 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525 			"flow_control_enabled",
1526 			CTLTYPE_INT|CTLFLAG_RW, sc,
1527 			0, mxge_change_flow_control,
1528 			"I", "interrupt coalescing delay in usecs");
1529 
1530 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 		       "deassert_wait",
1532 		       CTLFLAG_RW, &mxge_deassert_wait,
1533 		       0, "Wait for IRQ line to go low in ihandler");
1534 
1535 	/* stats block from firmware is in network byte order.
1536 	   Need to swap it */
1537 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 			"link_up",
1539 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1540 			0, mxge_handle_be32,
1541 			"I", "link up");
1542 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 			"rdma_tags_available",
1544 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1545 			0, mxge_handle_be32,
1546 			"I", "rdma_tags_available");
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"dropped_bad_crc32",
1549 			CTLTYPE_INT|CTLFLAG_RD,
1550 			&fw->dropped_bad_crc32,
1551 			0, mxge_handle_be32,
1552 			"I", "dropped_bad_crc32");
1553 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 			"dropped_bad_phy",
1555 			CTLTYPE_INT|CTLFLAG_RD,
1556 			&fw->dropped_bad_phy,
1557 			0, mxge_handle_be32,
1558 			"I", "dropped_bad_phy");
1559 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 			"dropped_link_error_or_filtered",
1561 			CTLTYPE_INT|CTLFLAG_RD,
1562 			&fw->dropped_link_error_or_filtered,
1563 			0, mxge_handle_be32,
1564 			"I", "dropped_link_error_or_filtered");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 			"dropped_link_overflow",
1567 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1568 			0, mxge_handle_be32,
1569 			"I", "dropped_link_overflow");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 			"dropped_multicast_filtered",
1572 			CTLTYPE_INT|CTLFLAG_RD,
1573 			&fw->dropped_multicast_filtered,
1574 			0, mxge_handle_be32,
1575 			"I", "dropped_multicast_filtered");
1576 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 			"dropped_no_big_buffer",
1578 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1579 			0, mxge_handle_be32,
1580 			"I", "dropped_no_big_buffer");
1581 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 			"dropped_no_small_buffer",
1583 			CTLTYPE_INT|CTLFLAG_RD,
1584 			&fw->dropped_no_small_buffer,
1585 			0, mxge_handle_be32,
1586 			"I", "dropped_no_small_buffer");
1587 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 			"dropped_overrun",
1589 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1590 			0, mxge_handle_be32,
1591 			"I", "dropped_overrun");
1592 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593 			"dropped_pause",
1594 			CTLTYPE_INT|CTLFLAG_RD,
1595 			&fw->dropped_pause,
1596 			0, mxge_handle_be32,
1597 			"I", "dropped_pause");
1598 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599 			"dropped_runt",
1600 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1601 			0, mxge_handle_be32,
1602 			"I", "dropped_runt");
1603 
1604 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 			"dropped_unicast_filtered",
1606 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1607 			0, mxge_handle_be32,
1608 			"I", "dropped_unicast_filtered");
1609 
1610 	/* verbose printing? */
1611 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612 		       "verbose",
1613 		       CTLFLAG_RW, &mxge_verbose,
1614 		       0, "verbose printing");
1615 
1616 	/* add counters exported for debugging from all slices */
1617 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1618 	sc->slice_sysctl_tree =
1619 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1620 				"slice", CTLFLAG_RD, 0, "");
1621 
1622 	for (slice = 0; slice < sc->num_slices; slice++) {
1623 		ss = &sc->ss[slice];
1624 		sysctl_ctx_init(&ss->sysctl_ctx);
1625 		ctx = &ss->sysctl_ctx;
1626 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1627 		sprintf(slice_num, "%d", slice);
1628 		ss->sysctl_tree =
1629 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1630 					CTLFLAG_RD, 0, "");
1631 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1632 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 			       "rx_small_cnt",
1634 			       CTLFLAG_RD, &ss->rx_small.cnt,
1635 			       0, "rx_small_cnt");
1636 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 			       "rx_big_cnt",
1638 			       CTLFLAG_RD, &ss->rx_big.cnt,
1639 			       0, "rx_small_cnt");
1640 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1641 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1642 			       0, "number of lro merge queues flushed");
1643 
1644 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1645 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1646 			       0, "number of bad csums preventing LRO");
1647 
1648 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1649 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1650 			       0, "number of frames appended to lro merge"
1651 			       "queues");
1652 
1653 #ifndef IFNET_BUF_RING
1654 		/* only transmit from slice 0 for now */
1655 		if (slice > 0)
1656 			continue;
1657 #endif
1658 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 			       "tx_req",
1660 			       CTLFLAG_RD, &ss->tx.req,
1661 			       0, "tx_req");
1662 
1663 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 			       "tx_done",
1665 			       CTLFLAG_RD, &ss->tx.done,
1666 			       0, "tx_done");
1667 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 			       "tx_pkt_done",
1669 			       CTLFLAG_RD, &ss->tx.pkt_done,
1670 			       0, "tx_done");
1671 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 			       "tx_stall",
1673 			       CTLFLAG_RD, &ss->tx.stall,
1674 			       0, "tx_stall");
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "tx_wake",
1677 			       CTLFLAG_RD, &ss->tx.wake,
1678 			       0, "tx_wake");
1679 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680 			       "tx_defrag",
1681 			       CTLFLAG_RD, &ss->tx.defrag,
1682 			       0, "tx_defrag");
1683 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 			       "tx_queue_active",
1685 			       CTLFLAG_RD, &ss->tx.queue_active,
1686 			       0, "tx_queue_active");
1687 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688 			       "tx_activate",
1689 			       CTLFLAG_RD, &ss->tx.activate,
1690 			       0, "tx_activate");
1691 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692 			       "tx_deactivate",
1693 			       CTLFLAG_RD, &ss->tx.deactivate,
1694 			       0, "tx_deactivate");
1695 	}
1696 }
1697 
1698 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1699    backwards one at a time and handle ring wraps */
1700 
1701 static inline void
1702 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1703 			    mcp_kreq_ether_send_t *src, int cnt)
1704 {
1705         int idx, starting_slot;
1706         starting_slot = tx->req;
1707         while (cnt > 1) {
1708                 cnt--;
1709                 idx = (starting_slot + cnt) & tx->mask;
1710                 mxge_pio_copy(&tx->lanai[idx],
1711 			      &src[cnt], sizeof(*src));
1712                 wmb();
1713         }
1714 }
1715 
1716 /*
1717  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1718  * at most 32 bytes at a time, so as to avoid involving the software
1719  * pio handler in the nic.   We re-write the first segment's flags
1720  * to mark them valid only after writing the entire chain
1721  */
1722 
1723 static inline void
1724 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1725                   int cnt)
1726 {
1727         int idx, i;
1728         uint32_t *src_ints;
1729 	volatile uint32_t *dst_ints;
1730         mcp_kreq_ether_send_t *srcp;
1731 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1732 	uint8_t last_flags;
1733 
1734         idx = tx->req & tx->mask;
1735 
1736 	last_flags = src->flags;
1737 	src->flags = 0;
1738         wmb();
1739         dst = dstp = &tx->lanai[idx];
1740         srcp = src;
1741 
1742         if ((idx + cnt) < tx->mask) {
1743                 for (i = 0; i < (cnt - 1); i += 2) {
1744                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1745                         wmb(); /* force write every 32 bytes */
1746                         srcp += 2;
1747                         dstp += 2;
1748                 }
1749         } else {
1750                 /* submit all but the first request, and ensure
1751                    that it is submitted below */
1752                 mxge_submit_req_backwards(tx, src, cnt);
1753                 i = 0;
1754         }
1755         if (i < cnt) {
1756                 /* submit the first request */
1757                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1758                 wmb(); /* barrier before setting valid flag */
1759         }
1760 
1761         /* re-write the last 32-bits with the valid flags */
1762         src->flags = last_flags;
1763         src_ints = (uint32_t *)src;
1764         src_ints+=3;
1765         dst_ints = (volatile uint32_t *)dst;
1766         dst_ints+=3;
1767         *dst_ints =  *src_ints;
1768         tx->req += cnt;
1769         wmb();
1770 }
1771 
/*
 * Parse the Ethernet/IP/TCP headers of an outgoing frame and fill in
 * *pi (header offsets, ip/ip6/tcp pointers).  Headers that straddle
 * the first mbuf are copied into ss->scratch so they can be read
 * contiguously.  Returns 0 on success or EINVAL for unsupported
 * ethertypes / non-TCP/UDP IPv6 payloads.
 */
static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	/* Account for an 802.1Q tag when locating the IP header. */
	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->ip_hlen = pi->ip->ip_hl << 2;
		/* For plain checksum offload the TCP header isn't needed. */
		if (!tso)
			return 0;

		/* TSO: make the TCP header contiguous as well. */
		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		/* Walk any extension headers to find the upper-layer
		   protocol; ip_hlen ends up covering all of them. */
		nxt = 0;
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}
1851 
1852 #if IFCAP_TSO4
1853 
/*
 * Build and submit the firmware send-request chain for a TSO packet.
 * Splits the busdma segments at MSS boundaries so the firmware can
 * emit one frame per segment, and retroactively fills in rdma_count
 * for each sub-chain.  On overflow of tx->max_desc the mbuf is
 * dropped and the dmamap unloaded.
 */
static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;	/* rate-limits the overflow warning below */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));

	/* TSO implies checksum offload on this hardware */
	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
		/*
		 * If packet has full TCP csum, replace it with pseudo hdr
		 * sum that the NIC expects, otherwise the NIC will emit
		 * packets with bad TCP checksums.
		 */
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (pi->ip6) {
#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
			sum = in6_cksum_pseudo(pi->ip6,
			    m->m_pkthdr.len - cksum_offset,
			    IPPROTO_TCP, 0);
#endif
		} else {
#ifdef INET
			m->m_pkthdr.csum_flags |= CSUM_TCP;
			sum = in_pseudo(pi->ip->ip_src.s_addr,
			    pi->ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
				    cksum_offset)));
#endif
		}
		m_copyback(m, offsetof(struct tcphdr, th_sum) +
		    cksum_offset, sizeof(sum), (caddr_t)&sum);
	}
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	if (pi->ip6) {
		/*
		 * for IPv6 TSO, the "checksum offset" is re-purposed
		 * to store the TCP header len
		 */
		cksum_offset = (pi->tcp->th_off << 2);
	}

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* retroactively fix up the rdma_count of the
			   request that started the current sub-chain */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				/* cut the request at an MSS boundary;
				   the next request starts a new frame */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* branch-free reset of rdma_count to -1
				   on a cut (incremented back below) */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* for IPv4 the checksum offset only applies while
			   still inside the header bytes */
			if (cksum_offset != 0 && !pi->ip6) {
				if (__predict_false(cksum_offset > seglen))
					cksum_offset -= seglen;
				else
					cksum_offset = 0;
			}
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* fix up the rdma_count of the final sub-chain */
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards marking TSO_LAST until the start of the
	   final frame (a CHOP or FIRST request) is reached */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	ss->oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}
2037 
2038 #endif /* IFCAP_TSO4 */
2039 
2040 #ifdef MXGE_NEW_VLAN_API
2041 /*
2042  * We reproduce the software vlan tag insertion from
2043  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2044  * vlan tag insertion. We need to advertise this in order to have the
2045  * vlan interface respect our csum offload flags.
2046  */
2047 static struct mbuf *
2048 mxge_vlan_tag_insert(struct mbuf *m)
2049 {
2050 	struct ether_vlan_header *evl;
2051 
2052 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2053 	if (__predict_false(m == NULL))
2054 		return NULL;
2055 	if (m->m_len < sizeof(*evl)) {
2056 		m = m_pullup(m, sizeof(*evl));
2057 		if (__predict_false(m == NULL))
2058 			return NULL;
2059 	}
2060 	/*
2061 	 * Transform the Ethernet header into an Ethernet header
2062 	 * with 802.1Q encapsulation.
2063 	 */
2064 	evl = mtod(m, struct ether_vlan_header *);
2065 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2066 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2067 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2068 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2069 	m->m_flags &= ~M_VLANTAG;
2070 	return m;
2071 }
2072 #endif /* MXGE_NEW_VLAN_API */
2073 
/*
 * Encapsulate one outbound mbuf chain into the slice's transmit ring
 * and hand it to the NIC firmware.  Handles software VLAN tag
 * insertion, checksum-offload setup, DMA mapping (with a single
 * m_defrag() retry on EFBIG), dispatch to the TSO path, and
 * zero-padding of runt frames.  On failure the mbuf is freed and
 * ss->oerrors is incremented.  Called with the tx ring lock held.
 */
static void
mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
{
	struct mxge_pkt_info pi = {0,0,0,0};
	mxge_softc_t *sc;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;


	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

#ifdef MXGE_NEW_VLAN_API
	/* insert the tag in-band so csum offload flags stay valid */
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop_without_m;
	}
#endif
	/* locate IP/transport headers if any offload was requested */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		if (mxge_parse_tx(ss, m, &pi))
			goto drop;
	}

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		ss->tx.defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;

#if IFCAP_TSO4
	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(ss, m, cnt, &pi);
		return;
	}
#endif

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		cksum_offset = pi.ip_off + pi.ip_hlen;
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* once the offset is consumed, later segments carry 0 */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	/* the first descriptor carries the total descriptor count */
	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* mark the last descriptor of this packet for mxge_tx_done() */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	m_freem(m);
drop_without_m:
	ss->oerrors++;
	return;
}
2239 
2240 #ifdef IFNET_BUF_RING
2241 static void
2242 mxge_qflush(struct ifnet *ifp)
2243 {
2244 	mxge_softc_t *sc = ifp->if_softc;
2245 	mxge_tx_ring_t *tx;
2246 	struct mbuf *m;
2247 	int slice;
2248 
2249 	for (slice = 0; slice < sc->num_slices; slice++) {
2250 		tx = &sc->ss[slice].tx;
2251 		mtx_lock(&tx->mtx);
2252 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253 			m_freem(m);
2254 		mtx_unlock(&tx->mtx);
2255 	}
2256 	if_qflush(ifp);
2257 }
2258 
2259 static inline void
2260 mxge_start_locked(struct mxge_slice_state *ss)
2261 {
2262 	mxge_softc_t *sc;
2263 	struct mbuf *m;
2264 	struct ifnet *ifp;
2265 	mxge_tx_ring_t *tx;
2266 
2267 	sc = ss->sc;
2268 	ifp = sc->ifp;
2269 	tx = &ss->tx;
2270 
2271 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272 		m = drbr_dequeue(ifp, tx->br);
2273 		if (m == NULL) {
2274 			return;
2275 		}
2276 		/* let BPF see it */
2277 		BPF_MTAP(ifp, m);
2278 
2279 		/* give it to the nic */
2280 		mxge_encap(ss, m);
2281 	}
2282 	/* ran out of transmit slots */
2283 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284 	    && (!drbr_empty(ifp, tx->br))) {
2285 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286 		tx->stall++;
2287 	}
2288 }
2289 
2290 static int
2291 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292 {
2293 	mxge_softc_t *sc;
2294 	struct ifnet *ifp;
2295 	mxge_tx_ring_t *tx;
2296 	int err;
2297 
2298 	sc = ss->sc;
2299 	ifp = sc->ifp;
2300 	tx = &ss->tx;
2301 
2302 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303 	    IFF_DRV_RUNNING) {
2304 		err = drbr_enqueue(ifp, tx->br, m);
2305 		return (err);
2306 	}
2307 
2308 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2309 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310 		/* let BPF see it */
2311 		BPF_MTAP(ifp, m);
2312 		/* give it to the nic */
2313 		mxge_encap(ss, m);
2314 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315 		return (err);
2316 	}
2317 	if (!drbr_empty(ifp, tx->br))
2318 		mxge_start_locked(ss);
2319 	return (0);
2320 }
2321 
2322 static int
2323 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324 {
2325 	mxge_softc_t *sc = ifp->if_softc;
2326 	struct mxge_slice_state *ss;
2327 	mxge_tx_ring_t *tx;
2328 	int err = 0;
2329 	int slice;
2330 
2331 	slice = m->m_pkthdr.flowid;
2332 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333 
2334 	ss = &sc->ss[slice];
2335 	tx = &ss->tx;
2336 
2337 	if (mtx_trylock(&tx->mtx)) {
2338 		err = mxge_transmit_locked(ss, m);
2339 		mtx_unlock(&tx->mtx);
2340 	} else {
2341 		err = drbr_enqueue(ifp, tx->br, m);
2342 	}
2343 
2344 	return (err);
2345 }
2346 
2347 #else
2348 
2349 static inline void
2350 mxge_start_locked(struct mxge_slice_state *ss)
2351 {
2352 	mxge_softc_t *sc;
2353 	struct mbuf *m;
2354 	struct ifnet *ifp;
2355 	mxge_tx_ring_t *tx;
2356 
2357 	sc = ss->sc;
2358 	ifp = sc->ifp;
2359 	tx = &ss->tx;
2360 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362 		if (m == NULL) {
2363 			return;
2364 		}
2365 		/* let BPF see it */
2366 		BPF_MTAP(ifp, m);
2367 
2368 		/* give it to the nic */
2369 		mxge_encap(ss, m);
2370 	}
2371 	/* ran out of transmit slots */
2372 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374 		tx->stall++;
2375 	}
2376 }
2377 #endif
2378 static void
2379 mxge_start(struct ifnet *ifp)
2380 {
2381 	mxge_softc_t *sc = ifp->if_softc;
2382 	struct mxge_slice_state *ss;
2383 
2384 	/* only use the first slice for now */
2385 	ss = &sc->ss[0];
2386 	mtx_lock(&ss->tx.mtx);
2387 	mxge_start_locked(ss);
2388 	mtx_unlock(&ss->tx.mtx);
2389 }
2390 
2391 /*
2392  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2393  * at most 32 bytes at a time, so as to avoid involving the software
2394  * pio handler in the nic.   We re-write the first segment's low
2395  * DMA address to mark it valid only after we write the entire chunk
2396  * in a burst
2397  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* Stash the first descriptor's low address and poison it so
	   the NIC ignores the chunk until it is fully written. */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* copy the 8 descriptors in two 32-byte bursts, ordered by
	   write barriers so the NIC never sees a partial burst */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the real address in the shadow copy, then write it
	   to the NIC -- this final store validates the whole chunk */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2414 
2415 static int
2416 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2417 {
2418 	bus_dma_segment_t seg;
2419 	struct mbuf *m;
2420 	mxge_rx_ring_t *rx = &ss->rx_small;
2421 	int cnt, err;
2422 
2423 	m = m_gethdr(M_NOWAIT, MT_DATA);
2424 	if (m == NULL) {
2425 		rx->alloc_fail++;
2426 		err = ENOBUFS;
2427 		goto done;
2428 	}
2429 	m->m_len = MHLEN;
2430 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2431 				      &seg, &cnt, BUS_DMA_NOWAIT);
2432 	if (err != 0) {
2433 		m_free(m);
2434 		goto done;
2435 	}
2436 	rx->info[idx].m = m;
2437 	rx->shadow[idx].addr_low =
2438 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2439 	rx->shadow[idx].addr_high =
2440 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2441 
2442 done:
2443 	if ((idx & 7) == 7)
2444 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2445 	return err;
2446 }
2447 
/*
 * Refill a "big" (cluster) receive slot.  A single cluster may map to
 * several DMA segments when MXGE_VIRT_JUMBOS is set, in which case the
 * extra segments fill the following shadow slots.  As with the small
 * ring, completed groups of 8 descriptors are pushed to the NIC even
 * when allocation fails (the shadow retains the old, valid buffer).
 */
static int
mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg[3];
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_big;
	int cnt, err, i;

	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = rx->mlen;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	/* first segment goes in this slot's shadow descriptor */
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));

#if MXGE_VIRT_JUMBOS
	/* remaining segments occupy the following slots */
	for (i = 1; i < cnt; i++) {
		rx->shadow[idx + i].addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
		rx->shadow[idx + i].addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
       }
#endif

done:
       /* submit any group of 8 completed by the nbufs slots we cover */
       for (i = 0; i < rx->nbufs; i++) {
		if ((idx & 7) == 7) {
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		}
		idx++;
	}
	return err;
}
2494 
2495 #ifdef INET6
2496 
/*
 * Internet-style 16-bit one's complement sum over a buffer, without
 * the final inversion.  Words are summed in host order; an odd 'len'
 * still reads a full final 16-bit word, matching the hardware's view.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	const uint16_t *p;
	uint32_t sum = 0;

	for (p = raw; len > 0; len -= 2, p++)
		sum += *p;
	/* fold the carries back in twice to guarantee a 16-bit result */
	sum = (sum >> 16) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return (uint16_t)sum;
}
2513 
/*
 * Finish the firmware's partial checksum for an IPv6 frame.  'csum'
 * is the NIC's 16-bit sum of everything after the Ethernet header.
 * Returns 1 when the packet is not TCP/UDP; otherwise returns the
 * inverted final sum, which the caller treats as valid when zero.
 */
static inline uint16_t
mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
{
	uint32_t partial;
	int nxt, cksum_offset;
	struct ip6_hdr *ip6 = p;
	uint16_t c;

	nxt = ip6->ip6_nxt;
	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
	/* skip extension headers to find the transport protocol */
	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
					   IPPROTO_IPV6, &nxt);
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return (1);
	}

	/*
	 * IPv6 headers do not contain a checksum, and hence
	 * do not checksum to zero, so they don't "fall out"
	 * of the partial checksum calculation like IPv4
	 * headers do.  We need to fix the partial checksum by
	 * subtracting the checksum of the IPv6 header.
	 */

	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
				    ETHER_HDR_LEN);
	/* one's complement subtraction of 'partial' with end-around
	   carry, then fold back to 16 bits */
	csum += ~partial;
	csum +=	 (csum < ~partial);
	csum = (csum >> 16) + (csum & 0xFFFF);
	csum = (csum >> 16) + (csum & 0xFFFF);
	/* add the pseudo-header contribution and invert */
	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
			     csum);
	c ^= 0xffff;
	return (c);
}
2550 #endif /* INET6 */
2551 /*
2552  *  Myri10GE hardware checksums are not valid if the sender
2553  *  padded the frame with non-zero padding.  This is because
2554  *  the firmware just does a simple 16-bit 1s complement
2555  *  checksum across the entire frame, excluding the first 14
2556  *  bytes.  It is best to simply to check the checksum and
2557  *  tell the stack about it only if the checksum is good
2558  */
2559 
/*
 * Verify the NIC's partial receive checksum against the packet.
 * Returns 0 when the checksum validates a TCP/UDP payload, nonzero
 * otherwise (non-IP, non-TCP/UDP, or offload disabled).
 */
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
#ifdef INET
	struct ip *ip;
#endif
#if defined(INET) || defined(INET6)
	int cap = m->m_pkthdr.rcvif->if_capenable;
#endif
	uint16_t c, etype;


	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);
	switch (etype) {
#ifdef INET
	case ETHERTYPE_IP:
		/* only TCP/UDP checked, and only if rxcsum is enabled */
		if ((cap & IFCAP_RXCSUM) == 0)
			return (1);
		ip = (struct ip *)(eh + 1);
		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
			return (1);
		/* fold the pseudo-header into the NIC's sum; a good
		   packet yields 0xffff, so invert to get 0 */
		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
				    (ip->ip_hl << 2) + ip->ip_p));
		c ^= 0xffff;
		break;
#endif
#ifdef INET6
	case ETHERTYPE_IPV6:
		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
			return (1);
		c = mxge_rx_csum6((eh + 1), m, csum);
		break;
#endif
	default:
		c = 1;	/* unknown ethertype: checksum not usable */
	}
	return (c);
}
2601 
/*
 * Strip an in-band 802.1Q header from a received frame, saving the
 * tag in the mbuf, and repair the firmware's partial checksum (which
 * covered the 4 encapsulation bytes being removed).
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	/* NOTE(review): eh is assigned but never used below */
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	/* the 4 VLAN encapsulation bytes sit right after the header */
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* one's complement subtraction with end-around carry */
	(*csum) += ~partial;
	(*csum) +=  ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
#ifdef MXGE_NEW_VLAN_API
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
#else
	{
		struct m_tag *mtag;
		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
				   M_NOWAIT);
		/* best effort: on allocation failure the tag is lost but
		   the (otherwise valid) frame is still delivered */
		if (mtag == NULL)
			return;
		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
		m_tag_prepend(m, mtag);
	}

#endif
	m->m_flags |= M_VLANTAG;

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf.  The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2657 
2658 
/*
 * Deliver a frame that arrived in the "big" (cluster) receive ring.
 * The filled mbuf is detached only if a replacement can be allocated
 * and mapped; otherwise the frame is dropped and the old buffer stays
 * in the ring.  Strips in-band VLAN tags, validates the hardware
 * checksum, optionally hands the frame to LRO, and passes it up.
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
		 uint32_t csum, int lro)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_big;
	idx = rx->cnt & rx->mask;
	/* each big frame consumes rx->nbufs ring slots */
	rx->cnt += rx->nbufs;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */

	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
	    (0 == mxge_rx_csum(m, csum))) {
		/* Tell the stack that the  checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
			CSUM_DATA_VALID;

#if defined(INET) || defined (INET6)
		/* NOTE(review): this path passes 0 to tcp_lro_rx while
		   mxge_rx_done_small() passes csum -- confirm this
		   difference is intentional */
		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
			return;
#endif
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2727 
/*
 * Deliver a frame that arrived in the "small" (MHLEN) receive ring.
 * Mirrors mxge_rx_done_big(): replace the buffer (or drop the frame
 * if no replacement is available), strip VLAN, validate the hardware
 * checksum, optionally hand to LRO, then pass the frame up the stack.
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
		   uint32_t csum, int lro)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct ether_header *eh;
	struct mbuf *m;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_small;
	idx = rx->cnt & rx->mask;
	/* small frames consume exactly one ring slot */
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
	    (0 == mxge_rx_csum(m, csum))) {
		/* Tell the stack that the  checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
			CSUM_DATA_VALID;

#if defined(INET) || defined (INET6)
		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
			return;
#endif
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2795 
2796 static inline void
2797 mxge_clean_rx_done(struct mxge_slice_state *ss)
2798 {
2799 	mxge_rx_done_t *rx_done = &ss->rx_done;
2800 	int limit = 0;
2801 	uint16_t length;
2802 	uint16_t checksum;
2803 	int lro;
2804 
2805 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806 	while (rx_done->entry[rx_done->idx].length != 0) {
2807 		length = ntohs(rx_done->entry[rx_done->idx].length);
2808 		rx_done->entry[rx_done->idx].length = 0;
2809 		checksum = rx_done->entry[rx_done->idx].checksum;
2810 		if (length <= (MHLEN - MXGEFW_PAD))
2811 			mxge_rx_done_small(ss, length, checksum, lro);
2812 		else
2813 			mxge_rx_done_big(ss, length, checksum, lro);
2814 		rx_done->cnt++;
2815 		rx_done->idx = rx_done->cnt & rx_done->mask;
2816 
2817 		/* limit potential for livelock */
2818 		if (__predict_false(++limit > rx_done->mask / 2))
2819 			break;
2820 	}
2821 #if defined(INET)  || defined (INET6)
2822 	while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2823 		struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2824 		SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2825 		tcp_lro_flush(&ss->lc, lro);
2826 	}
2827 #endif
2828 }
2829 
2830 
2831 static inline void
2832 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2833 {
2834 	struct ifnet *ifp;
2835 	mxge_tx_ring_t *tx;
2836 	struct mbuf *m;
2837 	bus_dmamap_t map;
2838 	int idx;
2839 	int *flags;
2840 
2841 	tx = &ss->tx;
2842 	ifp = ss->sc->ifp;
2843 	while (tx->pkt_done != mcp_idx) {
2844 		idx = tx->done & tx->mask;
2845 		tx->done++;
2846 		m = tx->info[idx].m;
2847 		/* mbuf and DMA map only attached to the first
2848 		   segment per-mbuf */
2849 		if (m != NULL) {
2850 			ss->obytes += m->m_pkthdr.len;
2851 			if (m->m_flags & M_MCAST)
2852 				ss->omcasts++;
2853 			ss->opackets++;
2854 			tx->info[idx].m = NULL;
2855 			map = tx->info[idx].map;
2856 			bus_dmamap_unload(tx->dmat, map);
2857 			m_freem(m);
2858 		}
2859 		if (tx->info[idx].flag) {
2860 			tx->info[idx].flag = 0;
2861 			tx->pkt_done++;
2862 		}
2863 	}
2864 
2865 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2866            its OK to send packets */
2867 #ifdef IFNET_BUF_RING
2868 	flags = &ss->if_drv_flags;
2869 #else
2870 	flags = &ifp->if_drv_flags;
2871 #endif
2872 	mtx_lock(&ss->tx.mtx);
2873 	if ((*flags) & IFF_DRV_OACTIVE &&
2874 	    tx->req - tx->done < (tx->mask + 1)/4) {
2875 		*(flags) &= ~IFF_DRV_OACTIVE;
2876 		ss->tx.wake++;
2877 		mxge_start_locked(ss);
2878 	}
2879 #ifdef IFNET_BUF_RING
2880 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2881 		/* let the NIC stop polling this queue, since there
2882 		 * are no more transmits pending */
2883 		if (tx->req == tx->done) {
2884 			*tx->send_stop = 1;
2885 			tx->queue_active = 0;
2886 			tx->deactivate++;
2887 			wmb();
2888 		}
2889 	}
2890 #endif
2891 	mtx_unlock(&ss->tx.mtx);
2892 
2893 }
2894 
/*
 * Transceiver compliance tables used by mxge_media_probe().  Entry 0
 * is matched by full-byte equality against the module EEPROM byte;
 * later entries are tested bit-by-bit.  A zero IFM_* code means
 * FreeBSD has no corresponding media type.
 */
static struct mxge_media_type mxge_xfp_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
/* SFP+ table; the bitmask is matched against EEPROM byte 3 */
static struct mxge_media_type mxge_sfp_media_types[] =
{
	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
	{0,		(1 << 7),	"Reserved"},
	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
};
2916 
2917 static void
2918 mxge_media_set(mxge_softc_t *sc, int media_type)
2919 {
2920 
2921 
2922 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2923 		    0, NULL);
2924 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2925 	sc->current_media = media_type;
2926 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2927 }
2928 
2929 static void
2930 mxge_media_init(mxge_softc_t *sc)
2931 {
2932 	char *ptr;
2933 	int i;
2934 
2935 	ifmedia_removeall(&sc->media);
2936 	mxge_media_set(sc, IFM_AUTO);
2937 
2938 	/*
2939 	 * parse the product code to deterimine the interface type
2940 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2941 	 * after the 3rd dash in the driver's cached copy of the
2942 	 * EEPROM's product code string.
2943 	 */
2944 	ptr = sc->product_code_string;
2945 	if (ptr == NULL) {
2946 		device_printf(sc->dev, "Missing product code\n");
2947 		return;
2948 	}
2949 
2950 	for (i = 0; i < 3; i++, ptr++) {
2951 		ptr = strchr(ptr, '-');
2952 		if (ptr == NULL) {
2953 			device_printf(sc->dev,
2954 				      "only %d dashes in PC?!?\n", i);
2955 			return;
2956 		}
2957 	}
2958 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2959 		/* -C is CX4 */
2960 		sc->connector = MXGE_CX4;
2961 		mxge_media_set(sc, IFM_10G_CX4);
2962 	} else if (*ptr == 'Q') {
2963 		/* -Q is Quad Ribbon Fiber */
2964 		sc->connector = MXGE_QRF;
2965 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2966 		/* FreeBSD has no media type for Quad ribbon fiber */
2967 	} else if (*ptr == 'R') {
2968 		/* -R is XFP */
2969 		sc->connector = MXGE_XFP;
2970 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2971 		/* -S or -2S is SFP+ */
2972 		sc->connector = MXGE_SFP;
2973 	} else {
2974 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2975 	}
2976 }
2977 
2978 /*
2979  * Determine the media type for a NIC.  Some XFPs will identify
2980  * themselves only when their link is up, so this is initiated via a
2981  * link up interrupt.  However, this can potentially take up to
2982  * several milliseconds, so it is run via the watchdog routine, rather
2983  * than in the interrupt handler itself.
2984  */
static void
mxge_media_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *cage_type;

	struct mxge_media_type *mxge_media_types = NULL;
	int i, err, ms, mxge_media_type_entries;
	uint32_t byte;

	sc->need_media_probe = 0;

	/* select the compliance table and EEPROM byte for the cage */
	if (sc->connector == MXGE_XFP) {
		/* -R is XFP */
		mxge_media_types = mxge_xfp_media_types;
		mxge_media_type_entries =
			sizeof (mxge_xfp_media_types) /
			sizeof (mxge_xfp_media_types[0]);
		byte = MXGE_XFP_COMPLIANCE_BYTE;
		cage_type = "XFP";
	} else 	if (sc->connector == MXGE_SFP) {
		/* -S or -2S is SFP+ */
		mxge_media_types = mxge_sfp_media_types;
		mxge_media_type_entries =
			sizeof (mxge_sfp_media_types) /
			sizeof (mxge_sfp_media_types[0]);
		cage_type = "SFP+";
		byte = 3;
	} else {
		/* nothing to do; media type cannot change */
		return;
	}

	/*
	 * At this point we know the NIC has an XFP cage, so now we
	 * try to determine what is in the cage by using the
	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
	 * register.  We read just one byte, which may take over
	 * a millisecond
	 */

	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
	cmd.data1 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
		device_printf(sc->dev, "failed to read XFP\n");
	}
	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
	}
	if (err != MXGEFW_CMD_OK) {
		return;
	}

	/* now we wait for the data to be cached */
	cmd.data0 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	/* poll up to ~50ms for the firmware to finish the I2C read */
	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
		DELAY(1000);
		cmd.data0 = byte;
		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	}
	if (err != MXGEFW_CMD_OK) {
		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
			      cage_type, err, ms);
		return;
	}

	/* entry 0 is matched by full-byte equality (default cable) */
	if (cmd.data0 == mxge_media_types[0].bitmask) {
		if (mxge_verbose)
			device_printf(sc->dev, "%s:%s\n", cage_type,
				      mxge_media_types[0].name);
		if (sc->current_media != mxge_media_types[0].flag) {
			mxge_media_init(sc);
			mxge_media_set(sc, mxge_media_types[0].flag);
		}
		return;
	}
	/* remaining entries are matched bit-by-bit */
	for (i = 1; i < mxge_media_type_entries; i++) {
		if (cmd.data0 & mxge_media_types[i].bitmask) {
			if (mxge_verbose)
				device_printf(sc->dev, "%s:%s\n",
					      cage_type,
					      mxge_media_types[i].name);

			if (sc->current_media != mxge_media_types[i].flag) {
				mxge_media_init(sc);
				mxge_media_set(sc, mxge_media_types[i].flag);
			}
			return;
		}
	}
	if (mxge_verbose)
		device_printf(sc->dev, "%s media 0x%x unknown\n",
			      cage_type, cmd.data0);

	return;
}
3083 
/*
 * Per-slice interrupt handler.  Non-zero slices (with MSI-X) only
 * need receive cleanup; slice 0 (or a legacy interrupt) also reaps
 * transmit completions and consults the firmware's DMA'ed stats
 * block for link-state changes and error counters.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if_initbaudrate(sc->ifp, IF_Gbps(10));
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				sc->ifp->if_baudrate = 0;
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* the media may identify itself once link is up */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
}
3174 
3175 static void
3176 mxge_init(void *arg)
3177 {
3178 	mxge_softc_t *sc = arg;
3179 	struct ifnet *ifp = sc->ifp;
3180 
3181 
3182 	mtx_lock(&sc->driver_mtx);
3183 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3184 		(void) mxge_open(sc);
3185 	mtx_unlock(&sc->driver_mtx);
3186 }
3187 
3188 
3189 
3190 static void
3191 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3192 {
3193 	int i;
3194 
3195 #if defined(INET) || defined(INET6)
3196 	tcp_lro_free(&ss->lc);
3197 #endif
3198 	for (i = 0; i <= ss->rx_big.mask; i++) {
3199 		if (ss->rx_big.info[i].m == NULL)
3200 			continue;
3201 		bus_dmamap_unload(ss->rx_big.dmat,
3202 				  ss->rx_big.info[i].map);
3203 		m_freem(ss->rx_big.info[i].m);
3204 		ss->rx_big.info[i].m = NULL;
3205 	}
3206 
3207 	for (i = 0; i <= ss->rx_small.mask; i++) {
3208 		if (ss->rx_small.info[i].m == NULL)
3209 			continue;
3210 		bus_dmamap_unload(ss->rx_small.dmat,
3211 				  ss->rx_small.info[i].map);
3212 		m_freem(ss->rx_small.info[i].m);
3213 		ss->rx_small.info[i].m = NULL;
3214 	}
3215 
3216 	/* transmit ring used only on the first slice */
3217 	if (ss->tx.info == NULL)
3218 		return;
3219 
3220 	for (i = 0; i <= ss->tx.mask; i++) {
3221 		ss->tx.info[i].flag = 0;
3222 		if (ss->tx.info[i].m == NULL)
3223 			continue;
3224 		bus_dmamap_unload(ss->tx.dmat,
3225 				  ss->tx.info[i].map);
3226 		m_freem(ss->tx.info[i].m);
3227 		ss->tx.info[i].m = NULL;
3228 	}
3229 }
3230 
3231 static void
3232 mxge_free_mbufs(mxge_softc_t *sc)
3233 {
3234 	int slice;
3235 
3236 	for (slice = 0; slice < sc->num_slices; slice++)
3237 		mxge_free_slice_mbufs(&sc->ss[slice]);
3238 }
3239 
/*
 * Tear down one slice's host-side ring resources: the rx_done DMA
 * block, the tx request/segment scratch buffers, the shadow rings, and
 * the info arrays together with their busdma maps and tags.  Per-entry
 * maps are destroyed before their parent tag is.  Every pointer is
 * NULLed afterwards so the function is safe to call on a partially
 * allocated (or already freed) slice.
 */
static void
mxge_free_slice_rings(struct mxge_slice_state *ss)
{
	int i;


	if (ss->rx_done.entry != NULL)
		mxge_dma_free(&ss->rx_done.dma);
	ss->rx_done.entry = NULL;

	if (ss->tx.req_bytes != NULL)
		free(ss->tx.req_bytes, M_DEVBUF);
	ss->tx.req_bytes = NULL;

	if (ss->tx.seg_list != NULL)
		free(ss->tx.seg_list, M_DEVBUF);
	ss->tx.seg_list = NULL;

	if (ss->rx_small.shadow != NULL)
		free(ss->rx_small.shadow, M_DEVBUF);
	ss->rx_small.shadow = NULL;

	if (ss->rx_big.shadow != NULL)
		free(ss->rx_big.shadow, M_DEVBUF);
	ss->rx_big.shadow = NULL;

	/* tx ring: per-entry dmamaps, then the tag, then the info array */
	if (ss->tx.info != NULL) {
		if (ss->tx.dmat != NULL) {
			for (i = 0; i <= ss->tx.mask; i++) {
				bus_dmamap_destroy(ss->tx.dmat,
						   ss->tx.info[i].map);
			}
			bus_dma_tag_destroy(ss->tx.dmat);
		}
		free(ss->tx.info, M_DEVBUF);
	}
	ss->tx.info = NULL;

	/* small rx ring, including the spare "extra" map */
	if (ss->rx_small.info != NULL) {
		if (ss->rx_small.dmat != NULL) {
			for (i = 0; i <= ss->rx_small.mask; i++) {
				bus_dmamap_destroy(ss->rx_small.dmat,
						   ss->rx_small.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_small.dmat,
					   ss->rx_small.extra_map);
			bus_dma_tag_destroy(ss->rx_small.dmat);
		}
		free(ss->rx_small.info, M_DEVBUF);
	}
	ss->rx_small.info = NULL;

	/* big rx ring, including the spare "extra" map */
	if (ss->rx_big.info != NULL) {
		if (ss->rx_big.dmat != NULL) {
			for (i = 0; i <= ss->rx_big.mask; i++) {
				bus_dmamap_destroy(ss->rx_big.dmat,
						   ss->rx_big.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_big.dmat,
					   ss->rx_big.extra_map);
			bus_dma_tag_destroy(ss->rx_big.dmat);
		}
		free(ss->rx_big.info, M_DEVBUF);
	}
	ss->rx_big.info = NULL;
}
3306 
3307 static void
3308 mxge_free_rings(mxge_softc_t *sc)
3309 {
3310 	int slice;
3311 
3312 	for (slice = 0; slice < sc->num_slices; slice++)
3313 		mxge_free_slice_rings(&sc->ss[slice]);
3314 }
3315 
/*
 * Allocate one slice's host-side ring state: the shadow and info
 * arrays for both receive rings, busdma tags and per-entry maps, and
 * (for slices that transmit) the tx request copy block, segment list,
 * info ring, tag and maps.  On error an errno is returned; partially
 * allocated resources are cleaned up by the caller via
 * mxge_free_rings().
 */
static int
mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
		       int tx_ring_entries)
{
	mxge_softc_t *sc = ss->sc;
	size_t bytes;
	int err, i;

	/* allocate per-slice receive resources */

	/* ring sizes are powers of two, so size-1 works as an index mask */
	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	ss->rx_done.mask = (2 * rx_ring_entries) - 1;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the rx host info rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the rx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		return err;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
#if MXGE_VIRT_JUMBOS
				 4096,			/* boundary */
#else
				 0,			/* boundary */
#endif
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 3*4096,		/* maxsize */
#if MXGE_VIRT_JUMBOS
				 3,			/* num segs */
				 4096,			/* maxsegsize*/
#else
				 1,			/* num segs */
				 MJUM9BYTES,		/* maxsegsize*/
#endif
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		return err;
	}
	/* one dmamap per small rx ring entry */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		err = bus_dmamap_create(ss->rx_small.dmat, 0,
					&ss->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			return err;
		}
	}
	/* spare map used while refilling the ring */
	err = bus_dmamap_create(ss->rx_small.dmat, 0,
				&ss->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
		return err;
	}

	/* one dmamap per big rx ring entry */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		err = bus_dmamap_create(ss->rx_big.dmat, 0,
					&ss->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_big.dmat, 0,
				&ss->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
		return err;
	}

	/* now allocate TX resources */

#ifndef IFNET_BUF_RING
	/* only use a single TX ring for now */
	if (ss != ss->sc->ss)
		return 0;
#endif

	ss->tx.mask = tx_ring_entries - 1;
	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);


	/* allocate the tx request copy block; the +8/+4 slop leaves room
	   for the 8-byte alignment fixup below */
	bytes = 8 +
		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	/* ensure req_list entries are aligned to 8 bytes */
	ss->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
	ss->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);

	/* allocate the tx host info ring */
	bytes = tx_ring_entries * sizeof (*ss->tx.info);
	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the tx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx_boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 ss->tx.max_desc - 2,	/* num segs */
				 sc->tx_boundary,	/* maxsegsz */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		return err;
	}

	/* now use these tags to setup dmamaps for each slot
	   in the ring */
	for (i = 0; i <= ss->tx.mask; i++) {
		err = bus_dmamap_create(ss->tx.dmat, 0,
					&ss->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
				      err);
			return err;
		}
	}
	return 0;

}
3485 
3486 static int
3487 mxge_alloc_rings(mxge_softc_t *sc)
3488 {
3489 	mxge_cmd_t cmd;
3490 	int tx_ring_size;
3491 	int tx_ring_entries, rx_ring_entries;
3492 	int err, slice;
3493 
3494 	/* get ring sizes */
3495 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3496 	tx_ring_size = cmd.data0;
3497 	if (err != 0) {
3498 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3499 		goto abort;
3500 	}
3501 
3502 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3503 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3504 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3505 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3506 	IFQ_SET_READY(&sc->ifp->if_snd);
3507 
3508 	for (slice = 0; slice < sc->num_slices; slice++) {
3509 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3510 					     rx_ring_entries,
3511 					     tx_ring_entries);
3512 		if (err != 0)
3513 			goto abort;
3514 	}
3515 	return 0;
3516 
3517 abort:
3518 	mxge_free_rings(sc);
3519 	return err;
3520 
3521 }
3522 
3523 
3524 static void
3525 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3526 {
3527 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3528 
3529 	if (bufsize < MCLBYTES) {
3530 		/* easy, everything fits in a single buffer */
3531 		*big_buf_size = MCLBYTES;
3532 		*cl_size = MCLBYTES;
3533 		*nbufs = 1;
3534 		return;
3535 	}
3536 
3537 	if (bufsize < MJUMPAGESIZE) {
3538 		/* still easy, everything still fits in a single buffer */
3539 		*big_buf_size = MJUMPAGESIZE;
3540 		*cl_size = MJUMPAGESIZE;
3541 		*nbufs = 1;
3542 		return;
3543 	}
3544 #if MXGE_VIRT_JUMBOS
3545 	/* now we need to use virtually contiguous buffers */
3546 	*cl_size = MJUM9BYTES;
3547 	*big_buf_size = 4096;
3548 	*nbufs = mtu / 4096 + 1;
3549 	/* needs to be a power of two, so round up */
3550 	if (*nbufs == 3)
3551 		*nbufs = 4;
3552 #else
3553 	*cl_size = MJUM9BYTES;
3554 	*big_buf_size = MJUM9BYTES;
3555 	*nbufs = 1;
3556 #endif
3557 }
3558 
/*
 * Per-slice open: initialize LRO state, fetch the NIC-resident
 * ("lanai") ring pointers from the firmware, and stock the small and
 * big receive rings with freshly allocated mbufs.  Returns 0 on
 * success or an errno.
 */
static int
mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
{
	mxge_softc_t *sc;
	mxge_cmd_t cmd;
	bus_dmamap_t map;
	int err, i, slice;


	sc = ss->sc;
	slice = ss - sc->ss;	/* slice index from pointer arithmetic */

#if defined(INET) || defined(INET6)
	(void)tcp_lro_init(&ss->lc);
#endif
	ss->lc.ifp = sc->ifp;

	/* get the lanai pointers to the send and receive rings */

	err = 0;
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	if (slice == 0) {
#endif
		cmd.data0 = slice;
		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
		ss->tx.lanai =
			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
		ss->tx.send_go = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING
	}
#endif
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc,
			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	ss->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		map = ss->rx_small.info[i].map;
		err = mxge_get_buf_small(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, ss->rx_small.mask + 1);
			return ENOMEM;
		}
	}
	/* fill the big-ring shadow with all-ones addresses -- presumably
	   an "empty" sentinel the firmware ignores; TODO confirm */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	/* largest received frame, including ethernet framing and pad */
	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
	/* stock the big ring, one frame (nbufs buffers) at a time */
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}
3639 
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table (multi-slice only), tell the firmware the buffer geometry for
 * the current MTU, hand it the per-slice stats DMA blocks, stock each
 * slice's receive rings, then issue ETHERNET_UP and mark the ifnet
 * running.  Callers in this file hold sc->driver_mtx.  On failure all
 * ring mbufs are released and an errno is returned.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, big_bytes, nbufs, slice, cl_size, i;
	bus_addr_t bus;
	volatile uint8_t *itable;
	struct mxge_slice_state *ss;

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	if (sc->num_slices > 1) {
		/* setup the indirection table */
		cmd.data0 = sc->num_slices;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
				    &cmd);

		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
				     &cmd);
		if (err != 0) {
			device_printf(sc->dev,
				      "failed to setup rss tables\n");
			return err;
		}

		/* just enable an identity mapping */
		itable = sc->sram + cmd.data0;
		for (i = 0; i < sc->num_slices; i++)
			itable[i] = (uint8_t)i;

		cmd.data0 = 1;
		cmd.data1 = mxge_rss_hash_type;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "failed to enable slices\n");
			return err;
		}
	}


	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);

	cmd.data0 = nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set alway-use-n to %d\n",
			      nbufs);
		return EIO;
	}
	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	for (slice = 0;
#ifdef IFNET_BUF_RING
	     slice < sc->num_slices;
#else
	     slice < 1;
#endif
	     slice++) {
		ss = &sc->ss[slice];
		cmd.data0 =
			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data1 =
			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data2 = sizeof(struct mcp_irq_data);
		/* slice index is carried in the upper 16 bits */
		cmd.data2 |= (slice << 16);
		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
	}

	if (err != 0) {
		/* older firmware: fall back to the obsolete stats DMA
		   command, which only covers send_done_count */
		bus = sc->ss->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
		if (err != 0) {
			device_printf(sc->dev, "couldn't open slice %d\n",
				      slice);
			goto abort;
		}
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags |= IFF_DRV_RUNNING;
		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
	}
#endif
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
3786 
/*
 * Take the interface down.  If "down" is nonzero the NIC is already
 * known to be down (watchdog reset path) so no ETHERNET_DOWN command
 * is sent; otherwise the command is issued and we wait for the
 * resulting "down" interrupt, detected via sc->down_cnt which the
 * interrupt handler increments.  Ring mbufs are freed in either case.
 * Always returns 0.
 */
static int
mxge_close(mxge_softc_t *sc, int down)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;
#ifdef IFNET_BUF_RING
	struct mxge_slice_state *ss;
	int slice;
#endif

#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
	}
#endif
	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	if (!down) {
		old_down_cnt = sc->down_cnt;
		wmb();
		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
		if (err) {
			device_printf(sc->dev,
				      "Couldn't bring down link\n");
		}
		if (old_down_cnt == sc->down_cnt) {
			/* wait for down irq */
			DELAY(10 * sc->intr_coal_delay);
		}
		wmb();
		if (old_down_cnt == sc->down_cnt) {
			device_printf(sc->dev, "never got down irq\n");
		}
	}
	mxge_free_mbufs(sc);

	return 0;
}
3825 
/*
 * Apply our PCI config-space customizations: record the negotiated
 * PCIe link width, raise the max read request size to 4KB (or restore
 * the previously saved device-control value after a watchdog reset),
 * and enable bus mastering.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* reg + 0x12: PCIe Link Status; bits 9:4 hold the
		   negotiated link width */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		if (sc->pectl == 0) {
			/* reg + 0x8: PCIe Device Control; bits 14:12 are
			   max read request size, encoding 5 => 4096 bytes */
			pectl = pci_read_config(dev, reg + 0x8, 2);
			pectl = (pectl & ~0x7000) | (5 << 12);
			pci_write_config(dev, reg + 0x8, pectl, 2);
			sc->pectl = pectl;
		} else {
			/* restore saved pectl after watchdog reset */
			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
		}
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
}
3852 
3853 static uint32_t
3854 mxge_read_reboot(mxge_softc_t *sc)
3855 {
3856 	device_t dev = sc->dev;
3857 	uint32_t vs;
3858 
3859 	/* find the vendor specific offset */
3860 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3861 		device_printf(sc->dev,
3862 			      "could not find vendor specific offset\n");
3863 		return (uint32_t)-1;
3864 	}
3865 	/* enable read32 mode */
3866 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3867 	/* tell NIC which register to read */
3868 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3869 	return (pci_read_config(dev, vs + 0x14, 4));
3870 }
3871 
/*
 * Recover from a watchdog-detected hang.  If the NIC has rebooted
 * (its bus-master enable bit reads back as zero, or config space reads
 * as all-ones), quiesce transmit by taking every TX lock, close the
 * interface, restore PCI config space, reload the firmware and re-open.
 * If the NIC did not reboot, only log.  Runs from mxge_watchdog_task
 * with sc->driver_mtx held.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	struct pci_devinfo *dinfo;
	struct mxge_slice_state *ss;
	int err, running, s, num_tx_slices = 1;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
		if (running) {

			/*
			 * quiesce NIC so that TX routines will not try to
			 * xmit after restoration of BAR
			 */

			/* Mark the link as down */
			if (sc->link_state) {
				sc->link_state = 0;
				if_link_state_change(sc->ifp,
						     LINK_STATE_DOWN);
			}
#ifdef IFNET_BUF_RING
			num_tx_slices = sc->num_slices;
#endif
			/* grab all TX locks to ensure no tx  */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
				mtx_lock(&ss->tx.mtx);
			}
			/* down==1: NIC already reset, skip ETHERNET_DOWN */
			mxge_close(sc, 1);
		}
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reload f/w */
		err = mxge_load_firmware(sc, 0);
		if (err) {
			device_printf(sc->dev,
				      "Unable to re-load f/w\n");
		}
		if (running) {
			if (!err)
				err = mxge_open(sc);
			/* release all TX locks */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
#ifdef IFNET_BUF_RING
				mxge_start_locked(ss);
#endif
				mtx_unlock(&ss->tx.mtx);
			}
		}
		sc->watchdog_resets++;
	} else {
		device_printf(sc->dev,
			      "NIC did not reboot, not resetting\n");
		err = 0;
	}
	if (err) {
		device_printf(sc->dev, "watchdog reset failed\n");
	} else {
		/* recovered; clear the "dying" flag set by mxge_tick
		   and restart the periodic callout */
		if (sc->dying == 2)
			sc->dying = 0;
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
}
3973 
3974 static void
3975 mxge_watchdog_task(void *arg, int pending)
3976 {
3977 	mxge_softc_t *sc = arg;
3978 
3979 
3980 	mtx_lock(&sc->driver_mtx);
3981 	mxge_watchdog_reset(sc);
3982 	mtx_unlock(&sc->driver_mtx);
3983 }
3984 
3985 static void
3986 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3987 {
3988 	tx = &sc->ss[slice].tx;
3989 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3990 	device_printf(sc->dev,
3991 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3992 		      tx->req, tx->done, tx->queue_active);
3993 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3994 			      tx->activate, tx->deactivate);
3995 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3996 		      tx->pkt_done,
3997 		      be32toh(sc->ss->fw_stats->send_done_count));
3998 }
3999 
/*
 * Periodic transmit-hang check, called from mxge_tick.  A ring is
 * considered stuck when it had pending transmits at the previous check
 * and its completion count has not advanced since.  If flow-control
 * pause frames are not the cause, the ring state is dumped and the
 * watchdog reset task is queued (returns ENXIO); otherwise only a
 * warning is printed.  Also re-probes the media when the interrupt
 * handler flagged a link change.
 */
static int
mxge_watchdog(mxge_softc_t *sc)
{
	mxge_tx_ring_t *tx;
	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
	int i, err = 0;

	/* see if we have outstanding transmits, which
	   have been pending for more than mxge_ticks */
	for (i = 0;
#ifdef IFNET_BUF_RING
	     (i < sc->num_slices) && (err == 0);
#else
	     (i < 1) && (err == 0);
#endif
	     i++) {
		tx = &sc->ss[i].tx;
		if (tx->req != tx->done &&
		    tx->watchdog_req != tx->watchdog_done &&
		    tx->done == tx->watchdog_done) {
			/* check for pause blocking before resetting */
			if (tx->watchdog_rx_pause == rx_pause) {
				mxge_warn_stuck(sc, tx, i);
				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
				return (ENXIO);
			}
			else
				device_printf(sc->dev, "Flow control blocking "
					      "xmits, check link partner\n");
		}

		/* remember this tick's state for the next comparison */
		tx->watchdog_req = tx->req;
		tx->watchdog_done = tx->done;
		tx->watchdog_rx_pause = rx_pause;
	}

	if (sc->need_media_probe)
		mxge_media_probe(sc);
	return (err);
}
4040 
4041 static u_long
4042 mxge_update_stats(mxge_softc_t *sc)
4043 {
4044 	struct mxge_slice_state *ss;
4045 	u_long pkts = 0;
4046 	u_long ipackets = 0;
4047 	u_long opackets = 0;
4048 #ifdef IFNET_BUF_RING
4049 	u_long obytes = 0;
4050 	u_long omcasts = 0;
4051 	u_long odrops = 0;
4052 #endif
4053 	u_long oerrors = 0;
4054 	int slice;
4055 
4056 	for (slice = 0; slice < sc->num_slices; slice++) {
4057 		ss = &sc->ss[slice];
4058 		ipackets += ss->ipackets;
4059 		opackets += ss->opackets;
4060 #ifdef IFNET_BUF_RING
4061 		obytes += ss->obytes;
4062 		omcasts += ss->omcasts;
4063 		odrops += ss->tx.br->br_drops;
4064 #endif
4065 		oerrors += ss->oerrors;
4066 	}
4067 	pkts = (ipackets - sc->ifp->if_ipackets);
4068 	pkts += (opackets - sc->ifp->if_opackets);
4069 	sc->ifp->if_ipackets = ipackets;
4070 	sc->ifp->if_opackets = opackets;
4071 #ifdef IFNET_BUF_RING
4072 	sc->ifp->if_obytes = obytes;
4073 	sc->ifp->if_omcasts = omcasts;
4074 	sc->ifp->if_snd.ifq_drops = odrops;
4075 #endif
4076 	sc->ifp->if_oerrors = oerrors;
4077 	return pkts;
4078 }
4079 
/*
 * Periodic housekeeping callout.  Aggregates per-slice statistics,
 * runs the transmit watchdog roughly every 4th tick while the
 * interface is running, and when the NIC has been idle checks PCI
 * config space for a hardware fault (queueing the watchdog reset task
 * if bus mastering was lost).  Re-arms itself unless an error was
 * detected, polling 4x less often while idle.
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;
	u_long pkts = 0;
	int err = 0;
	int running, ticks;
	uint16_t cmd;

	ticks = mxge_ticks;
	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
	if (running) {
		/* aggregate stats from different slices */
		pkts = mxge_update_stats(sc);
		if (!sc->watchdog_countdown) {
			err = mxge_watchdog(sc);
			sc->watchdog_countdown = 4;
		}
		sc->watchdog_countdown--;
	}
	if (pkts == 0) {
		/* ensure NIC did not suffer h/w fault while idle */
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
			/* dying == 2 lets the watchdog task clear it
			   again after a successful recovery */
			sc->dying = 2;
			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
			err = ENXIO;
		}
		/* look less often if NIC is idle */
		ticks *= 4;
	}

	if (err == 0)
		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);

}
4116 
/*
 * ifmedia change callback: manual media selection is not supported.
 */
static int
mxge_media_change(struct ifnet *ifp)
{

	return (EINVAL);
}
4122 
4123 static int
4124 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4125 {
4126 	struct ifnet *ifp = sc->ifp;
4127 	int real_mtu, old_mtu;
4128 	int err = 0;
4129 
4130 
4131 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4132 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4133 		return EINVAL;
4134 	mtx_lock(&sc->driver_mtx);
4135 	old_mtu = ifp->if_mtu;
4136 	ifp->if_mtu = mtu;
4137 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4138 		mxge_close(sc, 0);
4139 		err = mxge_open(sc);
4140 		if (err != 0) {
4141 			ifp->if_mtu = old_mtu;
4142 			mxge_close(sc, 0);
4143 			(void) mxge_open(sc);
4144 		}
4145 	}
4146 	mtx_unlock(&sc->driver_mtx);
4147 	return err;
4148 }
4149 
4150 static void
4151 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4152 {
4153 	mxge_softc_t *sc = ifp->if_softc;
4154 
4155 
4156 	if (sc == NULL)
4157 		return;
4158 	ifmr->ifm_status = IFM_AVALID;
4159 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4160 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4161 	ifmr->ifm_active |= sc->current_media;
4162 }
4163 
4164 static int
4165 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4166 {
4167 	mxge_softc_t *sc = ifp->if_softc;
4168 	struct ifreq *ifr = (struct ifreq *)data;
4169 	int err, mask;
4170 
4171 	err = 0;
4172 	switch (command) {
4173 	case SIOCSIFADDR:
4174 	case SIOCGIFADDR:
4175 		err = ether_ioctl(ifp, command, data);
4176 		break;
4177 
4178 	case SIOCSIFMTU:
4179 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4180 		break;
4181 
4182 	case SIOCSIFFLAGS:
4183 		mtx_lock(&sc->driver_mtx);
4184 		if (sc->dying) {
4185 			mtx_unlock(&sc->driver_mtx);
4186 			return EINVAL;
4187 		}
4188 		if (ifp->if_flags & IFF_UP) {
4189 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4190 				err = mxge_open(sc);
4191 			} else {
4192 				/* take care of promis can allmulti
4193 				   flag chages */
4194 				mxge_change_promisc(sc,
4195 						    ifp->if_flags & IFF_PROMISC);
4196 				mxge_set_multicast_list(sc);
4197 			}
4198 		} else {
4199 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4200 				mxge_close(sc, 0);
4201 			}
4202 		}
4203 		mtx_unlock(&sc->driver_mtx);
4204 		break;
4205 
4206 	case SIOCADDMULTI:
4207 	case SIOCDELMULTI:
4208 		mtx_lock(&sc->driver_mtx);
4209 		mxge_set_multicast_list(sc);
4210 		mtx_unlock(&sc->driver_mtx);
4211 		break;
4212 
4213 	case SIOCSIFCAP:
4214 		mtx_lock(&sc->driver_mtx);
4215 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4216 		if (mask & IFCAP_TXCSUM) {
4217 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4218 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4219 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4220 			} else {
4221 				ifp->if_capenable |= IFCAP_TXCSUM;
4222 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4223 			}
4224 		} else if (mask & IFCAP_RXCSUM) {
4225 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4226 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4227 			} else {
4228 				ifp->if_capenable |= IFCAP_RXCSUM;
4229 			}
4230 		}
4231 		if (mask & IFCAP_TSO4) {
4232 			if (IFCAP_TSO4 & ifp->if_capenable) {
4233 				ifp->if_capenable &= ~IFCAP_TSO4;
4234 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4235 				ifp->if_capenable |= IFCAP_TSO4;
4236 				ifp->if_hwassist |= CSUM_TSO;
4237 			} else {
4238 				printf("mxge requires tx checksum offload"
4239 				       " be enabled to use TSO\n");
4240 				err = EINVAL;
4241 			}
4242 		}
4243 #if IFCAP_TSO6
4244 		if (mask & IFCAP_TXCSUM_IPV6) {
4245 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4246 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4247 						       | IFCAP_TSO6);
4248 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4249 						      | CSUM_UDP);
4250 			} else {
4251 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4252 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4253 						     | CSUM_UDP_IPV6);
4254 			}
4255 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4256 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4257 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4258 			} else {
4259 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4260 			}
4261 		}
4262 		if (mask & IFCAP_TSO6) {
4263 			if (IFCAP_TSO6 & ifp->if_capenable) {
4264 				ifp->if_capenable &= ~IFCAP_TSO6;
4265 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4266 				ifp->if_capenable |= IFCAP_TSO6;
4267 				ifp->if_hwassist |= CSUM_TSO;
4268 			} else {
4269 				printf("mxge requires tx checksum offload"
4270 				       " be enabled to use TSO\n");
4271 				err = EINVAL;
4272 			}
4273 		}
4274 #endif /*IFCAP_TSO6 */
4275 
4276 		if (mask & IFCAP_LRO)
4277 			ifp->if_capenable ^= IFCAP_LRO;
4278 		if (mask & IFCAP_VLAN_HWTAGGING)
4279 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4280 		if (mask & IFCAP_VLAN_HWTSO)
4281 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4282 
4283 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4284 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4285 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4286 
4287 		mtx_unlock(&sc->driver_mtx);
4288 		VLAN_CAPABILITIES(ifp);
4289 
4290 		break;
4291 
4292 	case SIOCGIFMEDIA:
4293 		mtx_lock(&sc->driver_mtx);
4294 		mxge_media_probe(sc);
4295 		mtx_unlock(&sc->driver_mtx);
4296 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4297 				    &sc->media, command);
4298                 break;
4299 
4300 	default:
4301 		err = ENOTTY;
4302         }
4303 	return err;
4304 }
4305 
/*
 * Fetch the hw.mxge.* tunables from the kernel environment, clamp
 * them to safe ranges, and record the per-device copies (pause,
 * throttle) in the softc.
 */
static void
mxge_fetch_tunables(mxge_softc_t *sc)
{

	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
			  &mxge_flow_control);
	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
			  &mxge_intr_coal_delay);
	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
			  &mxge_nvidia_ecrc_enable);
	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
			  &mxge_force_firmware);
	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
			  &mxge_deassert_wait);
	TUNABLE_INT_FETCH("hw.mxge.verbose",
			  &mxge_verbose);
	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
	/*
	 * NOTE(review): fetched under two spellings; if both are set,
	 * "rss_hashtype" wins — presumably a legacy alias, confirm.
	 */
	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);

	if (bootverbose)
		mxge_verbose = 1;
	/* interrupt coalescing delay: clamp to [0, 10ms], default 30us */
	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
		mxge_intr_coal_delay = 30;
	/* watchdog/tick interval defaults to half a second */
	if (mxge_ticks == 0)
		mxge_ticks = hz / 2;
	sc->pause = mxge_flow_control;
	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
	}
	/* clamp the initial MTU to something the hardware can do */
	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
	    mxge_initial_mtu < ETHER_MIN_LEN)
		mxge_initial_mtu = ETHERMTU_JUMBO;

	/* a throttle of 0 means "disabled"; otherwise clamp to range */
	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
		mxge_throttle = MXGE_MAX_THROTTLE;
	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
		mxge_throttle = MXGE_MIN_THROTTLE;
	sc->throttle = mxge_throttle;
}
4351 
4352 
4353 static void
4354 mxge_free_slices(mxge_softc_t *sc)
4355 {
4356 	struct mxge_slice_state *ss;
4357 	int i;
4358 
4359 
4360 	if (sc->ss == NULL)
4361 		return;
4362 
4363 	for (i = 0; i < sc->num_slices; i++) {
4364 		ss = &sc->ss[i];
4365 		if (ss->fw_stats != NULL) {
4366 			mxge_dma_free(&ss->fw_stats_dma);
4367 			ss->fw_stats = NULL;
4368 #ifdef IFNET_BUF_RING
4369 			if (ss->tx.br != NULL) {
4370 				drbr_free(ss->tx.br, M_DEVBUF);
4371 				ss->tx.br = NULL;
4372 			}
4373 #endif
4374 			mtx_destroy(&ss->tx.mtx);
4375 		}
4376 		if (ss->rx_done.entry != NULL) {
4377 			mxge_dma_free(&ss->rx_done.dma);
4378 			ss->rx_done.entry = NULL;
4379 		}
4380 	}
4381 	free(sc->ss, M_DEVBUF);
4382 	sc->ss = NULL;
4383 }
4384 
4385 static int
4386 mxge_alloc_slices(mxge_softc_t *sc)
4387 {
4388 	mxge_cmd_t cmd;
4389 	struct mxge_slice_state *ss;
4390 	size_t bytes;
4391 	int err, i, max_intr_slots;
4392 
4393 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4394 	if (err != 0) {
4395 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4396 		return err;
4397 	}
4398 	sc->rx_ring_size = cmd.data0;
4399 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4400 
4401 	bytes = sizeof (*sc->ss) * sc->num_slices;
4402 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4403 	if (sc->ss == NULL)
4404 		return (ENOMEM);
4405 	for (i = 0; i < sc->num_slices; i++) {
4406 		ss = &sc->ss[i];
4407 
4408 		ss->sc = sc;
4409 
4410 		/* allocate per-slice rx interrupt queues */
4411 
4412 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4413 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4414 		if (err != 0)
4415 			goto abort;
4416 		ss->rx_done.entry = ss->rx_done.dma.addr;
4417 		bzero(ss->rx_done.entry, bytes);
4418 
4419 		/*
4420 		 * allocate the per-slice firmware stats; stats
4421 		 * (including tx) are used used only on the first
4422 		 * slice for now
4423 		 */
4424 #ifndef IFNET_BUF_RING
4425 		if (i > 0)
4426 			continue;
4427 #endif
4428 
4429 		bytes = sizeof (*ss->fw_stats);
4430 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4431 				     sizeof (*ss->fw_stats), 64);
4432 		if (err != 0)
4433 			goto abort;
4434 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4435 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4436 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4437 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4438 #ifdef IFNET_BUF_RING
4439 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4440 					   &ss->tx.mtx);
4441 #endif
4442 	}
4443 
4444 	return (0);
4445 
4446 abort:
4447 	mxge_free_slices(sc);
4448 	return (ENOMEM);
4449 }
4450 
/*
 * Decide how many slices (rx queues) to use.  Defaults to one; when
 * multi-slice operation is enabled and the system has both multiple
 * CPUs and at least two MSI-X vectors, loads the RSS firmware and
 * queries it for the number of queues it supports.  On any failure
 * the original firmware is restored and a single slice is used.
 */
static void
mxge_slice_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *old_fw;
	int msix_cnt, status, max_intr_slots;

	sc->num_slices = 1;
	/*
	 *  don't enable multiple slices if they are not enabled,
	 *  or if this is not an SMP system
	 */

	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
		return;

	/* see how many MSI-X interrupts are available */
	msix_cnt = pci_msix_count(sc->dev);
	if (msix_cnt < 2)
		return;

	/* now load the slice aware firmware see what it supports */
	old_fw = sc->fw_name;
	if (old_fw == mxge_fw_aligned)
		sc->fw_name = mxge_fw_rss_aligned;
	else
		sc->fw_name = mxge_fw_rss_unaligned;
	status = mxge_load_firmware(sc, 0);
	if (status != 0) {
		device_printf(sc->dev, "Falling back to a single slice\n");
		return;
	}

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		goto abort_with_fw;
	}

	/* get rx ring size */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		goto abort_with_fw;
	}
	/* two completion slots per rx descriptor */
	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));

	/* tell it the size of the interrupt queues */
	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
		goto abort_with_fw;
	}

	/* ask the maximum number of slices it supports */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
	if (status != 0) {
		device_printf(sc->dev,
			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
		goto abort_with_fw;
	}
	sc->num_slices = cmd.data0;
	/* never use more slices than MSI-X vectors available */
	if (sc->num_slices > msix_cnt)
		sc->num_slices = msix_cnt;

	if (mxge_max_slices == -1) {
		/* cap to number of CPUs in system */
		if (sc->num_slices > mp_ncpus)
			sc->num_slices = mp_ncpus;
	} else {
		if (sc->num_slices > mxge_max_slices)
			sc->num_slices = mxge_max_slices;
	}
	/* make sure it is a power of two */
	while (sc->num_slices & (sc->num_slices - 1))
		sc->num_slices--;

	if (mxge_verbose)
		device_printf(sc->dev, "using %d slices\n",
			      sc->num_slices);

	return;

abort_with_fw:
	/* restore and reload the original (non-RSS) firmware */
	sc->fw_name = old_fw;
	(void) mxge_load_firmware(sc, 0);
}
4542 
4543 static int
4544 mxge_add_msix_irqs(mxge_softc_t *sc)
4545 {
4546 	size_t bytes;
4547 	int count, err, i, rid;
4548 
4549 	rid = PCIR_BAR(2);
4550 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4551 						    &rid, RF_ACTIVE);
4552 
4553 	if (sc->msix_table_res == NULL) {
4554 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4555 		return ENXIO;
4556 	}
4557 
4558 	count = sc->num_slices;
4559 	err = pci_alloc_msix(sc->dev, &count);
4560 	if (err != 0) {
4561 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4562 			      "err = %d \n", sc->num_slices, err);
4563 		goto abort_with_msix_table;
4564 	}
4565 	if (count < sc->num_slices) {
4566 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4567 			      count, sc->num_slices);
4568 		device_printf(sc->dev,
4569 			      "Try setting hw.mxge.max_slices to %d\n",
4570 			      count);
4571 		err = ENOSPC;
4572 		goto abort_with_msix;
4573 	}
4574 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4575 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4576 	if (sc->msix_irq_res == NULL) {
4577 		err = ENOMEM;
4578 		goto abort_with_msix;
4579 	}
4580 
4581 	for (i = 0; i < sc->num_slices; i++) {
4582 		rid = i + 1;
4583 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4584 							  SYS_RES_IRQ,
4585 							  &rid, RF_ACTIVE);
4586 		if (sc->msix_irq_res[i] == NULL) {
4587 			device_printf(sc->dev, "couldn't allocate IRQ res"
4588 				      " for message %d\n", i);
4589 			err = ENXIO;
4590 			goto abort_with_res;
4591 		}
4592 	}
4593 
4594 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4595 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4596 
4597 	for (i = 0; i < sc->num_slices; i++) {
4598 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4599 				     INTR_TYPE_NET | INTR_MPSAFE,
4600 #if __FreeBSD_version > 700030
4601 				     NULL,
4602 #endif
4603 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4604 		if (err != 0) {
4605 			device_printf(sc->dev, "couldn't setup intr for "
4606 				      "message %d\n", i);
4607 			goto abort_with_intr;
4608 		}
4609 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4610 				  sc->msix_ih[i], "s%d", i);
4611 	}
4612 
4613 	if (mxge_verbose) {
4614 		device_printf(sc->dev, "using %d msix IRQs:",
4615 			      sc->num_slices);
4616 		for (i = 0; i < sc->num_slices; i++)
4617 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4618 		printf("\n");
4619 	}
4620 	return (0);
4621 
4622 abort_with_intr:
4623 	for (i = 0; i < sc->num_slices; i++) {
4624 		if (sc->msix_ih[i] != NULL) {
4625 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4626 					  sc->msix_ih[i]);
4627 			sc->msix_ih[i] = NULL;
4628 		}
4629 	}
4630 	free(sc->msix_ih, M_DEVBUF);
4631 
4632 
4633 abort_with_res:
4634 	for (i = 0; i < sc->num_slices; i++) {
4635 		rid = i + 1;
4636 		if (sc->msix_irq_res[i] != NULL)
4637 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4638 					     sc->msix_irq_res[i]);
4639 		sc->msix_irq_res[i] = NULL;
4640 	}
4641 	free(sc->msix_irq_res, M_DEVBUF);
4642 
4643 
4644 abort_with_msix:
4645 	pci_release_msi(sc->dev);
4646 
4647 abort_with_msix_table:
4648 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4649 			     sc->msix_table_res);
4650 
4651 	return err;
4652 }
4653 
4654 static int
4655 mxge_add_single_irq(mxge_softc_t *sc)
4656 {
4657 	int count, err, rid;
4658 
4659 	count = pci_msi_count(sc->dev);
4660 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4661 		rid = 1;
4662 	} else {
4663 		rid = 0;
4664 		sc->legacy_irq = 1;
4665 	}
4666 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4667 					 1, RF_SHAREABLE | RF_ACTIVE);
4668 	if (sc->irq_res == NULL) {
4669 		device_printf(sc->dev, "could not alloc interrupt\n");
4670 		return ENXIO;
4671 	}
4672 	if (mxge_verbose)
4673 		device_printf(sc->dev, "using %s irq %ld\n",
4674 			      sc->legacy_irq ? "INTx" : "MSI",
4675 			      rman_get_start(sc->irq_res));
4676 	err = bus_setup_intr(sc->dev, sc->irq_res,
4677 			     INTR_TYPE_NET | INTR_MPSAFE,
4678 #if __FreeBSD_version > 700030
4679 			     NULL,
4680 #endif
4681 			     mxge_intr, &sc->ss[0], &sc->ih);
4682 	if (err != 0) {
4683 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4684 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4685 		if (!sc->legacy_irq)
4686 			pci_release_msi(sc->dev);
4687 	}
4688 	return err;
4689 }
4690 
4691 static void
4692 mxge_rem_msix_irqs(mxge_softc_t *sc)
4693 {
4694 	int i, rid;
4695 
4696 	for (i = 0; i < sc->num_slices; i++) {
4697 		if (sc->msix_ih[i] != NULL) {
4698 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4699 					  sc->msix_ih[i]);
4700 			sc->msix_ih[i] = NULL;
4701 		}
4702 	}
4703 	free(sc->msix_ih, M_DEVBUF);
4704 
4705 	for (i = 0; i < sc->num_slices; i++) {
4706 		rid = i + 1;
4707 		if (sc->msix_irq_res[i] != NULL)
4708 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4709 					     sc->msix_irq_res[i]);
4710 		sc->msix_irq_res[i] = NULL;
4711 	}
4712 	free(sc->msix_irq_res, M_DEVBUF);
4713 
4714 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4715 			     sc->msix_table_res);
4716 
4717 	pci_release_msi(sc->dev);
4718 	return;
4719 }
4720 
4721 static void
4722 mxge_rem_single_irq(mxge_softc_t *sc)
4723 {
4724 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4725 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4726 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4727 	if (!sc->legacy_irq)
4728 		pci_release_msi(sc->dev);
4729 }
4730 
4731 static void
4732 mxge_rem_irq(mxge_softc_t *sc)
4733 {
4734 	if (sc->num_slices > 1)
4735 		mxge_rem_msix_irqs(sc);
4736 	else
4737 		mxge_rem_single_irq(sc);
4738 }
4739 
4740 static int
4741 mxge_add_irq(mxge_softc_t *sc)
4742 {
4743 	int err;
4744 
4745 	if (sc->num_slices > 1)
4746 		err = mxge_add_msix_irqs(sc);
4747 	else
4748 		err = mxge_add_single_irq(sc);
4749 
4750 	if (0 && err == 0 && sc->num_slices > 1) {
4751 		mxge_rem_msix_irqs(sc);
4752 		err = mxge_add_msix_irqs(sc);
4753 	}
4754 	return err;
4755 }
4756 
4757 
/*
 * Device attach: allocate all driver resources (taskqueue, DMA tags,
 * ifnet, BAR mapping, firmware, slices, rings, interrupts), then
 * register with the network stack.  Each failure point unwinds all
 * earlier allocations via the goto chain at the bottom.  Returns 0
 * or an errno.
 */
static int
mxge_attach(device_t dev)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	int err, rid;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	/* private taskqueue for the watchdog task */
	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
				  taskqueue_thread_enqueue, &sc->tq);
	if (sc->tq == NULL) {
		err = ENOMEM;
		goto abort_with_nothing;
	}

	/* parent DMA tag from which all other tags are derived */
	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
				 1,			/* alignment */
				 0,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 65536,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_tq;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/* cmd_mtx serializes firmware commands; driver_mtx is the
	   main driver lock */
	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	/* usable SRAM: 2MB minus firmware/scratch regions and pad */
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_zeropad_dma;

	/* select & load the firmware */
	err = mxge_select_firmware(sc);
	if (err != 0)
		goto abort_with_dmabench;
	sc->intr_coal_delay = mxge_intr_coal_delay;

	/* decide on slice count, then allocate per-slice state */
	mxge_slice_probe(sc);
	err = mxge_alloc_slices(sc);
	if (err != 0)
		goto abort_with_dmabench;

	err = mxge_reset(sc, 0);
	if (err != 0)
		goto abort_with_slices;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_slices;
	}

	err = mxge_add_irq(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to add irq\n");
		goto abort_with_rings;
	}

	/* advertise offload capabilities to the stack */
	if_initbaudrate(ifp, IF_Gbps(10));
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
		IFCAP_RXCSUM_IPV6;
#if defined(INET) || defined(INET6)
	ifp->if_capabilities |= IFCAP_LRO;
#endif

#ifdef MXGE_NEW_VLAN_API
	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;

	/* Only FW 1.4.32 and newer can do TSO over vlans */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 32)
		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
#endif
	sc->max_mtu = mxge_max_mtu(sc);
	if (sc->max_mtu >= 9000)
		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
	else
		device_printf(dev, "MTU limited to %d.  Install "
			      "latest firmware for 9000 byte jumbo support\n",
			      sc->max_mtu - ETHER_HDR_LEN);
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
	/* check to see if f/w supports TSO for IPv6 */
	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
		if (CSUM_TCP_IPV6)
			ifp->if_capabilities |= IFCAP_TSO6;
		sc->max_tso6_hlen = min(cmd.data0,
					sizeof (sc->ss[0].scratch));
	}
	ifp->if_capenable = ifp->if_capabilities;
	if (sc->lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	mxge_media_init(sc);
	mxge_media_probe(sc);
	sc->dying = 0;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to ETHERMTU */
	if (mxge_initial_mtu != ETHERMTU)
		mxge_change_mtu(sc, mxge_initial_mtu);

	mxge_add_sysctls(sc);
#ifdef IFNET_BUF_RING
	ifp->if_transmit = mxge_transmit;
	ifp->if_qflush = mxge_qflush;
#endif
	/* start the watchdog taskqueue thread and the periodic tick */
	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
				device_get_nameunit(sc->dev));
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	return 0;

	/* error unwind: each label releases what was acquired above it */
abort_with_rings:
	mxge_free_rings(sc);
abort_with_slices:
	mxge_free_slices(sc);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);
abort_with_tq:
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
abort_with_nothing:
	return err;
}
4979 
/*
 * Device detach: refuse while vlans are still attached, mark the
 * device dying, bring it down, then release everything allocated by
 * mxge_attach() in reverse order.  Returns 0 on success or EBUSY.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	if (mxge_vlans_active(sc)) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	mtx_lock(&sc->driver_mtx);
	/* sc->dying blocks further SIOCSIFFLAGS opens */
	sc->dying = 1;
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc, 0);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	/* stop the watchdog taskqueue before freeing resources */
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
	callout_drain(&sc->co_hdl);
	ifmedia_removeall(&sc->media);
	mxge_dummy_rdma(sc, 0);
	mxge_rem_sysctls(sc);
	mxge_rem_irq(sc);
	mxge_free_rings(sc);
	mxge_free_slices(sc);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
5019 
5020 static int
5021 mxge_shutdown(device_t dev)
5022 {
5023 	return 0;
5024 }
5025 
5026 /*
5027   This file uses Myri10GE driver indentation.
5028 
5029   Local Variables:
5030   c-file-style:"linux"
5031   tab-width:8
5032   End:
5033 */
5034