xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 59c3cb81c1769fdb6c840c971df129b52f4a848d)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 #include <sys/zlib.h>
50 
51 #include <net/if.h>
52 #include <net/if_var.h>
53 #include <net/if_arp.h>
54 #include <net/ethernet.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57 
58 #include <net/bpf.h>
59 
60 #include <net/if_types.h>
61 #include <net/if_vlan_var.h>
62 
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip6.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_lro.h>
69 #include <netinet6/ip6_var.h>
70 
71 #include <machine/bus.h>
72 #include <machine/in_cksum.h>
73 #include <machine/resource.h>
74 #include <sys/bus.h>
75 #include <sys/rman.h>
76 #include <sys/smp.h>
77 
78 #include <dev/pci/pcireg.h>
79 #include <dev/pci/pcivar.h>
80 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
81 
82 #include <vm/vm.h>		/* for pmap_mapdev() */
83 #include <vm/pmap.h>
84 
85 #if defined(__i386) || defined(__amd64)
86 #include <machine/specialreg.h>
87 #endif
88 
89 #include <dev/mxge/mxge_mcp.h>
90 #include <dev/mxge/mcp_gen_header.h>
91 /*#define MXGE_FAKE_IFP*/
92 #include <dev/mxge/if_mxge_var.h>
93 #ifdef IFNET_BUF_RING
94 #include <sys/buf_ring.h>
95 #endif
96 
97 #include "opt_inet.h"
98 #include "opt_inet6.h"
99 
100 /* tunable params */
101 static int mxge_nvidia_ecrc_enable = 1;
102 static int mxge_force_firmware = 0;
103 static int mxge_intr_coal_delay = 30;
104 static int mxge_deassert_wait = 1;
105 static int mxge_flow_control = 1;
106 static int mxge_verbose = 0;
107 static int mxge_ticks;
108 static int mxge_max_slices = 1;
109 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
110 static int mxge_always_promisc = 0;
111 static int mxge_initial_mtu = ETHERMTU_JUMBO;
112 static int mxge_throttle = 0;
113 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
114 static char *mxge_fw_aligned = "mxge_eth_z8e";
115 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
116 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
117 
118 static int mxge_probe(device_t dev);
119 static int mxge_attach(device_t dev);
120 static int mxge_detach(device_t dev);
121 static int mxge_shutdown(device_t dev);
122 static void mxge_intr(void *arg);
123 
124 static device_method_t mxge_methods[] =
125 {
126   /* Device interface */
127   DEVMETHOD(device_probe, mxge_probe),
128   DEVMETHOD(device_attach, mxge_attach),
129   DEVMETHOD(device_detach, mxge_detach),
130   DEVMETHOD(device_shutdown, mxge_shutdown),
131 
132   DEVMETHOD_END
133 };
134 
135 static driver_t mxge_driver =
136 {
137   "mxge",
138   mxge_methods,
139   sizeof(mxge_softc_t),
140 };
141 
142 static devclass_t mxge_devclass;
143 
144 /* Declare ourselves to be a child of the PCI bus.*/
145 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
146 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
147 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
148 
149 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
150 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
151 static int mxge_close(mxge_softc_t *sc, int down);
152 static int mxge_open(mxge_softc_t *sc);
153 static void mxge_tick(void *arg);
154 
155 static int
156 mxge_probe(device_t dev)
157 {
158 	int rev;
159 
160 
161 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
162 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
163 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
164 		rev = pci_get_revid(dev);
165 		switch (rev) {
166 		case MXGE_PCI_REV_Z8E:
167 			device_set_desc(dev, "Myri10G-PCIE-8A");
168 			break;
169 		case MXGE_PCI_REV_Z8ES:
170 			device_set_desc(dev, "Myri10G-PCIE-8B");
171 			break;
172 		default:
173 			device_set_desc(dev, "Myri10G-PCIE-8??");
174 			device_printf(dev, "Unrecognized rev %d NIC\n",
175 				      rev);
176 			break;
177 		}
178 		return 0;
179 	}
180 	return ENXIO;
181 }
182 
183 static void
184 mxge_enable_wc(mxge_softc_t *sc)
185 {
186 #if defined(__i386) || defined(__amd64)
187 	vm_offset_t len;
188 	int err;
189 
190 	sc->wc = 1;
191 	len = rman_get_size(sc->mem_res);
192 	err = pmap_change_attr((vm_offset_t) sc->sram,
193 			       len, PAT_WRITE_COMBINING);
194 	if (err != 0) {
195 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
196 			      err);
197 		sc->wc = 0;
198 	}
199 #endif
200 }
201 
202 
203 /* callback to get our DMA address */
204 static void
205 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
206 			 int error)
207 {
208 	if (error == 0) {
209 		*(bus_addr_t *) arg = segs->ds_addr;
210 	}
211 }
212 
213 static int
214 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
215 		   bus_size_t alignment)
216 {
217 	int err;
218 	device_t dev = sc->dev;
219 	bus_size_t boundary, maxsegsize;
220 
221 	if (bytes > 4096 && alignment == 4096) {
222 		boundary = 0;
223 		maxsegsize = bytes;
224 	} else {
225 		boundary = 4096;
226 		maxsegsize = 4096;
227 	}
228 
229 	/* allocate DMAable memory tags */
230 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
231 				 alignment,		/* alignment */
232 				 boundary,		/* boundary */
233 				 BUS_SPACE_MAXADDR,	/* low */
234 				 BUS_SPACE_MAXADDR,	/* high */
235 				 NULL, NULL,		/* filter */
236 				 bytes,			/* maxsize */
237 				 1,			/* num segs */
238 				 maxsegsize,		/* maxsegsize */
239 				 BUS_DMA_COHERENT,	/* flags */
240 				 NULL, NULL,		/* lock */
241 				 &dma->dmat);		/* tag */
242 	if (err != 0) {
243 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
244 		return err;
245 	}
246 
247 	/* allocate DMAable memory & map */
248 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
249 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
250 				| BUS_DMA_ZERO),  &dma->map);
251 	if (err != 0) {
252 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
253 		goto abort_with_dmat;
254 	}
255 
256 	/* load the memory */
257 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
258 			      mxge_dmamap_callback,
259 			      (void *)&dma->bus_addr, 0);
260 	if (err != 0) {
261 		device_printf(dev, "couldn't load map (err = %d)\n", err);
262 		goto abort_with_mem;
263 	}
264 	return 0;
265 
266 abort_with_mem:
267 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
268 abort_with_dmat:
269 	(void)bus_dma_tag_destroy(dma->dmat);
270 	return err;
271 }
272 
273 
274 static void
275 mxge_dma_free(mxge_dma_t *dma)
276 {
277 	bus_dmamap_unload(dma->dmat, dma->map);
278 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
279 	(void)bus_dma_tag_destroy(dma->dmat);
280 }
281 
282 /*
283  * The eeprom strings on the lanaiX have the format
284  * SN=x\0
285  * MAC=x:x:x:x:x:x\0
286  * PC=text\0
287  */
288 
289 static int
290 mxge_parse_strings(mxge_softc_t *sc)
291 {
292 	char *ptr;
293 	int i, found_mac, found_sn2;
294 	char *endptr;
295 
296 	ptr = sc->eeprom_strings;
297 	found_mac = 0;
298 	found_sn2 = 0;
299 	while (*ptr != '\0') {
300 		if (strncmp(ptr, "MAC=", 4) == 0) {
301 			ptr += 4;
302 			for (i = 0;;) {
303 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
304 				if (endptr - ptr != 2)
305 					goto abort;
306 				ptr = endptr;
307 				if (++i == 6)
308 					break;
309 				if (*ptr++ != ':')
310 					goto abort;
311 			}
312 			found_mac = 1;
313 		} else if (strncmp(ptr, "PC=", 3) == 0) {
314 			ptr += 3;
315 			strlcpy(sc->product_code_string, ptr,
316 			    sizeof(sc->product_code_string));
317 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
318 			ptr += 3;
319 			strlcpy(sc->serial_number_string, ptr,
320 			    sizeof(sc->serial_number_string));
321 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
322 			/* SN2 takes precedence over SN */
323 			ptr += 4;
324 			found_sn2 = 1;
325 			strlcpy(sc->serial_number_string, ptr,
326 			    sizeof(sc->serial_number_string));
327 		}
328 		while (*ptr++ != '\0') {}
329 	}
330 
331 	if (found_mac)
332 		return 0;
333 
334  abort:
335 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
336 
337 	return ENXIO;
338 }
339 
340 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
341 static void
342 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
343 {
344 	uint32_t val;
345 	unsigned long base, off;
346 	char *va, *cfgptr;
347 	device_t pdev, mcp55;
348 	uint16_t vendor_id, device_id, word;
349 	uintptr_t bus, slot, func, ivend, idev;
350 	uint32_t *ptr32;
351 
352 
353 	if (!mxge_nvidia_ecrc_enable)
354 		return;
355 
356 	pdev = device_get_parent(device_get_parent(sc->dev));
357 	if (pdev == NULL) {
358 		device_printf(sc->dev, "could not find parent?\n");
359 		return;
360 	}
361 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
362 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
363 
364 	if (vendor_id != 0x10de)
365 		return;
366 
367 	base = 0;
368 
369 	if (device_id == 0x005d) {
370 		/* ck804, base address is magic */
371 		base = 0xe0000000UL;
372 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
373 		/* mcp55, base address stored in chipset */
374 		mcp55 = pci_find_bsf(0, 0, 0);
375 		if (mcp55 &&
376 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
377 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
378 			word = pci_read_config(mcp55, 0x90, 2);
379 			base = ((unsigned long)word & 0x7ffeU) << 25;
380 		}
381 	}
382 	if (!base)
383 		return;
384 
385 	/* XXXX
386 	   Test below is commented because it is believed that doing
387 	   config read/write beyond 0xff will access the config space
388 	   for the next larger function.  Uncomment this and remove
389 	   the hacky pmap_mapdev() way of accessing config space when
390 	   FreeBSD grows support for extended pcie config space access
391 	*/
392 #if 0
393 	/* See if we can, by some miracle, access the extended
394 	   config space */
395 	val = pci_read_config(pdev, 0x178, 4);
396 	if (val != 0xffffffff) {
397 		val |= 0x40;
398 		pci_write_config(pdev, 0x178, val, 4);
399 		return;
400 	}
401 #endif
402 	/* Rather than using normal pci config space writes, we must
403 	 * map the Nvidia config space ourselves.  This is because on
404 	 * opteron/nvidia class machine the 0xe000000 mapping is
405 	 * handled by the nvidia chipset, that means the internal PCI
406 	 * device (the on-chip northbridge), or the amd-8131 bridge
407 	 * and things behind them are not visible by this method.
408 	 */
409 
410 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
411 		      PCI_IVAR_BUS, &bus);
412 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
413 		      PCI_IVAR_SLOT, &slot);
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_FUNCTION, &func);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_VENDOR, &ivend);
418 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 		      PCI_IVAR_DEVICE, &idev);
420 
421 	off =  base
422 		+ 0x00100000UL * (unsigned long)bus
423 		+ 0x00001000UL * (unsigned long)(func
424 						 + 8 * slot);
425 
426 	/* map it into the kernel */
427 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
428 
429 
430 	if (va == NULL) {
431 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
432 		return;
433 	}
434 	/* get a pointer to the config space mapped into the kernel */
435 	cfgptr = va + (off & PAGE_MASK);
436 
437 	/* make sure that we can really access it */
438 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
439 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
440 	if (! (vendor_id == ivend && device_id == idev)) {
441 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
442 			      vendor_id, device_id);
443 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
444 		return;
445 	}
446 
447 	ptr32 = (uint32_t*)(cfgptr + 0x178);
448 	val = *ptr32;
449 
450 	if (val == 0xffffffff) {
451 		device_printf(sc->dev, "extended mapping failed\n");
452 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
453 		return;
454 	}
455 	*ptr32 = val | 0x40;
456 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
457 	if (mxge_verbose)
458 		device_printf(sc->dev,
459 			      "Enabled ECRC on upstream Nvidia bridge "
460 			      "at %d:%d:%d\n",
461 			      (int)bus, (int)slot, (int)func);
462 	return;
463 }
464 #else
465 static void
466 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
467 {
468 	device_printf(sc->dev,
469 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
470 	return;
471 }
472 #endif
473 
474 
475 static int
476 mxge_dma_test(mxge_softc_t *sc, int test_type)
477 {
478 	mxge_cmd_t cmd;
479 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
480 	int status;
481 	uint32_t len;
482 	char *test = " ";
483 
484 
485 	/* Run a small DMA test.
486 	 * The magic multipliers to the length tell the firmware
487 	 * to do DMA read, write, or read+write tests.  The
488 	 * results are returned in cmd.data0.  The upper 16
489 	 * bits of the return is the number of transfers completed.
490 	 * The lower 16 bits is the time in 0.5us ticks that the
491 	 * transfers took to complete.
492 	 */
493 
494 	len = sc->tx_boundary;
495 
496 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
497 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
498 	cmd.data2 = len * 0x10000;
499 	status = mxge_send_cmd(sc, test_type, &cmd);
500 	if (status != 0) {
501 		test = "read";
502 		goto abort;
503 	}
504 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
505 		(cmd.data0 & 0xffff);
506 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
507 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
508 	cmd.data2 = len * 0x1;
509 	status = mxge_send_cmd(sc, test_type, &cmd);
510 	if (status != 0) {
511 		test = "write";
512 		goto abort;
513 	}
514 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
515 		(cmd.data0 & 0xffff);
516 
517 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
518 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
519 	cmd.data2 = len * 0x10001;
520 	status = mxge_send_cmd(sc, test_type, &cmd);
521 	if (status != 0) {
522 		test = "read/write";
523 		goto abort;
524 	}
525 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
526 		(cmd.data0 & 0xffff);
527 
528 abort:
529 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
530 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
531 			      test, status);
532 
533 	return status;
534 }
535 
536 /*
537  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
538  * when the PCI-E Completion packets are aligned on an 8-byte
539  * boundary.  Some PCI-E chip sets always align Completion packets; on
540  * the ones that do not, the alignment can be enforced by enabling
541  * ECRC generation (if supported).
542  *
543  * When PCI-E Completion packets are not aligned, it is actually more
544  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
545  *
546  * If the driver can neither enable ECRC nor verify that it has
547  * already been enabled, then it must use a firmware image which works
548  * around unaligned completion packets (ethp_z8e.dat), and it should
549  * also ensure that it never gives the device a Read-DMA which is
550  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
551  * enabled, then the driver should use the aligned (eth_z8e.dat)
552  * firmware image, and set tx_boundary to 4KB.
553  */
554 
555 static int
556 mxge_firmware_probe(mxge_softc_t *sc)
557 {
558 	device_t dev = sc->dev;
559 	int reg, status;
560 	uint16_t pectl;
561 
562 	sc->tx_boundary = 4096;
563 	/*
564 	 * Verify the max read request size was set to 4KB
565 	 * before trying the test with 4KB.
566 	 */
567 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
568 		pectl = pci_read_config(dev, reg + 0x8, 2);
569 		if ((pectl & (5 << 12)) != (5 << 12)) {
570 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
571 				      pectl);
572 			sc->tx_boundary = 2048;
573 		}
574 	}
575 
576 	/*
577 	 * load the optimized firmware (which assumes aligned PCIe
578 	 * completions) in order to see if it works on this host.
579 	 */
580 	sc->fw_name = mxge_fw_aligned;
581 	status = mxge_load_firmware(sc, 1);
582 	if (status != 0) {
583 		return status;
584 	}
585 
586 	/*
587 	 * Enable ECRC if possible
588 	 */
589 	mxge_enable_nvidia_ecrc(sc);
590 
591 	/*
592 	 * Run a DMA test which watches for unaligned completions and
593 	 * aborts on the first one seen.  Not required on Z8ES or newer.
594 	 */
595 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
596 		return 0;
597 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
598 	if (status == 0)
599 		return 0; /* keep the aligned firmware */
600 
601 	if (status != E2BIG)
602 		device_printf(dev, "DMA test failed: %d\n", status);
603 	if (status == ENOSYS)
604 		device_printf(dev, "Falling back to ethp! "
605 			      "Please install up to date fw\n");
606 	return status;
607 }
608 
609 static int
610 mxge_select_firmware(mxge_softc_t *sc)
611 {
612 	int aligned = 0;
613 	int force_firmware = mxge_force_firmware;
614 
615 	if (sc->throttle)
616 		force_firmware = sc->throttle;
617 
618 	if (force_firmware != 0) {
619 		if (force_firmware == 1)
620 			aligned = 1;
621 		else
622 			aligned = 0;
623 		if (mxge_verbose)
624 			device_printf(sc->dev,
625 				      "Assuming %s completions (forced)\n",
626 				      aligned ? "aligned" : "unaligned");
627 		goto abort;
628 	}
629 
630 	/* if the PCIe link width is 4 or less, we can use the aligned
631 	   firmware and skip any checks */
632 	if (sc->link_width != 0 && sc->link_width <= 4) {
633 		device_printf(sc->dev,
634 			      "PCIe x%d Link, expect reduced performance\n",
635 			      sc->link_width);
636 		aligned = 1;
637 		goto abort;
638 	}
639 
640 	if (0 == mxge_firmware_probe(sc))
641 		return 0;
642 
643 abort:
644 	if (aligned) {
645 		sc->fw_name = mxge_fw_aligned;
646 		sc->tx_boundary = 4096;
647 	} else {
648 		sc->fw_name = mxge_fw_unaligned;
649 		sc->tx_boundary = 2048;
650 	}
651 	return (mxge_load_firmware(sc, 0));
652 }
653 
654 static int
655 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
656 {
657 
658 
659 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
660 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
661 			      be32toh(hdr->mcp_type));
662 		return EIO;
663 	}
664 
665 	/* save firmware version for sysctl */
666 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
667 	if (mxge_verbose)
668 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
669 
670 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
671 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
672 
673 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
674 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
675 		device_printf(sc->dev, "Found firmware version %s\n",
676 			      sc->fw_version);
677 		device_printf(sc->dev, "Driver needs %d.%d\n",
678 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
679 		return EINVAL;
680 	}
681 	return 0;
682 
683 }
684 
685 static void *
686 z_alloc(void *nil, u_int items, u_int size)
687 {
688 	void *ptr;
689 
690 	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
691 	return ptr;
692 }
693 
694 static void
695 z_free(void *nil, void *ptr)
696 {
697 	free(ptr, M_TEMP);
698 }
699 
700 
701 static int
702 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
703 {
704 	z_stream zs;
705 	char *inflate_buffer;
706 	const struct firmware *fw;
707 	const mcp_gen_header_t *hdr;
708 	unsigned hdr_offset;
709 	int status;
710 	unsigned int i;
711 	char dummy;
712 	size_t fw_len;
713 
714 	fw = firmware_get(sc->fw_name);
715 	if (fw == NULL) {
716 		device_printf(sc->dev, "Could not find firmware image %s\n",
717 			      sc->fw_name);
718 		return ENOENT;
719 	}
720 
721 
722 
723 	/* setup zlib and decompress f/w */
724 	bzero(&zs, sizeof (zs));
725 	zs.zalloc = z_alloc;
726 	zs.zfree = z_free;
727 	status = inflateInit(&zs);
728 	if (status != Z_OK) {
729 		status = EIO;
730 		goto abort_with_fw;
731 	}
732 
733 	/* the uncompressed size is stored as the firmware version,
734 	   which would otherwise go unused */
735 	fw_len = (size_t) fw->version;
736 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
737 	if (inflate_buffer == NULL)
738 		goto abort_with_zs;
739 	zs.avail_in = fw->datasize;
740 	zs.next_in = __DECONST(char *, fw->data);
741 	zs.avail_out = fw_len;
742 	zs.next_out = inflate_buffer;
743 	status = inflate(&zs, Z_FINISH);
744 	if (status != Z_STREAM_END) {
745 		device_printf(sc->dev, "zlib %d\n", status);
746 		status = EIO;
747 		goto abort_with_buffer;
748 	}
749 
750 	/* check id */
751 	hdr_offset = htobe32(*(const uint32_t *)
752 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
753 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
754 		device_printf(sc->dev, "Bad firmware file");
755 		status = EIO;
756 		goto abort_with_buffer;
757 	}
758 	hdr = (const void*)(inflate_buffer + hdr_offset);
759 
760 	status = mxge_validate_firmware(sc, hdr);
761 	if (status != 0)
762 		goto abort_with_buffer;
763 
764 	/* Copy the inflated firmware to NIC SRAM. */
765 	for (i = 0; i < fw_len; i += 256) {
766 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
767 			      inflate_buffer + i,
768 			      min(256U, (unsigned)(fw_len - i)));
769 		wmb();
770 		dummy = *sc->sram;
771 		wmb();
772 	}
773 
774 	*limit = fw_len;
775 	status = 0;
776 abort_with_buffer:
777 	free(inflate_buffer, M_TEMP);
778 abort_with_zs:
779 	inflateEnd(&zs);
780 abort_with_fw:
781 	firmware_put(fw, FIRMWARE_UNLOAD);
782 	return status;
783 }
784 
785 /*
786  * Enable or disable periodic RDMAs from the host to make certain
787  * chipsets resend dropped PCIe messages
788  */
789 
790 static void
791 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
792 {
793 	char buf_bytes[72];
794 	volatile uint32_t *confirm;
795 	volatile char *submit;
796 	uint32_t *buf, dma_low, dma_high;
797 	int i;
798 
799 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
800 
801 	/* clear confirmation addr */
802 	confirm = (volatile uint32_t *)sc->cmd;
803 	*confirm = 0;
804 	wmb();
805 
806 	/* send an rdma command to the PCIe engine, and wait for the
807 	   response in the confirmation address.  The firmware should
808 	   write a -1 there to indicate it is alive and well
809 	*/
810 
811 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
812 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
813 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
814 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
815 	buf[2] = htobe32(0xffffffff);		/* confirm data */
816 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
817 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
818 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
819 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
820 	buf[5] = htobe32(enable);			/* enable? */
821 
822 
823 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
824 
825 	mxge_pio_copy(submit, buf, 64);
826 	wmb();
827 	DELAY(1000);
828 	wmb();
829 	i = 0;
830 	while (*confirm != 0xffffffff && i < 20) {
831 		DELAY(1000);
832 		i++;
833 	}
834 	if (*confirm != 0xffffffff) {
835 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
836 			      (enable ? "enable" : "disable"), confirm,
837 			      *confirm);
838 	}
839 	return;
840 }
841 
842 static int
843 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
844 {
845 	mcp_cmd_t *buf;
846 	char buf_bytes[sizeof(*buf) + 8];
847 	volatile mcp_cmd_response_t *response = sc->cmd;
848 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
849 	uint32_t dma_low, dma_high;
850 	int err, sleep_total = 0;
851 
852 	/* ensure buf is aligned to 8 bytes */
853 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
854 
855 	buf->data0 = htobe32(data->data0);
856 	buf->data1 = htobe32(data->data1);
857 	buf->data2 = htobe32(data->data2);
858 	buf->cmd = htobe32(cmd);
859 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
860 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
861 
862 	buf->response_addr.low = htobe32(dma_low);
863 	buf->response_addr.high = htobe32(dma_high);
864 	mtx_lock(&sc->cmd_mtx);
865 	response->result = 0xffffffff;
866 	wmb();
867 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
868 
869 	/* wait up to 20ms */
870 	err = EAGAIN;
871 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
872 		bus_dmamap_sync(sc->cmd_dma.dmat,
873 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
874 		wmb();
875 		switch (be32toh(response->result)) {
876 		case 0:
877 			data->data0 = be32toh(response->data);
878 			err = 0;
879 			break;
880 		case 0xffffffff:
881 			DELAY(1000);
882 			break;
883 		case MXGEFW_CMD_UNKNOWN:
884 			err = ENOSYS;
885 			break;
886 		case MXGEFW_CMD_ERROR_UNALIGNED:
887 			err = E2BIG;
888 			break;
889 		case MXGEFW_CMD_ERROR_BUSY:
890 			err = EBUSY;
891 			break;
892 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
893 			err = ENXIO;
894 			break;
895 		default:
896 			device_printf(sc->dev,
897 				      "mxge: command %d "
898 				      "failed, result = %d\n",
899 				      cmd, be32toh(response->result));
900 			err = ENXIO;
901 			break;
902 		}
903 		if (err != EAGAIN)
904 			break;
905 	}
906 	if (err == EAGAIN)
907 		device_printf(sc->dev, "mxge: command %d timed out"
908 			      "result = %d\n",
909 			      cmd, be32toh(response->result));
910 	mtx_unlock(&sc->cmd_mtx);
911 	return err;
912 }
913 
914 static int
915 mxge_adopt_running_firmware(mxge_softc_t *sc)
916 {
917 	struct mcp_gen_header *hdr;
918 	const size_t bytes = sizeof (struct mcp_gen_header);
919 	size_t hdr_offset;
920 	int status;
921 
922 	/* find running firmware header */
923 	hdr_offset = htobe32(*(volatile uint32_t *)
924 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
925 
926 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
927 		device_printf(sc->dev,
928 			      "Running firmware has bad header offset (%d)\n",
929 			      (int)hdr_offset);
930 		return EIO;
931 	}
932 
933 	/* copy header of running firmware from SRAM to host memory to
934 	 * validate firmware */
935 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
936 	if (hdr == NULL) {
937 		device_printf(sc->dev, "could not malloc firmware hdr\n");
938 		return ENOMEM;
939 	}
940 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
941 				rman_get_bushandle(sc->mem_res),
942 				hdr_offset, (char *)hdr, bytes);
943 	status = mxge_validate_firmware(sc, hdr);
944 	free(hdr, M_DEVBUF);
945 
946 	/*
947 	 * check to see if adopted firmware has bug where adopting
948 	 * it will cause broadcasts to be filtered unless the NIC
949 	 * is kept in ALLMULTI mode
950 	 */
951 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
952 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
953 		sc->adopted_rx_filter_bug = 1;
954 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
955 			      "working around rx filter bug\n",
956 			      sc->fw_ver_major, sc->fw_ver_minor,
957 			      sc->fw_ver_tiny);
958 	}
959 
960 	return status;
961 }
962 
963 
964 static int
965 mxge_load_firmware(mxge_softc_t *sc, int adopt)
966 {
967 	volatile uint32_t *confirm;
968 	volatile char *submit;
969 	char buf_bytes[72];
970 	uint32_t *buf, size, dma_low, dma_high;
971 	int status, i;
972 
973 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
974 
975 	size = sc->sram_size;
976 	status = mxge_load_firmware_helper(sc, &size);
977 	if (status) {
978 		if (!adopt)
979 			return status;
980 		/* Try to use the currently running firmware, if
981 		   it is new enough */
982 		status = mxge_adopt_running_firmware(sc);
983 		if (status) {
984 			device_printf(sc->dev,
985 				      "failed to adopt running firmware\n");
986 			return status;
987 		}
988 		device_printf(sc->dev,
989 			      "Successfully adopted running firmware\n");
990 		if (sc->tx_boundary == 4096) {
991 			device_printf(sc->dev,
992 				"Using firmware currently running on NIC"
993 				 ".  For optimal\n");
994 			device_printf(sc->dev,
995 				 "performance consider loading optimized "
996 				 "firmware\n");
997 		}
998 		sc->fw_name = mxge_fw_unaligned;
999 		sc->tx_boundary = 2048;
1000 		return 0;
1001 	}
1002 	/* clear confirmation addr */
1003 	confirm = (volatile uint32_t *)sc->cmd;
1004 	*confirm = 0;
1005 	wmb();
1006 	/* send a reload command to the bootstrap MCP, and wait for the
1007 	   response in the confirmation address.  The firmware should
1008 	   write a -1 there to indicate it is alive and well
1009 	*/
1010 
1011 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1012 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1013 
1014 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1015 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1016 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1017 
1018 	/* FIX: All newest firmware should un-protect the bottom of
1019 	   the sram before handoff. However, the very first interfaces
1020 	   do not. Therefore the handoff copy must skip the first 8 bytes
1021 	*/
1022 					/* where the code starts*/
1023 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1024 	buf[4] = htobe32(size - 8); 	/* length of code */
1025 	buf[5] = htobe32(8);		/* where to copy to */
1026 	buf[6] = htobe32(0);		/* where to jump to */
1027 
1028 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1029 	mxge_pio_copy(submit, buf, 64);
1030 	wmb();
1031 	DELAY(1000);
1032 	wmb();
1033 	i = 0;
1034 	while (*confirm != 0xffffffff && i < 20) {
1035 		DELAY(1000*10);
1036 		i++;
1037 		bus_dmamap_sync(sc->cmd_dma.dmat,
1038 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1039 	}
1040 	if (*confirm != 0xffffffff) {
1041 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1042 			confirm, *confirm);
1043 
1044 		return ENXIO;
1045 	}
1046 	return 0;
1047 }
1048 
1049 static int
1050 mxge_update_mac_address(mxge_softc_t *sc)
1051 {
1052 	mxge_cmd_t cmd;
1053 	uint8_t *addr = sc->mac_addr;
1054 	int status;
1055 
1056 
1057 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1058 		     | (addr[2] << 8) | addr[3]);
1059 
1060 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1061 
1062 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1063 	return status;
1064 }
1065 
1066 static int
1067 mxge_change_pause(mxge_softc_t *sc, int pause)
1068 {
1069 	mxge_cmd_t cmd;
1070 	int status;
1071 
1072 	if (pause)
1073 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1074 				       &cmd);
1075 	else
1076 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1077 				       &cmd);
1078 
1079 	if (status) {
1080 		device_printf(sc->dev, "Failed to set flow control mode\n");
1081 		return ENXIO;
1082 	}
1083 	sc->pause = pause;
1084 	return 0;
1085 }
1086 
1087 static void
1088 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1089 {
1090 	mxge_cmd_t cmd;
1091 	int status;
1092 
1093 	if (mxge_always_promisc)
1094 		promisc = 1;
1095 
1096 	if (promisc)
1097 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1098 				       &cmd);
1099 	else
1100 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1101 				       &cmd);
1102 
1103 	if (status) {
1104 		device_printf(sc->dev, "Failed to set promisc mode\n");
1105 	}
1106 }
1107 
1108 static void
1109 mxge_set_multicast_list(mxge_softc_t *sc)
1110 {
1111 	mxge_cmd_t cmd;
1112 	struct ifmultiaddr *ifma;
1113 	struct ifnet *ifp = sc->ifp;
1114 	int err;
1115 
1116 	/* This firmware is known to not support multicast */
1117 	if (!sc->fw_multicast_support)
1118 		return;
1119 
1120 	/* Disable multicast filtering while we play with the lists*/
1121 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1122 	if (err != 0) {
1123 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1124 		       " error status: %d\n", err);
1125 		return;
1126 	}
1127 
1128 	if (sc->adopted_rx_filter_bug)
1129 		return;
1130 
1131 	if (ifp->if_flags & IFF_ALLMULTI)
1132 		/* request to disable multicast filtering, so quit here */
1133 		return;
1134 
1135 	/* Flush all the filters */
1136 
1137 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1138 	if (err != 0) {
1139 		device_printf(sc->dev,
1140 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1141 			      ", error status: %d\n", err);
1142 		return;
1143 	}
1144 
1145 	/* Walk the multicast list, and add each address */
1146 
1147 	if_maddr_rlock(ifp);
1148 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1149 		if (ifma->ifma_addr->sa_family != AF_LINK)
1150 			continue;
1151 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1152 		      &cmd.data0, 4);
1153 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1154 		      &cmd.data1, 2);
1155 		cmd.data0 = htonl(cmd.data0);
1156 		cmd.data1 = htonl(cmd.data1);
1157 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1158 		if (err != 0) {
1159 			device_printf(sc->dev, "Failed "
1160 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1161 			       "%d\t", err);
1162 			/* abort, leaving multicast filtering off */
1163 			if_maddr_runlock(ifp);
1164 			return;
1165 		}
1166 	}
1167 	if_maddr_runlock(ifp);
1168 	/* Enable multicast filtering */
1169 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1170 	if (err != 0) {
1171 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1172 		       ", error status: %d\n", err);
1173 	}
1174 }
1175 
1176 static int
1177 mxge_max_mtu(mxge_softc_t *sc)
1178 {
1179 	mxge_cmd_t cmd;
1180 	int status;
1181 
1182 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1183 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1184 
1185 	/* try to set nbufs to see if it we can
1186 	   use virtually contiguous jumbos */
1187 	cmd.data0 = 0;
1188 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1189 			       &cmd);
1190 	if (status == 0)
1191 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1192 
1193 	/* otherwise, we're limited to MJUMPAGESIZE */
1194 	return MJUMPAGESIZE - MXGEFW_PAD;
1195 }
1196 
/*
 * Reset the firmware and rebuild the state shared between driver and
 * firmware.
 *
 * sc:               per-device softc.
 * interrupts_setup: when non-zero, each slice's rx_done ring is
 *                   cleared and its DMA address is handed to the
 *                   firmware with MXGEFW_CMD_SET_INTRQ_DMA.
 *
 * Returns ENXIO if the initial reset command fails, the failing
 * command's status if an RSS queue command fails, the OR of the
 * collected statuses if any interrupt-parameter command fails, and
 * otherwise the status of mxge_update_mac_address().  Failures of the
 * promisc, pause, multicast and throttle steps at the end do not
 * affect the return value.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	/* NOTE(review): this status is overwritten below when
	   num_slices > 1, and is otherwise only folded into the
	   "status |=" chain further down */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			/* clear the ring, then pass its bus address
			   (split into two 32-bit halves) and the slice
			   number to the firmware */
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	/*
	 * The next three commands each return an offset in cmd.data0
	 * which is added to sc->sram to form a pointer.  The statuses
	 * are OR-ed together and only checked once, after all three
	 * pointers have been computed.
	 */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	/* write the configured coalescing delay, big-endian, through
	   the pointer obtained above */
	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		/* each slice's claim pointer sits two 32-bit words
		   after the previous slice's */
		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	/* re-apply the settings that the reset discarded; only the
	   MAC address status is propagated to the caller */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}
1340 
1341 static int
1342 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1343 {
1344 	mxge_cmd_t cmd;
1345 	mxge_softc_t *sc;
1346 	int err;
1347 	unsigned int throttle;
1348 
1349 	sc = arg1;
1350 	throttle = sc->throttle;
1351 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1352 	if (err != 0) {
1353 		return err;
1354 	}
1355 
1356 	if (throttle == sc->throttle)
1357 		return 0;
1358 
1359 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1360 		return EINVAL;
1361 
1362 	mtx_lock(&sc->driver_mtx);
1363 	cmd.data0 = throttle;
1364 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1365 	if (err == 0)
1366 		sc->throttle = throttle;
1367 	mtx_unlock(&sc->driver_mtx);
1368 	return err;
1369 }
1370 
1371 static int
1372 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1373 {
1374 	mxge_softc_t *sc;
1375 	unsigned int intr_coal_delay;
1376 	int err;
1377 
1378 	sc = arg1;
1379 	intr_coal_delay = sc->intr_coal_delay;
1380 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1381 	if (err != 0) {
1382 		return err;
1383 	}
1384 	if (intr_coal_delay == sc->intr_coal_delay)
1385 		return 0;
1386 
1387 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1388 		return EINVAL;
1389 
1390 	mtx_lock(&sc->driver_mtx);
1391 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1392 	sc->intr_coal_delay = intr_coal_delay;
1393 
1394 	mtx_unlock(&sc->driver_mtx);
1395 	return err;
1396 }
1397 
1398 static int
1399 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1400 {
1401 	mxge_softc_t *sc;
1402 	unsigned int enabled;
1403 	int err;
1404 
1405 	sc = arg1;
1406 	enabled = sc->pause;
1407 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1408 	if (err != 0) {
1409 		return err;
1410 	}
1411 	if (enabled == sc->pause)
1412 		return 0;
1413 
1414 	mtx_lock(&sc->driver_mtx);
1415 	err = mxge_change_pause(sc, enabled);
1416 	mtx_unlock(&sc->driver_mtx);
1417 	return err;
1418 }
1419 
1420 static int
1421 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1422 {
1423 	int err;
1424 
1425 	if (arg1 == NULL)
1426 		return EFAULT;
1427 	arg2 = be32toh(*(int *)arg1);
1428 	arg1 = NULL;
1429 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1430 
1431 	return err;
1432 }
1433 
1434 static void
1435 mxge_rem_sysctls(mxge_softc_t *sc)
1436 {
1437 	struct mxge_slice_state *ss;
1438 	int slice;
1439 
1440 	if (sc->slice_sysctl_tree == NULL)
1441 		return;
1442 
1443 	for (slice = 0; slice < sc->num_slices; slice++) {
1444 		ss = &sc->ss[slice];
1445 		if (ss == NULL || ss->sysctl_tree == NULL)
1446 			continue;
1447 		sysctl_ctx_free(&ss->sysctl_ctx);
1448 		ss->sysctl_tree = NULL;
1449 	}
1450 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1451 	sc->slice_sysctl_tree = NULL;
1452 }
1453 
1454 static void
1455 mxge_add_sysctls(mxge_softc_t *sc)
1456 {
1457 	struct sysctl_ctx_list *ctx;
1458 	struct sysctl_oid_list *children;
1459 	mcp_irq_data_t *fw;
1460 	struct mxge_slice_state *ss;
1461 	int slice;
1462 	char slice_num[8];
1463 
1464 	ctx = device_get_sysctl_ctx(sc->dev);
1465 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1466 	fw = sc->ss[0].fw_stats;
1467 
1468 	/* random information */
1469 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1470 		       "firmware_version",
1471 		       CTLFLAG_RD, sc->fw_version,
1472 		       0, "firmware version");
1473 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1474 		       "serial_number",
1475 		       CTLFLAG_RD, sc->serial_number_string,
1476 		       0, "serial number");
1477 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1478 		       "product_code",
1479 		       CTLFLAG_RD, sc->product_code_string,
1480 		       0, "product_code");
1481 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1482 		       "pcie_link_width",
1483 		       CTLFLAG_RD, &sc->link_width,
1484 		       0, "tx_boundary");
1485 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486 		       "tx_boundary",
1487 		       CTLFLAG_RD, &sc->tx_boundary,
1488 		       0, "tx_boundary");
1489 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490 		       "write_combine",
1491 		       CTLFLAG_RD, &sc->wc,
1492 		       0, "write combining PIO?");
1493 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1494 		       "read_dma_MBs",
1495 		       CTLFLAG_RD, &sc->read_dma,
1496 		       0, "DMA Read speed in MB/s");
1497 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1498 		       "write_dma_MBs",
1499 		       CTLFLAG_RD, &sc->write_dma,
1500 		       0, "DMA Write speed in MB/s");
1501 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1502 		       "read_write_dma_MBs",
1503 		       CTLFLAG_RD, &sc->read_write_dma,
1504 		       0, "DMA concurrent Read/Write speed in MB/s");
1505 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1506 		       "watchdog_resets",
1507 		       CTLFLAG_RD, &sc->watchdog_resets,
1508 		       0, "Number of times NIC was reset");
1509 
1510 
1511 	/* performance related tunables */
1512 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513 			"intr_coal_delay",
1514 			CTLTYPE_INT|CTLFLAG_RW, sc,
1515 			0, mxge_change_intr_coal,
1516 			"I", "interrupt coalescing delay in usecs");
1517 
1518 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1519 			"throttle",
1520 			CTLTYPE_INT|CTLFLAG_RW, sc,
1521 			0, mxge_change_throttle,
1522 			"I", "transmit throttling");
1523 
1524 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525 			"flow_control_enabled",
1526 			CTLTYPE_INT|CTLFLAG_RW, sc,
1527 			0, mxge_change_flow_control,
1528 			"I", "interrupt coalescing delay in usecs");
1529 
1530 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 		       "deassert_wait",
1532 		       CTLFLAG_RW, &mxge_deassert_wait,
1533 		       0, "Wait for IRQ line to go low in ihandler");
1534 
1535 	/* stats block from firmware is in network byte order.
1536 	   Need to swap it */
1537 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 			"link_up",
1539 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1540 			0, mxge_handle_be32,
1541 			"I", "link up");
1542 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 			"rdma_tags_available",
1544 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1545 			0, mxge_handle_be32,
1546 			"I", "rdma_tags_available");
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"dropped_bad_crc32",
1549 			CTLTYPE_INT|CTLFLAG_RD,
1550 			&fw->dropped_bad_crc32,
1551 			0, mxge_handle_be32,
1552 			"I", "dropped_bad_crc32");
1553 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 			"dropped_bad_phy",
1555 			CTLTYPE_INT|CTLFLAG_RD,
1556 			&fw->dropped_bad_phy,
1557 			0, mxge_handle_be32,
1558 			"I", "dropped_bad_phy");
1559 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 			"dropped_link_error_or_filtered",
1561 			CTLTYPE_INT|CTLFLAG_RD,
1562 			&fw->dropped_link_error_or_filtered,
1563 			0, mxge_handle_be32,
1564 			"I", "dropped_link_error_or_filtered");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 			"dropped_link_overflow",
1567 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1568 			0, mxge_handle_be32,
1569 			"I", "dropped_link_overflow");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 			"dropped_multicast_filtered",
1572 			CTLTYPE_INT|CTLFLAG_RD,
1573 			&fw->dropped_multicast_filtered,
1574 			0, mxge_handle_be32,
1575 			"I", "dropped_multicast_filtered");
1576 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 			"dropped_no_big_buffer",
1578 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1579 			0, mxge_handle_be32,
1580 			"I", "dropped_no_big_buffer");
1581 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 			"dropped_no_small_buffer",
1583 			CTLTYPE_INT|CTLFLAG_RD,
1584 			&fw->dropped_no_small_buffer,
1585 			0, mxge_handle_be32,
1586 			"I", "dropped_no_small_buffer");
1587 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 			"dropped_overrun",
1589 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1590 			0, mxge_handle_be32,
1591 			"I", "dropped_overrun");
1592 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593 			"dropped_pause",
1594 			CTLTYPE_INT|CTLFLAG_RD,
1595 			&fw->dropped_pause,
1596 			0, mxge_handle_be32,
1597 			"I", "dropped_pause");
1598 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599 			"dropped_runt",
1600 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1601 			0, mxge_handle_be32,
1602 			"I", "dropped_runt");
1603 
1604 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 			"dropped_unicast_filtered",
1606 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1607 			0, mxge_handle_be32,
1608 			"I", "dropped_unicast_filtered");
1609 
1610 	/* verbose printing? */
1611 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612 		       "verbose",
1613 		       CTLFLAG_RW, &mxge_verbose,
1614 		       0, "verbose printing");
1615 
1616 	/* add counters exported for debugging from all slices */
1617 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1618 	sc->slice_sysctl_tree =
1619 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1620 				"slice", CTLFLAG_RD, 0, "");
1621 
1622 	for (slice = 0; slice < sc->num_slices; slice++) {
1623 		ss = &sc->ss[slice];
1624 		sysctl_ctx_init(&ss->sysctl_ctx);
1625 		ctx = &ss->sysctl_ctx;
1626 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1627 		sprintf(slice_num, "%d", slice);
1628 		ss->sysctl_tree =
1629 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1630 					CTLFLAG_RD, 0, "");
1631 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1632 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 			       "rx_small_cnt",
1634 			       CTLFLAG_RD, &ss->rx_small.cnt,
1635 			       0, "rx_small_cnt");
1636 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 			       "rx_big_cnt",
1638 			       CTLFLAG_RD, &ss->rx_big.cnt,
1639 			       0, "rx_small_cnt");
1640 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1641 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1642 			       0, "number of lro merge queues flushed");
1643 
1644 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1645 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1646 			       0, "number of bad csums preventing LRO");
1647 
1648 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1649 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1650 			       0, "number of frames appended to lro merge"
1651 			       "queues");
1652 
1653 #ifndef IFNET_BUF_RING
1654 		/* only transmit from slice 0 for now */
1655 		if (slice > 0)
1656 			continue;
1657 #endif
1658 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 			       "tx_req",
1660 			       CTLFLAG_RD, &ss->tx.req,
1661 			       0, "tx_req");
1662 
1663 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 			       "tx_done",
1665 			       CTLFLAG_RD, &ss->tx.done,
1666 			       0, "tx_done");
1667 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 			       "tx_pkt_done",
1669 			       CTLFLAG_RD, &ss->tx.pkt_done,
1670 			       0, "tx_done");
1671 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 			       "tx_stall",
1673 			       CTLFLAG_RD, &ss->tx.stall,
1674 			       0, "tx_stall");
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "tx_wake",
1677 			       CTLFLAG_RD, &ss->tx.wake,
1678 			       0, "tx_wake");
1679 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680 			       "tx_defrag",
1681 			       CTLFLAG_RD, &ss->tx.defrag,
1682 			       0, "tx_defrag");
1683 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 			       "tx_queue_active",
1685 			       CTLFLAG_RD, &ss->tx.queue_active,
1686 			       0, "tx_queue_active");
1687 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688 			       "tx_activate",
1689 			       CTLFLAG_RD, &ss->tx.activate,
1690 			       0, "tx_activate");
1691 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692 			       "tx_deactivate",
1693 			       CTLFLAG_RD, &ss->tx.deactivate,
1694 			       0, "tx_deactivate");
1695 	}
1696 }
1697 
1698 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1699    backwards one at a time and handle ring wraps */
1700 
1701 static inline void
1702 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1703 			    mcp_kreq_ether_send_t *src, int cnt)
1704 {
1705 	int idx, starting_slot;
1706 	starting_slot = tx->req;
1707 	while (cnt > 1) {
1708 		cnt--;
1709 		idx = (starting_slot + cnt) & tx->mask;
1710 		mxge_pio_copy(&tx->lanai[idx],
1711 			      &src[cnt], sizeof(*src));
1712 		wmb();
1713 	}
1714 }
1715 
/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 *
 * tx:  transmit ring; tx->req selects the first slot and is advanced
 *      by cnt before returning.
 * src: array of cnt requests.  src[0].flags is cleared while the
 *      chain is copied and restored before the final write.
 * cnt: number of requests in src.
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	/* remember the first request's flags and send it with flags 0
	   for now; the real flags are written last, below */
	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		/* no wrap: copy forwards, two requests per PIO copy */
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		/* (on the forward path srcp/dstp have advanced, so this
		   instead copies the single request left over when cnt
		   is odd) */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	/* i.e. restore the saved flags in src[0] and copy the fourth
	   32-bit word of src[0] over the same word of the first slot */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints+=3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints+=3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	wmb();
}
1771 
1772 static int
1773 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1774     struct mxge_pkt_info *pi)
1775 {
1776 	struct ether_vlan_header *eh;
1777 	uint16_t etype;
1778 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1779 #if IFCAP_TSO6 && defined(INET6)
1780 	int nxt;
1781 #endif
1782 
1783 	eh = mtod(m, struct ether_vlan_header *);
1784 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1785 		etype = ntohs(eh->evl_proto);
1786 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1787 	} else {
1788 		etype = ntohs(eh->evl_encap_proto);
1789 		pi->ip_off = ETHER_HDR_LEN;
1790 	}
1791 
1792 	switch (etype) {
1793 	case ETHERTYPE_IP:
1794 		/*
1795 		 * ensure ip header is in first mbuf, copy it to a
1796 		 * scratch buffer if not
1797 		 */
1798 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1799 		pi->ip6 = NULL;
1800 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1801 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1802 			    ss->scratch);
1803 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1804 		}
1805 		pi->ip_hlen = pi->ip->ip_hl << 2;
1806 		if (!tso)
1807 			return 0;
1808 
1809 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1810 		    sizeof(struct tcphdr))) {
1811 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1812 			    sizeof(struct tcphdr), ss->scratch);
1813 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1814 		}
1815 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1816 		break;
1817 #if IFCAP_TSO6 && defined(INET6)
1818 	case ETHERTYPE_IPV6:
1819 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1820 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1821 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1822 			    ss->scratch);
1823 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1824 		}
1825 		nxt = 0;
1826 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1827 		pi->ip_hlen -= pi->ip_off;
1828 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1829 			return EINVAL;
1830 
1831 		if (!tso)
1832 			return 0;
1833 
1834 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1835 			return EINVAL;
1836 
1837 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1838 		    sizeof(struct tcphdr))) {
1839 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1840 			    sizeof(struct tcphdr), ss->scratch);
1841 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1842 		}
1843 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1844 		break;
1845 #endif
1846 	default:
1847 		return EINVAL;
1848 	}
1849 	return 0;
1850 }
1851 
#if IFCAP_TSO4

/*
 * Build the send-request chain for a TSO packet from the DMA segments
 * in tx->seg_list and submit it with mxge_submit_req().
 *
 * ss:             slice whose transmit ring is used.
 * m:              the packet; m_pkthdr.tso_segsz supplies the MSS.
 * busdma_seg_cnt: number of entries of tx->seg_list to consume.
 * pi:             parsed header info; ip_off, ip_hlen, tcp and ip or
 *                 ip6 are read here.
 *
 * If the chain would need more than tx->max_desc requests the DMA map
 * is unloaded, the mbuf is freed, ss->oerrors is incremented and a
 * diagnostic is printed the first time this happens.
 */
static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));

	/* TSO implies checksum offload on this hardware */
	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
		/*
		 * If packet has full TCP csum, replace it with pseudo hdr
		 * sum that the NIC expects, otherwise the NIC will emit
		 * packets with bad TCP checksums.
		 */
		/* NOTE(review): "sum" is assigned only inside the two
		 * conditionally compiled branches below; if the matching
		 * branch is compiled out it reaches m_copyback() unset. */
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (pi->ip6) {
#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
			sum = in6_cksum_pseudo(pi->ip6,
			    m->m_pkthdr.len - cksum_offset,
			    IPPROTO_TCP, 0);
#endif
		} else {
#ifdef INET
			m->m_pkthdr.csum_flags |= CSUM_TCP;
			sum = in_pseudo(pi->ip->ip_src.s_addr,
			    pi->ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
				    cksum_offset)));
#endif
		}
		m_copyback(m, offsetof(struct tcphdr, th_sum) +
		    cksum_offset, sizeof(sum), (caddr_t)&sum);
	}
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	if (pi->ip6) {
		/*
		 * for IPv6 TSO, the "checksum offset" is re-purposed
		 * to store the TCP header len
		 */
		cksum_offset = (pi->tcp->th_off << 2);
	}

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* retroactively patch the request that started
			   the current run of RDMAs */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* branch-free update: rdma_count becomes
				   -1 when this request is chopped or the
				   next one starts a segment, and is then
				   bumped to 0 for a chop that does not
				   end on a segment boundary */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* for IPv4, count the checksum offset down by the
			   bytes just consumed until it reaches zero */
			if (cksum_offset != 0 && !pi->ip6) {
				if (__predict_false(cksum_offset > seglen))
					cksum_offset -= seglen;
				else
					cksum_offset = 0;
			}
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards marking requests TSO_LAST, stopping after the
	   first one that carries TSO_CHOP or FIRST */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	/* flag the ring slot of the final request of this packet */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	ss->oerrors++;
	if (!once) {
		/* the second value printed is the byte distance between
		   the current segment pointer and the start of the list */
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}

#endif /* IFCAP_TSO4 */
2039 
#ifdef MXGE_NEW_VLAN_API
/*
 * Software 802.1Q tag insertion, modelled on
 * net/if_vlan.c:vlan_start().  Doing the insertion here lets the
 * driver advertise "hardware" vlan tagging, which is what makes the
 * vlan interface honour our checksum offload flags.
 *
 * Takes a frame whose tag is held in m_pkthdr.ether_vtag and returns
 * the (possibly reallocated) mbuf with the tag placed in the frame
 * and M_VLANTAG cleared, or NULL if making room for the header
 * failed.
 */
static struct mbuf *
mxge_vlan_tag_insert(struct mbuf *m)
{
	struct ether_vlan_header *hdr;

	/* open up ETHER_VLAN_ENCAP_LEN bytes in front of the frame */
	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);

	/* make sure the complete vlan header is contiguous */
	if (m != NULL && m->m_len < sizeof(*hdr))
		m = m_pullup(m, sizeof(*hdr));
	if (__predict_false(m == NULL))
		return NULL;

	/*
	 * Slide the two MAC addresses forward to the new start of the
	 * frame, then fill in the encapsulation type and the tag in
	 * the space that opened up behind them.
	 */
	hdr = mtod(m, struct ether_vlan_header *);
	bcopy((char *)hdr + ETHER_VLAN_ENCAP_LEN,
	      (char *)hdr, ETHER_HDR_LEN - ETHER_TYPE_LEN);
	hdr->evl_tag = htons(m->m_pkthdr.ether_vtag);
	hdr->evl_encap_proto = htons(ETHERTYPE_VLAN);

	m->m_flags &= ~M_VLANTAG;
	return m;
}
#endif /* MXGE_NEW_VLAN_API */
2073 
2074 static void
2075 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2076 {
2077 	struct mxge_pkt_info pi = {0,0,0,0};
2078 	mxge_softc_t *sc;
2079 	mcp_kreq_ether_send_t *req;
2080 	bus_dma_segment_t *seg;
2081 	struct mbuf *m_tmp;
2082 	struct ifnet *ifp;
2083 	mxge_tx_ring_t *tx;
2084 	int cnt, cum_len, err, i, idx, odd_flag;
2085 	uint16_t pseudo_hdr_offset;
2086 	uint8_t flags, cksum_offset;
2087 
2088 
2089 	sc = ss->sc;
2090 	ifp = sc->ifp;
2091 	tx = &ss->tx;
2092 
2093 #ifdef MXGE_NEW_VLAN_API
2094 	if (m->m_flags & M_VLANTAG) {
2095 		m = mxge_vlan_tag_insert(m);
2096 		if (__predict_false(m == NULL))
2097 			goto drop_without_m;
2098 	}
2099 #endif
2100 	if (m->m_pkthdr.csum_flags &
2101 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2102 		if (mxge_parse_tx(ss, m, &pi))
2103 			goto drop;
2104 	}
2105 
2106 	/* (try to) map the frame for DMA */
2107 	idx = tx->req & tx->mask;
2108 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2109 				      m, tx->seg_list, &cnt,
2110 				      BUS_DMA_NOWAIT);
2111 	if (__predict_false(err == EFBIG)) {
2112 		/* Too many segments in the chain.  Try
2113 		   to defrag */
2114 		m_tmp = m_defrag(m, M_NOWAIT);
2115 		if (m_tmp == NULL) {
2116 			goto drop;
2117 		}
2118 		ss->tx.defrag++;
2119 		m = m_tmp;
2120 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2121 					      tx->info[idx].map,
2122 					      m, tx->seg_list, &cnt,
2123 					      BUS_DMA_NOWAIT);
2124 	}
2125 	if (__predict_false(err != 0)) {
2126 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2127 			      " packet len = %d\n", err, m->m_pkthdr.len);
2128 		goto drop;
2129 	}
2130 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2131 			BUS_DMASYNC_PREWRITE);
2132 	tx->info[idx].m = m;
2133 
2134 #if IFCAP_TSO4
2135 	/* TSO is different enough, we handle it in another routine */
2136 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2137 		mxge_encap_tso(ss, m, cnt, &pi);
2138 		return;
2139 	}
2140 #endif
2141 
2142 	req = tx->req_list;
2143 	cksum_offset = 0;
2144 	pseudo_hdr_offset = 0;
2145 	flags = MXGEFW_FLAGS_NO_TSO;
2146 
2147 	/* checksum offloading? */
2148 	if (m->m_pkthdr.csum_flags &
2149 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2150 		/* ensure ip header is in first mbuf, copy
2151 		   it to a scratch buffer if not */
2152 		cksum_offset = pi.ip_off + pi.ip_hlen;
2153 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2154 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2155 		req->cksum_offset = cksum_offset;
2156 		flags |= MXGEFW_FLAGS_CKSUM;
2157 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2158 	} else {
2159 		odd_flag = 0;
2160 	}
2161 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2162 		flags |= MXGEFW_FLAGS_SMALL;
2163 
2164 	/* convert segments into a request list */
2165 	cum_len = 0;
2166 	seg = tx->seg_list;
2167 	req->flags = MXGEFW_FLAGS_FIRST;
2168 	for (i = 0; i < cnt; i++) {
2169 		req->addr_low =
2170 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2171 		req->addr_high =
2172 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2173 		req->length = htobe16(seg->ds_len);
2174 		req->cksum_offset = cksum_offset;
2175 		if (cksum_offset > seg->ds_len)
2176 			cksum_offset -= seg->ds_len;
2177 		else
2178 			cksum_offset = 0;
2179 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2180 		req->pad = 0; /* complete solid 16-byte block */
2181 		req->rdma_count = 1;
2182 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2183 		cum_len += seg->ds_len;
2184 		seg++;
2185 		req++;
2186 		req->flags = 0;
2187 	}
2188 	req--;
2189 	/* pad runts to 60 bytes */
2190 	if (cum_len < 60) {
2191 		req++;
2192 		req->addr_low =
2193 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2194 		req->addr_high =
2195 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2196 		req->length = htobe16(60 - cum_len);
2197 		req->cksum_offset = 0;
2198 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2199 		req->pad = 0; /* complete solid 16-byte block */
2200 		req->rdma_count = 1;
2201 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2202 		cnt++;
2203 	}
2204 
2205 	tx->req_list[0].rdma_count = cnt;
2206 #if 0
2207 	/* print what the firmware will see */
2208 	for (i = 0; i < cnt; i++) {
2209 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2210 		    "cso:%d, flags:0x%x, rdma:%d\n",
2211 		    i, (int)ntohl(tx->req_list[i].addr_high),
2212 		    (int)ntohl(tx->req_list[i].addr_low),
2213 		    (int)ntohs(tx->req_list[i].length),
2214 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2215 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2216 		    tx->req_list[i].rdma_count);
2217 	}
2218 	printf("--------------\n");
2219 #endif
2220 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2221 	mxge_submit_req(tx, tx->req_list, cnt);
2222 #ifdef IFNET_BUF_RING
2223 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2224 		/* tell the NIC to start polling this slice */
2225 		*tx->send_go = 1;
2226 		tx->queue_active = 1;
2227 		tx->activate++;
2228 		wmb();
2229 	}
2230 #endif
2231 	return;
2232 
2233 drop:
2234 	m_freem(m);
2235 drop_without_m:
2236 	ss->oerrors++;
2237 	return;
2238 }
2239 
2240 #ifdef IFNET_BUF_RING
2241 static void
2242 mxge_qflush(struct ifnet *ifp)
2243 {
2244 	mxge_softc_t *sc = ifp->if_softc;
2245 	mxge_tx_ring_t *tx;
2246 	struct mbuf *m;
2247 	int slice;
2248 
2249 	for (slice = 0; slice < sc->num_slices; slice++) {
2250 		tx = &sc->ss[slice].tx;
2251 		mtx_lock(&tx->mtx);
2252 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253 			m_freem(m);
2254 		mtx_unlock(&tx->mtx);
2255 	}
2256 	if_qflush(ifp);
2257 }
2258 
2259 static inline void
2260 mxge_start_locked(struct mxge_slice_state *ss)
2261 {
2262 	mxge_softc_t *sc;
2263 	struct mbuf *m;
2264 	struct ifnet *ifp;
2265 	mxge_tx_ring_t *tx;
2266 
2267 	sc = ss->sc;
2268 	ifp = sc->ifp;
2269 	tx = &ss->tx;
2270 
2271 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272 		m = drbr_dequeue(ifp, tx->br);
2273 		if (m == NULL) {
2274 			return;
2275 		}
2276 		/* let BPF see it */
2277 		BPF_MTAP(ifp, m);
2278 
2279 		/* give it to the nic */
2280 		mxge_encap(ss, m);
2281 	}
2282 	/* ran out of transmit slots */
2283 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284 	    && (!drbr_empty(ifp, tx->br))) {
2285 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286 		tx->stall++;
2287 	}
2288 }
2289 
2290 static int
2291 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292 {
2293 	mxge_softc_t *sc;
2294 	struct ifnet *ifp;
2295 	mxge_tx_ring_t *tx;
2296 	int err;
2297 
2298 	sc = ss->sc;
2299 	ifp = sc->ifp;
2300 	tx = &ss->tx;
2301 
2302 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303 	    IFF_DRV_RUNNING) {
2304 		err = drbr_enqueue(ifp, tx->br, m);
2305 		return (err);
2306 	}
2307 
2308 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2309 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310 		/* let BPF see it */
2311 		BPF_MTAP(ifp, m);
2312 		/* give it to the nic */
2313 		mxge_encap(ss, m);
2314 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315 		return (err);
2316 	}
2317 	if (!drbr_empty(ifp, tx->br))
2318 		mxge_start_locked(ss);
2319 	return (0);
2320 }
2321 
2322 static int
2323 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324 {
2325 	mxge_softc_t *sc = ifp->if_softc;
2326 	struct mxge_slice_state *ss;
2327 	mxge_tx_ring_t *tx;
2328 	int err = 0;
2329 	int slice;
2330 
2331 	slice = m->m_pkthdr.flowid;
2332 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333 
2334 	ss = &sc->ss[slice];
2335 	tx = &ss->tx;
2336 
2337 	if (mtx_trylock(&tx->mtx)) {
2338 		err = mxge_transmit_locked(ss, m);
2339 		mtx_unlock(&tx->mtx);
2340 	} else {
2341 		err = drbr_enqueue(ifp, tx->br, m);
2342 	}
2343 
2344 	return (err);
2345 }
2346 
2347 #else
2348 
2349 static inline void
2350 mxge_start_locked(struct mxge_slice_state *ss)
2351 {
2352 	mxge_softc_t *sc;
2353 	struct mbuf *m;
2354 	struct ifnet *ifp;
2355 	mxge_tx_ring_t *tx;
2356 
2357 	sc = ss->sc;
2358 	ifp = sc->ifp;
2359 	tx = &ss->tx;
2360 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362 		if (m == NULL) {
2363 			return;
2364 		}
2365 		/* let BPF see it */
2366 		BPF_MTAP(ifp, m);
2367 
2368 		/* give it to the nic */
2369 		mxge_encap(ss, m);
2370 	}
2371 	/* ran out of transmit slots */
2372 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374 		tx->stall++;
2375 	}
2376 }
2377 #endif
2378 static void
2379 mxge_start(struct ifnet *ifp)
2380 {
2381 	mxge_softc_t *sc = ifp->if_softc;
2382 	struct mxge_slice_state *ss;
2383 
2384 	/* only use the first slice for now */
2385 	ss = &sc->ss[0];
2386 	mtx_lock(&ss->tx.mtx);
2387 	mxge_start_locked(ss);
2388 	mtx_unlock(&ss->tx.mtx);
2389 }
2390 
2391 /*
2392  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2393  * at most 32 bytes at a time, so as to avoid involving the software
2394  * pio handler in the nic.   We re-write the first segment's low
2395  * DMA address to mark it valid only after we write the entire chunk
2396  * in a burst
2397  */
2398 static inline void
2399 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2400 		mcp_kreq_ether_recv_t *src)
2401 {
2402 	uint32_t low;
2403 
2404 	low = src->addr_low;
2405 	src->addr_low = 0xffffffff;
2406 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2407 	wmb();
2408 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2409 	wmb();
2410 	src->addr_low = low;
2411 	dst->addr_low = low;
2412 	wmb();
2413 }
2414 
2415 static int
2416 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2417 {
2418 	bus_dma_segment_t seg;
2419 	struct mbuf *m;
2420 	mxge_rx_ring_t *rx = &ss->rx_small;
2421 	int cnt, err;
2422 
2423 	m = m_gethdr(M_NOWAIT, MT_DATA);
2424 	if (m == NULL) {
2425 		rx->alloc_fail++;
2426 		err = ENOBUFS;
2427 		goto done;
2428 	}
2429 	m->m_len = MHLEN;
2430 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2431 				      &seg, &cnt, BUS_DMA_NOWAIT);
2432 	if (err != 0) {
2433 		m_free(m);
2434 		goto done;
2435 	}
2436 	rx->info[idx].m = m;
2437 	rx->shadow[idx].addr_low =
2438 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2439 	rx->shadow[idx].addr_high =
2440 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2441 
2442 done:
2443 	if ((idx & 7) == 7)
2444 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2445 	return err;
2446 }
2447 
2448 static int
2449 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2450 {
2451 	bus_dma_segment_t seg[3];
2452 	struct mbuf *m;
2453 	mxge_rx_ring_t *rx = &ss->rx_big;
2454 	int cnt, err, i;
2455 
2456 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2457 	if (m == NULL) {
2458 		rx->alloc_fail++;
2459 		err = ENOBUFS;
2460 		goto done;
2461 	}
2462 	m->m_len = rx->mlen;
2463 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2464 				      seg, &cnt, BUS_DMA_NOWAIT);
2465 	if (err != 0) {
2466 		m_free(m);
2467 		goto done;
2468 	}
2469 	rx->info[idx].m = m;
2470 	rx->shadow[idx].addr_low =
2471 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2472 	rx->shadow[idx].addr_high =
2473 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2474 
2475 #if MXGE_VIRT_JUMBOS
2476 	for (i = 1; i < cnt; i++) {
2477 		rx->shadow[idx + i].addr_low =
2478 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2479 		rx->shadow[idx + i].addr_high =
2480 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2481        }
2482 #endif
2483 
2484 done:
2485        for (i = 0; i < rx->nbufs; i++) {
2486 		if ((idx & 7) == 7) {
2487 			mxge_submit_8rx(&rx->lanai[idx - 7],
2488 					&rx->shadow[idx - 7]);
2489 		}
2490 		idx++;
2491 	}
2492 	return err;
2493 }
2494 
2495 #ifdef INET6
2496 
/*
 * Sum 16-bit words starting at 'raw' and fold the total down to 16
 * bits with end-around carry.  'len' is a byte count; one word is
 * consumed per two bytes while the remaining count is positive, so an
 * odd 'len' still consumes the word holding the final byte.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t sum = 0;
	int remaining;
	int pass;

	for (remaining = len; remaining > 0; remaining -= 2)
		sum += *raw++;

	/* two folds are enough to absorb every carry of a 32-bit sum */
	for (pass = 0; pass < 2; pass++)
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)sum;
}
2513 
2514 static inline uint16_t
2515 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2516 {
2517 	uint32_t partial;
2518 	int nxt, cksum_offset;
2519 	struct ip6_hdr *ip6 = p;
2520 	uint16_t c;
2521 
2522 	nxt = ip6->ip6_nxt;
2523 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2524 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2525 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2526 					   IPPROTO_IPV6, &nxt);
2527 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2528 			return (1);
2529 	}
2530 
2531 	/*
2532 	 * IPv6 headers do not contain a checksum, and hence
2533 	 * do not checksum to zero, so they don't "fall out"
2534 	 * of the partial checksum calculation like IPv4
2535 	 * headers do.  We need to fix the partial checksum by
2536 	 * subtracting the checksum of the IPv6 header.
2537 	 */
2538 
2539 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2540 				    ETHER_HDR_LEN);
2541 	csum += ~partial;
2542 	csum +=	 (csum < ~partial);
2543 	csum = (csum >> 16) + (csum & 0xFFFF);
2544 	csum = (csum >> 16) + (csum & 0xFFFF);
2545 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2546 			     csum);
2547 	c ^= 0xffff;
2548 	return (c);
2549 }
2550 #endif /* INET6 */
2551 /*
2552  *  Myri10GE hardware checksums are not valid if the sender
2553  *  padded the frame with non-zero padding.  This is because
2554  *  the firmware just does a simple 16-bit 1s complement
2555  *  checksum across the entire frame, excluding the first 14
2556  *  bytes.  It is best to simply to check the checksum and
2557  *  tell the stack about it only if the checksum is good
2558  */
2559 
2560 static inline uint16_t
2561 mxge_rx_csum(struct mbuf *m, int csum)
2562 {
2563 	struct ether_header *eh;
2564 #ifdef INET
2565 	struct ip *ip;
2566 #endif
2567 #if defined(INET) || defined(INET6)
2568 	int cap = m->m_pkthdr.rcvif->if_capenable;
2569 #endif
2570 	uint16_t c, etype;
2571 
2572 
2573 	eh = mtod(m, struct ether_header *);
2574 	etype = ntohs(eh->ether_type);
2575 	switch (etype) {
2576 #ifdef INET
2577 	case ETHERTYPE_IP:
2578 		if ((cap & IFCAP_RXCSUM) == 0)
2579 			return (1);
2580 		ip = (struct ip *)(eh + 1);
2581 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2582 			return (1);
2583 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2584 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2585 				    (ip->ip_hl << 2) + ip->ip_p));
2586 		c ^= 0xffff;
2587 		break;
2588 #endif
2589 #ifdef INET6
2590 	case ETHERTYPE_IPV6:
2591 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2592 			return (1);
2593 		c = mxge_rx_csum6((eh + 1), m, csum);
2594 		break;
2595 #endif
2596 	default:
2597 		c = 1;
2598 	}
2599 	return (c);
2600 }
2601 
2602 static void
2603 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2604 {
2605 	struct ether_vlan_header *evl;
2606 	struct ether_header *eh;
2607 	uint32_t partial;
2608 
2609 	evl = mtod(m, struct ether_vlan_header *);
2610 	eh = mtod(m, struct ether_header *);
2611 
2612 	/*
2613 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2614 	 * after what the firmware thought was the end of the ethernet
2615 	 * header.
2616 	 */
2617 
2618 	/* put checksum into host byte order */
2619 	*csum = ntohs(*csum);
2620 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2621 	(*csum) += ~partial;
2622 	(*csum) +=  ((*csum) < ~partial);
2623 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625 
2626 	/* restore checksum to network byte order;
2627 	   later consumers expect this */
2628 	*csum = htons(*csum);
2629 
2630 	/* save the tag */
2631 #ifdef MXGE_NEW_VLAN_API
2632 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2633 #else
2634 	{
2635 		struct m_tag *mtag;
2636 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2637 				   M_NOWAIT);
2638 		if (mtag == NULL)
2639 			return;
2640 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2641 		m_tag_prepend(m, mtag);
2642 	}
2643 
2644 #endif
2645 	m->m_flags |= M_VLANTAG;
2646 
2647 	/*
2648 	 * Remove the 802.1q header by copying the Ethernet
2649 	 * addresses over it and adjusting the beginning of
2650 	 * the data in the mbuf.  The encapsulated Ethernet
2651 	 * type field is already in place.
2652 	 */
2653 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2654 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2655 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2656 }
2657 
2658 
2659 static inline void
2660 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2661 		 uint32_t csum, int lro)
2662 {
2663 	mxge_softc_t *sc;
2664 	struct ifnet *ifp;
2665 	struct mbuf *m;
2666 	struct ether_header *eh;
2667 	mxge_rx_ring_t *rx;
2668 	bus_dmamap_t old_map;
2669 	int idx;
2670 
2671 	sc = ss->sc;
2672 	ifp = sc->ifp;
2673 	rx = &ss->rx_big;
2674 	idx = rx->cnt & rx->mask;
2675 	rx->cnt += rx->nbufs;
2676 	/* save a pointer to the received mbuf */
2677 	m = rx->info[idx].m;
2678 	/* try to replace the received mbuf */
2679 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2680 		/* drop the frame -- the old mbuf is re-cycled */
2681 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2682 		return;
2683 	}
2684 
2685 	/* unmap the received buffer */
2686 	old_map = rx->info[idx].map;
2687 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2688 	bus_dmamap_unload(rx->dmat, old_map);
2689 
2690 	/* swap the bus_dmamap_t's */
2691 	rx->info[idx].map = rx->extra_map;
2692 	rx->extra_map = old_map;
2693 
2694 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2695 	 * aligned */
2696 	m->m_data += MXGEFW_PAD;
2697 
2698 	m->m_pkthdr.rcvif = ifp;
2699 	m->m_len = m->m_pkthdr.len = len;
2700 	ss->ipackets++;
2701 	eh = mtod(m, struct ether_header *);
2702 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2703 		mxge_vlan_tag_remove(m, &csum);
2704 	}
2705 	/* if the checksum is valid, mark it in the mbuf header */
2706 
2707 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2708 	    (0 == mxge_rx_csum(m, csum))) {
2709 		/* Tell the stack that the  checksum is good */
2710 		m->m_pkthdr.csum_data = 0xffff;
2711 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2712 			CSUM_DATA_VALID;
2713 
2714 #if defined(INET) || defined (INET6)
2715 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2716 			return;
2717 #endif
2718 	}
2719 	/* flowid only valid if RSS hashing is enabled */
2720 	if (sc->num_slices > 1) {
2721 		m->m_pkthdr.flowid = (ss - sc->ss);
2722 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2723 	}
2724 	/* pass the frame up the stack */
2725 	(*ifp->if_input)(ifp, m);
2726 }
2727 
2728 static inline void
2729 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2730 		   uint32_t csum, int lro)
2731 {
2732 	mxge_softc_t *sc;
2733 	struct ifnet *ifp;
2734 	struct ether_header *eh;
2735 	struct mbuf *m;
2736 	mxge_rx_ring_t *rx;
2737 	bus_dmamap_t old_map;
2738 	int idx;
2739 
2740 	sc = ss->sc;
2741 	ifp = sc->ifp;
2742 	rx = &ss->rx_small;
2743 	idx = rx->cnt & rx->mask;
2744 	rx->cnt++;
2745 	/* save a pointer to the received mbuf */
2746 	m = rx->info[idx].m;
2747 	/* try to replace the received mbuf */
2748 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2749 		/* drop the frame -- the old mbuf is re-cycled */
2750 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2751 		return;
2752 	}
2753 
2754 	/* unmap the received buffer */
2755 	old_map = rx->info[idx].map;
2756 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2757 	bus_dmamap_unload(rx->dmat, old_map);
2758 
2759 	/* swap the bus_dmamap_t's */
2760 	rx->info[idx].map = rx->extra_map;
2761 	rx->extra_map = old_map;
2762 
2763 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2764 	 * aligned */
2765 	m->m_data += MXGEFW_PAD;
2766 
2767 	m->m_pkthdr.rcvif = ifp;
2768 	m->m_len = m->m_pkthdr.len = len;
2769 	ss->ipackets++;
2770 	eh = mtod(m, struct ether_header *);
2771 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2772 		mxge_vlan_tag_remove(m, &csum);
2773 	}
2774 	/* if the checksum is valid, mark it in the mbuf header */
2775 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2776 	    (0 == mxge_rx_csum(m, csum))) {
2777 		/* Tell the stack that the  checksum is good */
2778 		m->m_pkthdr.csum_data = 0xffff;
2779 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2780 			CSUM_DATA_VALID;
2781 
2782 #if defined(INET) || defined (INET6)
2783 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2784 			return;
2785 #endif
2786 	}
2787 	/* flowid only valid if RSS hashing is enabled */
2788 	if (sc->num_slices > 1) {
2789 		m->m_pkthdr.flowid = (ss - sc->ss);
2790 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2791 	}
2792 	/* pass the frame up the stack */
2793 	(*ifp->if_input)(ifp, m);
2794 }
2795 
2796 static inline void
2797 mxge_clean_rx_done(struct mxge_slice_state *ss)
2798 {
2799 	mxge_rx_done_t *rx_done = &ss->rx_done;
2800 	int limit = 0;
2801 	uint16_t length;
2802 	uint16_t checksum;
2803 	int lro;
2804 
2805 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806 	while (rx_done->entry[rx_done->idx].length != 0) {
2807 		length = ntohs(rx_done->entry[rx_done->idx].length);
2808 		rx_done->entry[rx_done->idx].length = 0;
2809 		checksum = rx_done->entry[rx_done->idx].checksum;
2810 		if (length <= (MHLEN - MXGEFW_PAD))
2811 			mxge_rx_done_small(ss, length, checksum, lro);
2812 		else
2813 			mxge_rx_done_big(ss, length, checksum, lro);
2814 		rx_done->cnt++;
2815 		rx_done->idx = rx_done->cnt & rx_done->mask;
2816 
2817 		/* limit potential for livelock */
2818 		if (__predict_false(++limit > rx_done->mask / 2))
2819 			break;
2820 	}
2821 #if defined(INET)  || defined (INET6)
2822 	tcp_lro_flush_all(&ss->lc);
2823 #endif
2824 }
2825 
2826 
2827 static inline void
2828 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2829 {
2830 	struct ifnet *ifp;
2831 	mxge_tx_ring_t *tx;
2832 	struct mbuf *m;
2833 	bus_dmamap_t map;
2834 	int idx;
2835 	int *flags;
2836 
2837 	tx = &ss->tx;
2838 	ifp = ss->sc->ifp;
2839 	while (tx->pkt_done != mcp_idx) {
2840 		idx = tx->done & tx->mask;
2841 		tx->done++;
2842 		m = tx->info[idx].m;
2843 		/* mbuf and DMA map only attached to the first
2844 		   segment per-mbuf */
2845 		if (m != NULL) {
2846 			ss->obytes += m->m_pkthdr.len;
2847 			if (m->m_flags & M_MCAST)
2848 				ss->omcasts++;
2849 			ss->opackets++;
2850 			tx->info[idx].m = NULL;
2851 			map = tx->info[idx].map;
2852 			bus_dmamap_unload(tx->dmat, map);
2853 			m_freem(m);
2854 		}
2855 		if (tx->info[idx].flag) {
2856 			tx->info[idx].flag = 0;
2857 			tx->pkt_done++;
2858 		}
2859 	}
2860 
2861 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2862 	   its OK to send packets */
2863 #ifdef IFNET_BUF_RING
2864 	flags = &ss->if_drv_flags;
2865 #else
2866 	flags = &ifp->if_drv_flags;
2867 #endif
2868 	mtx_lock(&ss->tx.mtx);
2869 	if ((*flags) & IFF_DRV_OACTIVE &&
2870 	    tx->req - tx->done < (tx->mask + 1)/4) {
2871 		*(flags) &= ~IFF_DRV_OACTIVE;
2872 		ss->tx.wake++;
2873 		mxge_start_locked(ss);
2874 	}
2875 #ifdef IFNET_BUF_RING
2876 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2877 		/* let the NIC stop polling this queue, since there
2878 		 * are no more transmits pending */
2879 		if (tx->req == tx->done) {
2880 			*tx->send_stop = 1;
2881 			tx->queue_active = 0;
2882 			tx->deactivate++;
2883 			wmb();
2884 		}
2885 	}
2886 #endif
2887 	mtx_unlock(&ss->tx.mtx);
2888 
2889 }
2890 
2891 static struct mxge_media_type mxge_xfp_media_types[] =
2892 {
2893 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2894 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2895 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2896 	{0,		(1 << 5),	"10GBASE-ER"},
2897 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2898 	{0,		(1 << 3),	"10GBASE-SW"},
2899 	{0,		(1 << 2),	"10GBASE-LW"},
2900 	{0,		(1 << 1),	"10GBASE-EW"},
2901 	{0,		(1 << 0),	"Reserved"}
2902 };
2903 static struct mxge_media_type mxge_sfp_media_types[] =
2904 {
2905 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2906 	{0,		(1 << 7),	"Reserved"},
2907 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2908 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2909 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2910 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2911 };
2912 
2913 static void
2914 mxge_media_set(mxge_softc_t *sc, int media_type)
2915 {
2916 
2917 
2918 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2919 		    0, NULL);
2920 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2921 	sc->current_media = media_type;
2922 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2923 }
2924 
2925 static void
2926 mxge_media_init(mxge_softc_t *sc)
2927 {
2928 	char *ptr;
2929 	int i;
2930 
2931 	ifmedia_removeall(&sc->media);
2932 	mxge_media_set(sc, IFM_AUTO);
2933 
2934 	/*
2935 	 * parse the product code to deterimine the interface type
2936 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2937 	 * after the 3rd dash in the driver's cached copy of the
2938 	 * EEPROM's product code string.
2939 	 */
2940 	ptr = sc->product_code_string;
2941 	if (ptr == NULL) {
2942 		device_printf(sc->dev, "Missing product code\n");
2943 		return;
2944 	}
2945 
2946 	for (i = 0; i < 3; i++, ptr++) {
2947 		ptr = strchr(ptr, '-');
2948 		if (ptr == NULL) {
2949 			device_printf(sc->dev,
2950 				      "only %d dashes in PC?!?\n", i);
2951 			return;
2952 		}
2953 	}
2954 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2955 		/* -C is CX4 */
2956 		sc->connector = MXGE_CX4;
2957 		mxge_media_set(sc, IFM_10G_CX4);
2958 	} else if (*ptr == 'Q') {
2959 		/* -Q is Quad Ribbon Fiber */
2960 		sc->connector = MXGE_QRF;
2961 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2962 		/* FreeBSD has no media type for Quad ribbon fiber */
2963 	} else if (*ptr == 'R') {
2964 		/* -R is XFP */
2965 		sc->connector = MXGE_XFP;
2966 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2967 		/* -S or -2S is SFP+ */
2968 		sc->connector = MXGE_SFP;
2969 	} else {
2970 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2971 	}
2972 }
2973 
2974 /*
2975  * Determine the media type for a NIC.  Some XFPs will identify
2976  * themselves only when their link is up, so this is initiated via a
2977  * link up interrupt.  However, this can potentially take up to
2978  * several milliseconds, so it is run via the watchdog routine, rather
2979  * than in the interrupt handler itself.
2980  */
2981 static void
2982 mxge_media_probe(mxge_softc_t *sc)
2983 {
2984 	mxge_cmd_t cmd;
2985 	char *cage_type;
2986 
2987 	struct mxge_media_type *mxge_media_types = NULL;
2988 	int i, err, ms, mxge_media_type_entries;
2989 	uint32_t byte;
2990 
2991 	sc->need_media_probe = 0;
2992 
2993 	if (sc->connector == MXGE_XFP) {
2994 		/* -R is XFP */
2995 		mxge_media_types = mxge_xfp_media_types;
2996 		mxge_media_type_entries =
2997 			sizeof (mxge_xfp_media_types) /
2998 			sizeof (mxge_xfp_media_types[0]);
2999 		byte = MXGE_XFP_COMPLIANCE_BYTE;
3000 		cage_type = "XFP";
3001 	} else 	if (sc->connector == MXGE_SFP) {
3002 		/* -S or -2S is SFP+ */
3003 		mxge_media_types = mxge_sfp_media_types;
3004 		mxge_media_type_entries =
3005 			sizeof (mxge_sfp_media_types) /
3006 			sizeof (mxge_sfp_media_types[0]);
3007 		cage_type = "SFP+";
3008 		byte = 3;
3009 	} else {
3010 		/* nothing to do; media type cannot change */
3011 		return;
3012 	}
3013 
3014 	/*
3015 	 * At this point we know the NIC has an XFP cage, so now we
3016 	 * try to determine what is in the cage by using the
3017 	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
3018 	 * register.  We read just one byte, which may take over
3019 	 * a millisecond
3020 	 */
3021 
3022 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3023 	cmd.data1 = byte;
3024 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3025 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3026 		device_printf(sc->dev, "failed to read XFP\n");
3027 	}
3028 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3029 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3030 	}
3031 	if (err != MXGEFW_CMD_OK) {
3032 		return;
3033 	}
3034 
3035 	/* now we wait for the data to be cached */
3036 	cmd.data0 = byte;
3037 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3038 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3039 		DELAY(1000);
3040 		cmd.data0 = byte;
3041 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3042 	}
3043 	if (err != MXGEFW_CMD_OK) {
3044 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3045 			      cage_type, err, ms);
3046 		return;
3047 	}
3048 
3049 	if (cmd.data0 == mxge_media_types[0].bitmask) {
3050 		if (mxge_verbose)
3051 			device_printf(sc->dev, "%s:%s\n", cage_type,
3052 				      mxge_media_types[0].name);
3053 		if (sc->current_media != mxge_media_types[0].flag) {
3054 			mxge_media_init(sc);
3055 			mxge_media_set(sc, mxge_media_types[0].flag);
3056 		}
3057 		return;
3058 	}
3059 	for (i = 1; i < mxge_media_type_entries; i++) {
3060 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3061 			if (mxge_verbose)
3062 				device_printf(sc->dev, "%s:%s\n",
3063 					      cage_type,
3064 					      mxge_media_types[i].name);
3065 
3066 			if (sc->current_media != mxge_media_types[i].flag) {
3067 				mxge_media_init(sc);
3068 				mxge_media_set(sc, mxge_media_types[i].flag);
3069 			}
3070 			return;
3071 		}
3072 	}
3073 	if (mxge_verbose)
3074 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3075 			      cage_type, cmd.data0);
3076 
3077 	return;
3078 }
3079 
3080 static void
3081 mxge_intr(void *arg)
3082 {
3083 	struct mxge_slice_state *ss = arg;
3084 	mxge_softc_t *sc = ss->sc;
3085 	mcp_irq_data_t *stats = ss->fw_stats;
3086 	mxge_tx_ring_t *tx = &ss->tx;
3087 	mxge_rx_done_t *rx_done = &ss->rx_done;
3088 	uint32_t send_done_count;
3089 	uint8_t valid;
3090 
3091 
3092 #ifndef IFNET_BUF_RING
3093 	/* an interrupt on a non-zero slice is implicitly valid
3094 	   since MSI-X irqs are not shared */
3095 	if (ss != sc->ss) {
3096 		mxge_clean_rx_done(ss);
3097 		*ss->irq_claim = be32toh(3);
3098 		return;
3099 	}
3100 #endif
3101 
3102 	/* make sure the DMA has finished */
3103 	if (!stats->valid) {
3104 		return;
3105 	}
3106 	valid = stats->valid;
3107 
3108 	if (sc->legacy_irq) {
3109 		/* lower legacy IRQ  */
3110 		*sc->irq_deassert = 0;
3111 		if (!mxge_deassert_wait)
3112 			/* don't wait for conf. that irq is low */
3113 			stats->valid = 0;
3114 	} else {
3115 		stats->valid = 0;
3116 	}
3117 
3118 	/* loop while waiting for legacy irq deassertion */
3119 	do {
3120 		/* check for transmit completes and receives */
3121 		send_done_count = be32toh(stats->send_done_count);
3122 		while ((send_done_count != tx->pkt_done) ||
3123 		       (rx_done->entry[rx_done->idx].length != 0)) {
3124 			if (send_done_count != tx->pkt_done)
3125 				mxge_tx_done(ss, (int)send_done_count);
3126 			mxge_clean_rx_done(ss);
3127 			send_done_count = be32toh(stats->send_done_count);
3128 		}
3129 		if (sc->legacy_irq && mxge_deassert_wait)
3130 			wmb();
3131 	} while (*((volatile uint8_t *) &stats->valid));
3132 
3133 	/* fw link & error stats meaningful only on the first slice */
3134 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3135 		if (sc->link_state != stats->link_up) {
3136 			sc->link_state = stats->link_up;
3137 			if (sc->link_state) {
3138 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3139 				if (mxge_verbose)
3140 					device_printf(sc->dev, "link up\n");
3141 			} else {
3142 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3143 				if (mxge_verbose)
3144 					device_printf(sc->dev, "link down\n");
3145 			}
3146 			sc->need_media_probe = 1;
3147 		}
3148 		if (sc->rdma_tags_available !=
3149 		    be32toh(stats->rdma_tags_available)) {
3150 			sc->rdma_tags_available =
3151 				be32toh(stats->rdma_tags_available);
3152 			device_printf(sc->dev, "RDMA timed out! %d tags "
3153 				      "left\n", sc->rdma_tags_available);
3154 		}
3155 
3156 		if (stats->link_down) {
3157 			sc->down_cnt += stats->link_down;
3158 			sc->link_state = 0;
3159 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3160 		}
3161 	}
3162 
3163 	/* check to see if we have rx token to pass back */
3164 	if (valid & 0x1)
3165 	    *ss->irq_claim = be32toh(3);
3166 	*(ss->irq_claim + 1) = be32toh(3);
3167 }
3168 
3169 static void
3170 mxge_init(void *arg)
3171 {
3172 	mxge_softc_t *sc = arg;
3173 	struct ifnet *ifp = sc->ifp;
3174 
3175 
3176 	mtx_lock(&sc->driver_mtx);
3177 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3178 		(void) mxge_open(sc);
3179 	mtx_unlock(&sc->driver_mtx);
3180 }
3181 
3182 
3183 
3184 static void
3185 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3186 {
3187 	int i;
3188 
3189 #if defined(INET) || defined(INET6)
3190 	tcp_lro_free(&ss->lc);
3191 #endif
3192 	for (i = 0; i <= ss->rx_big.mask; i++) {
3193 		if (ss->rx_big.info[i].m == NULL)
3194 			continue;
3195 		bus_dmamap_unload(ss->rx_big.dmat,
3196 				  ss->rx_big.info[i].map);
3197 		m_freem(ss->rx_big.info[i].m);
3198 		ss->rx_big.info[i].m = NULL;
3199 	}
3200 
3201 	for (i = 0; i <= ss->rx_small.mask; i++) {
3202 		if (ss->rx_small.info[i].m == NULL)
3203 			continue;
3204 		bus_dmamap_unload(ss->rx_small.dmat,
3205 				  ss->rx_small.info[i].map);
3206 		m_freem(ss->rx_small.info[i].m);
3207 		ss->rx_small.info[i].m = NULL;
3208 	}
3209 
3210 	/* transmit ring used only on the first slice */
3211 	if (ss->tx.info == NULL)
3212 		return;
3213 
3214 	for (i = 0; i <= ss->tx.mask; i++) {
3215 		ss->tx.info[i].flag = 0;
3216 		if (ss->tx.info[i].m == NULL)
3217 			continue;
3218 		bus_dmamap_unload(ss->tx.dmat,
3219 				  ss->tx.info[i].map);
3220 		m_freem(ss->tx.info[i].m);
3221 		ss->tx.info[i].m = NULL;
3222 	}
3223 }
3224 
3225 static void
3226 mxge_free_mbufs(mxge_softc_t *sc)
3227 {
3228 	int slice;
3229 
3230 	for (slice = 0; slice < sc->num_slices; slice++)
3231 		mxge_free_slice_mbufs(&sc->ss[slice]);
3232 }
3233 
3234 static void
3235 mxge_free_slice_rings(struct mxge_slice_state *ss)
3236 {
3237 	int i;
3238 
3239 
3240 	if (ss->rx_done.entry != NULL)
3241 		mxge_dma_free(&ss->rx_done.dma);
3242 	ss->rx_done.entry = NULL;
3243 
3244 	if (ss->tx.req_bytes != NULL)
3245 		free(ss->tx.req_bytes, M_DEVBUF);
3246 	ss->tx.req_bytes = NULL;
3247 
3248 	if (ss->tx.seg_list != NULL)
3249 		free(ss->tx.seg_list, M_DEVBUF);
3250 	ss->tx.seg_list = NULL;
3251 
3252 	if (ss->rx_small.shadow != NULL)
3253 		free(ss->rx_small.shadow, M_DEVBUF);
3254 	ss->rx_small.shadow = NULL;
3255 
3256 	if (ss->rx_big.shadow != NULL)
3257 		free(ss->rx_big.shadow, M_DEVBUF);
3258 	ss->rx_big.shadow = NULL;
3259 
3260 	if (ss->tx.info != NULL) {
3261 		if (ss->tx.dmat != NULL) {
3262 			for (i = 0; i <= ss->tx.mask; i++) {
3263 				bus_dmamap_destroy(ss->tx.dmat,
3264 						   ss->tx.info[i].map);
3265 			}
3266 			bus_dma_tag_destroy(ss->tx.dmat);
3267 		}
3268 		free(ss->tx.info, M_DEVBUF);
3269 	}
3270 	ss->tx.info = NULL;
3271 
3272 	if (ss->rx_small.info != NULL) {
3273 		if (ss->rx_small.dmat != NULL) {
3274 			for (i = 0; i <= ss->rx_small.mask; i++) {
3275 				bus_dmamap_destroy(ss->rx_small.dmat,
3276 						   ss->rx_small.info[i].map);
3277 			}
3278 			bus_dmamap_destroy(ss->rx_small.dmat,
3279 					   ss->rx_small.extra_map);
3280 			bus_dma_tag_destroy(ss->rx_small.dmat);
3281 		}
3282 		free(ss->rx_small.info, M_DEVBUF);
3283 	}
3284 	ss->rx_small.info = NULL;
3285 
3286 	if (ss->rx_big.info != NULL) {
3287 		if (ss->rx_big.dmat != NULL) {
3288 			for (i = 0; i <= ss->rx_big.mask; i++) {
3289 				bus_dmamap_destroy(ss->rx_big.dmat,
3290 						   ss->rx_big.info[i].map);
3291 			}
3292 			bus_dmamap_destroy(ss->rx_big.dmat,
3293 					   ss->rx_big.extra_map);
3294 			bus_dma_tag_destroy(ss->rx_big.dmat);
3295 		}
3296 		free(ss->rx_big.info, M_DEVBUF);
3297 	}
3298 	ss->rx_big.info = NULL;
3299 }
3300 
3301 static void
3302 mxge_free_rings(mxge_softc_t *sc)
3303 {
3304 	int slice;
3305 
3306 	for (slice = 0; slice < sc->num_slices; slice++)
3307 		mxge_free_slice_rings(&sc->ss[slice]);
3308 }
3309 
3310 static int
3311 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3312 		       int tx_ring_entries)
3313 {
3314 	mxge_softc_t *sc = ss->sc;
3315 	size_t bytes;
3316 	int err, i;
3317 
3318 	/* allocate per-slice receive resources */
3319 
3320 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3321 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3322 
3323 	/* allocate the rx shadow rings */
3324 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3325 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3326 
3327 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3328 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3329 
3330 	/* allocate the rx host info rings */
3331 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3332 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3333 
3334 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3335 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3336 
3337 	/* allocate the rx busdma resources */
3338 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3339 				 1,			/* alignment */
3340 				 4096,			/* boundary */
3341 				 BUS_SPACE_MAXADDR,	/* low */
3342 				 BUS_SPACE_MAXADDR,	/* high */
3343 				 NULL, NULL,		/* filter */
3344 				 MHLEN,			/* maxsize */
3345 				 1,			/* num segs */
3346 				 MHLEN,			/* maxsegsize */
3347 				 BUS_DMA_ALLOCNOW,	/* flags */
3348 				 NULL, NULL,		/* lock */
3349 				 &ss->rx_small.dmat);	/* tag */
3350 	if (err != 0) {
3351 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3352 			      err);
3353 		return err;
3354 	}
3355 
3356 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3357 				 1,			/* alignment */
3358 #if MXGE_VIRT_JUMBOS
3359 				 4096,			/* boundary */
3360 #else
3361 				 0,			/* boundary */
3362 #endif
3363 				 BUS_SPACE_MAXADDR,	/* low */
3364 				 BUS_SPACE_MAXADDR,	/* high */
3365 				 NULL, NULL,		/* filter */
3366 				 3*4096,		/* maxsize */
3367 #if MXGE_VIRT_JUMBOS
3368 				 3,			/* num segs */
3369 				 4096,			/* maxsegsize*/
3370 #else
3371 				 1,			/* num segs */
3372 				 MJUM9BYTES,		/* maxsegsize*/
3373 #endif
3374 				 BUS_DMA_ALLOCNOW,	/* flags */
3375 				 NULL, NULL,		/* lock */
3376 				 &ss->rx_big.dmat);	/* tag */
3377 	if (err != 0) {
3378 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3379 			      err);
3380 		return err;
3381 	}
3382 	for (i = 0; i <= ss->rx_small.mask; i++) {
3383 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3384 					&ss->rx_small.info[i].map);
3385 		if (err != 0) {
3386 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3387 				      err);
3388 			return err;
3389 		}
3390 	}
3391 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3392 				&ss->rx_small.extra_map);
3393 	if (err != 0) {
3394 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3395 			      err);
3396 		return err;
3397 	}
3398 
3399 	for (i = 0; i <= ss->rx_big.mask; i++) {
3400 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3401 					&ss->rx_big.info[i].map);
3402 		if (err != 0) {
3403 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3404 				      err);
3405 			return err;
3406 		}
3407 	}
3408 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3409 				&ss->rx_big.extra_map);
3410 	if (err != 0) {
3411 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3412 			      err);
3413 		return err;
3414 	}
3415 
3416 	/* now allocate TX resources */
3417 
3418 #ifndef IFNET_BUF_RING
3419 	/* only use a single TX ring for now */
3420 	if (ss != ss->sc->ss)
3421 		return 0;
3422 #endif
3423 
3424 	ss->tx.mask = tx_ring_entries - 1;
3425 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3426 
3427 
3428 	/* allocate the tx request copy block */
3429 	bytes = 8 +
3430 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3431 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3432 	/* ensure req_list entries are aligned to 8 bytes */
3433 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3434 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3435 
3436 	/* allocate the tx busdma segment list */
3437 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3438 	ss->tx.seg_list = (bus_dma_segment_t *)
3439 		malloc(bytes, M_DEVBUF, M_WAITOK);
3440 
3441 	/* allocate the tx host info ring */
3442 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3443 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3444 
3445 	/* allocate the tx busdma resources */
3446 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3447 				 1,			/* alignment */
3448 				 sc->tx_boundary,	/* boundary */
3449 				 BUS_SPACE_MAXADDR,	/* low */
3450 				 BUS_SPACE_MAXADDR,	/* high */
3451 				 NULL, NULL,		/* filter */
3452 				 65536 + 256,		/* maxsize */
3453 				 ss->tx.max_desc - 2,	/* num segs */
3454 				 sc->tx_boundary,	/* maxsegsz */
3455 				 BUS_DMA_ALLOCNOW,	/* flags */
3456 				 NULL, NULL,		/* lock */
3457 				 &ss->tx.dmat);		/* tag */
3458 
3459 	if (err != 0) {
3460 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3461 			      err);
3462 		return err;
3463 	}
3464 
3465 	/* now use these tags to setup dmamaps for each slot
3466 	   in the ring */
3467 	for (i = 0; i <= ss->tx.mask; i++) {
3468 		err = bus_dmamap_create(ss->tx.dmat, 0,
3469 					&ss->tx.info[i].map);
3470 		if (err != 0) {
3471 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3472 				      err);
3473 			return err;
3474 		}
3475 	}
3476 	return 0;
3477 
3478 }
3479 
3480 static int
3481 mxge_alloc_rings(mxge_softc_t *sc)
3482 {
3483 	mxge_cmd_t cmd;
3484 	int tx_ring_size;
3485 	int tx_ring_entries, rx_ring_entries;
3486 	int err, slice;
3487 
3488 	/* get ring sizes */
3489 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3490 	tx_ring_size = cmd.data0;
3491 	if (err != 0) {
3492 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3493 		goto abort;
3494 	}
3495 
3496 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3497 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3498 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3499 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3500 	IFQ_SET_READY(&sc->ifp->if_snd);
3501 
3502 	for (slice = 0; slice < sc->num_slices; slice++) {
3503 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3504 					     rx_ring_entries,
3505 					     tx_ring_entries);
3506 		if (err != 0)
3507 			goto abort;
3508 	}
3509 	return 0;
3510 
3511 abort:
3512 	mxge_free_rings(sc);
3513 	return err;
3514 
3515 }
3516 
3517 
3518 static void
3519 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3520 {
3521 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3522 
3523 	if (bufsize < MCLBYTES) {
3524 		/* easy, everything fits in a single buffer */
3525 		*big_buf_size = MCLBYTES;
3526 		*cl_size = MCLBYTES;
3527 		*nbufs = 1;
3528 		return;
3529 	}
3530 
3531 	if (bufsize < MJUMPAGESIZE) {
3532 		/* still easy, everything still fits in a single buffer */
3533 		*big_buf_size = MJUMPAGESIZE;
3534 		*cl_size = MJUMPAGESIZE;
3535 		*nbufs = 1;
3536 		return;
3537 	}
3538 #if MXGE_VIRT_JUMBOS
3539 	/* now we need to use virtually contiguous buffers */
3540 	*cl_size = MJUM9BYTES;
3541 	*big_buf_size = 4096;
3542 	*nbufs = mtu / 4096 + 1;
3543 	/* needs to be a power of two, so round up */
3544 	if (*nbufs == 3)
3545 		*nbufs = 4;
3546 #else
3547 	*cl_size = MJUM9BYTES;
3548 	*big_buf_size = MJUM9BYTES;
3549 	*nbufs = 1;
3550 #endif
3551 }
3552 
3553 static int
3554 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3555 {
3556 	mxge_softc_t *sc;
3557 	mxge_cmd_t cmd;
3558 	bus_dmamap_t map;
3559 	int err, i, slice;
3560 
3561 
3562 	sc = ss->sc;
3563 	slice = ss - sc->ss;
3564 
3565 #if defined(INET) || defined(INET6)
3566 	(void)tcp_lro_init(&ss->lc);
3567 #endif
3568 	ss->lc.ifp = sc->ifp;
3569 
3570 	/* get the lanai pointers to the send and receive rings */
3571 
3572 	err = 0;
3573 #ifndef IFNET_BUF_RING
3574 	/* We currently only send from the first slice */
3575 	if (slice == 0) {
3576 #endif
3577 		cmd.data0 = slice;
3578 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3579 		ss->tx.lanai =
3580 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3581 		ss->tx.send_go = (volatile uint32_t *)
3582 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3583 		ss->tx.send_stop = (volatile uint32_t *)
3584 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3585 #ifndef IFNET_BUF_RING
3586 	}
3587 #endif
3588 	cmd.data0 = slice;
3589 	err |= mxge_send_cmd(sc,
3590 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3591 	ss->rx_small.lanai =
3592 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3593 	cmd.data0 = slice;
3594 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3595 	ss->rx_big.lanai =
3596 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3597 
3598 	if (err != 0) {
3599 		device_printf(sc->dev,
3600 			      "failed to get ring sizes or locations\n");
3601 		return EIO;
3602 	}
3603 
3604 	/* stock receive rings */
3605 	for (i = 0; i <= ss->rx_small.mask; i++) {
3606 		map = ss->rx_small.info[i].map;
3607 		err = mxge_get_buf_small(ss, map, i);
3608 		if (err) {
3609 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3610 				      i, ss->rx_small.mask + 1);
3611 			return ENOMEM;
3612 		}
3613 	}
3614 	for (i = 0; i <= ss->rx_big.mask; i++) {
3615 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3616 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3617 	}
3618 	ss->rx_big.nbufs = nbufs;
3619 	ss->rx_big.cl_size = cl_size;
3620 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3621 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3622 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3623 		map = ss->rx_big.info[i].map;
3624 		err = mxge_get_buf_big(ss, map, i);
3625 		if (err) {
3626 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3627 				      i, ss->rx_big.mask + 1);
3628 			return ENOMEM;
3629 		}
3630 	}
3631 	return 0;
3632 }
3633 
3634 static int
3635 mxge_open(mxge_softc_t *sc)
3636 {
3637 	mxge_cmd_t cmd;
3638 	int err, big_bytes, nbufs, slice, cl_size, i;
3639 	bus_addr_t bus;
3640 	volatile uint8_t *itable;
3641 	struct mxge_slice_state *ss;
3642 
3643 	/* Copy the MAC address in case it was overridden */
3644 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3645 
3646 	err = mxge_reset(sc, 1);
3647 	if (err != 0) {
3648 		device_printf(sc->dev, "failed to reset\n");
3649 		return EIO;
3650 	}
3651 
3652 	if (sc->num_slices > 1) {
3653 		/* setup the indirection table */
3654 		cmd.data0 = sc->num_slices;
3655 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3656 				    &cmd);
3657 
3658 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3659 				     &cmd);
3660 		if (err != 0) {
3661 			device_printf(sc->dev,
3662 				      "failed to setup rss tables\n");
3663 			return err;
3664 		}
3665 
3666 		/* just enable an identity mapping */
3667 		itable = sc->sram + cmd.data0;
3668 		for (i = 0; i < sc->num_slices; i++)
3669 			itable[i] = (uint8_t)i;
3670 
3671 		cmd.data0 = 1;
3672 		cmd.data1 = mxge_rss_hash_type;
3673 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3674 		if (err != 0) {
3675 			device_printf(sc->dev, "failed to enable slices\n");
3676 			return err;
3677 		}
3678 	}
3679 
3680 
3681 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3682 
3683 	cmd.data0 = nbufs;
3684 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3685 			    &cmd);
3686 	/* error is only meaningful if we're trying to set
3687 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3688 	if (err && nbufs > 1) {
3689 		device_printf(sc->dev,
3690 			      "Failed to set alway-use-n to %d\n",
3691 			      nbufs);
3692 		return EIO;
3693 	}
3694 	/* Give the firmware the mtu and the big and small buffer
3695 	   sizes.  The firmware wants the big buf size to be a power
3696 	   of two. Luckily, FreeBSD's clusters are powers of two */
3697 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3698 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3699 	cmd.data0 = MHLEN - MXGEFW_PAD;
3700 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3701 			     &cmd);
3702 	cmd.data0 = big_bytes;
3703 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3704 
3705 	if (err != 0) {
3706 		device_printf(sc->dev, "failed to setup params\n");
3707 		goto abort;
3708 	}
3709 
3710 	/* Now give him the pointer to the stats block */
3711 	for (slice = 0;
3712 #ifdef IFNET_BUF_RING
3713 	     slice < sc->num_slices;
3714 #else
3715 	     slice < 1;
3716 #endif
3717 	     slice++) {
3718 		ss = &sc->ss[slice];
3719 		cmd.data0 =
3720 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3721 		cmd.data1 =
3722 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3723 		cmd.data2 = sizeof(struct mcp_irq_data);
3724 		cmd.data2 |= (slice << 16);
3725 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3726 	}
3727 
3728 	if (err != 0) {
3729 		bus = sc->ss->fw_stats_dma.bus_addr;
3730 		bus += offsetof(struct mcp_irq_data, send_done_count);
3731 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3732 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3733 		err = mxge_send_cmd(sc,
3734 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3735 				    &cmd);
3736 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3737 		sc->fw_multicast_support = 0;
3738 	} else {
3739 		sc->fw_multicast_support = 1;
3740 	}
3741 
3742 	if (err != 0) {
3743 		device_printf(sc->dev, "failed to setup params\n");
3744 		goto abort;
3745 	}
3746 
3747 	for (slice = 0; slice < sc->num_slices; slice++) {
3748 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3749 		if (err != 0) {
3750 			device_printf(sc->dev, "couldn't open slice %d\n",
3751 				      slice);
3752 			goto abort;
3753 		}
3754 	}
3755 
3756 	/* Finally, start the firmware running */
3757 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3758 	if (err) {
3759 		device_printf(sc->dev, "Couldn't bring up link\n");
3760 		goto abort;
3761 	}
3762 #ifdef IFNET_BUF_RING
3763 	for (slice = 0; slice < sc->num_slices; slice++) {
3764 		ss = &sc->ss[slice];
3765 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3766 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3767 	}
3768 #endif
3769 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3770 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3771 
3772 	return 0;
3773 
3774 
3775 abort:
3776 	mxge_free_mbufs(sc);
3777 
3778 	return err;
3779 }
3780 
3781 static int
3782 mxge_close(mxge_softc_t *sc, int down)
3783 {
3784 	mxge_cmd_t cmd;
3785 	int err, old_down_cnt;
3786 #ifdef IFNET_BUF_RING
3787 	struct mxge_slice_state *ss;
3788 	int slice;
3789 #endif
3790 
3791 #ifdef IFNET_BUF_RING
3792 	for (slice = 0; slice < sc->num_slices; slice++) {
3793 		ss = &sc->ss[slice];
3794 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3795 	}
3796 #endif
3797 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3798 	if (!down) {
3799 		old_down_cnt = sc->down_cnt;
3800 		wmb();
3801 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3802 		if (err) {
3803 			device_printf(sc->dev,
3804 				      "Couldn't bring down link\n");
3805 		}
3806 		if (old_down_cnt == sc->down_cnt) {
3807 			/* wait for down irq */
3808 			DELAY(10 * sc->intr_coal_delay);
3809 		}
3810 		wmb();
3811 		if (old_down_cnt == sc->down_cnt) {
3812 			device_printf(sc->dev, "never got down irq\n");
3813 		}
3814 	}
3815 	mxge_free_mbufs(sc);
3816 
3817 	return 0;
3818 }
3819 
3820 static void
3821 mxge_setup_cfg_space(mxge_softc_t *sc)
3822 {
3823 	device_t dev = sc->dev;
3824 	int reg;
3825 	uint16_t lnk, pectl;
3826 
3827 	/* find the PCIe link width and set max read request to 4KB*/
3828 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3829 		lnk = pci_read_config(dev, reg + 0x12, 2);
3830 		sc->link_width = (lnk >> 4) & 0x3f;
3831 
3832 		if (sc->pectl == 0) {
3833 			pectl = pci_read_config(dev, reg + 0x8, 2);
3834 			pectl = (pectl & ~0x7000) | (5 << 12);
3835 			pci_write_config(dev, reg + 0x8, pectl, 2);
3836 			sc->pectl = pectl;
3837 		} else {
3838 			/* restore saved pectl after watchdog reset */
3839 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3840 		}
3841 	}
3842 
3843 	/* Enable DMA and Memory space access */
3844 	pci_enable_busmaster(dev);
3845 }
3846 
3847 static uint32_t
3848 mxge_read_reboot(mxge_softc_t *sc)
3849 {
3850 	device_t dev = sc->dev;
3851 	uint32_t vs;
3852 
3853 	/* find the vendor specific offset */
3854 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3855 		device_printf(sc->dev,
3856 			      "could not find vendor specific offset\n");
3857 		return (uint32_t)-1;
3858 	}
3859 	/* enable read32 mode */
3860 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3861 	/* tell NIC which register to read */
3862 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3863 	return (pci_read_config(dev, vs + 0x14, 4));
3864 }
3865 
3866 static void
3867 mxge_watchdog_reset(mxge_softc_t *sc)
3868 {
3869 	struct pci_devinfo *dinfo;
3870 	struct mxge_slice_state *ss;
3871 	int err, running, s, num_tx_slices = 1;
3872 	uint32_t reboot;
3873 	uint16_t cmd;
3874 
3875 	err = ENXIO;
3876 
3877 	device_printf(sc->dev, "Watchdog reset!\n");
3878 
3879 	/*
3880 	 * check to see if the NIC rebooted.  If it did, then all of
3881 	 * PCI config space has been reset, and things like the
3882 	 * busmaster bit will be zero.  If this is the case, then we
3883 	 * must restore PCI config space before the NIC can be used
3884 	 * again
3885 	 */
3886 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3887 	if (cmd == 0xffff) {
3888 		/*
3889 		 * maybe the watchdog caught the NIC rebooting; wait
3890 		 * up to 100ms for it to finish.  If it does not come
3891 		 * back, then give up
3892 		 */
3893 		DELAY(1000*100);
3894 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3895 		if (cmd == 0xffff) {
3896 			device_printf(sc->dev, "NIC disappeared!\n");
3897 		}
3898 	}
3899 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3900 		/* print the reboot status */
3901 		reboot = mxge_read_reboot(sc);
3902 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3903 			      reboot);
3904 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3905 		if (running) {
3906 
3907 			/*
3908 			 * quiesce NIC so that TX routines will not try to
3909 			 * xmit after restoration of BAR
3910 			 */
3911 
3912 			/* Mark the link as down */
3913 			if (sc->link_state) {
3914 				sc->link_state = 0;
3915 				if_link_state_change(sc->ifp,
3916 						     LINK_STATE_DOWN);
3917 			}
3918 #ifdef IFNET_BUF_RING
3919 			num_tx_slices = sc->num_slices;
3920 #endif
3921 			/* grab all TX locks to ensure no tx  */
3922 			for (s = 0; s < num_tx_slices; s++) {
3923 				ss = &sc->ss[s];
3924 				mtx_lock(&ss->tx.mtx);
3925 			}
3926 			mxge_close(sc, 1);
3927 		}
3928 		/* restore PCI configuration space */
3929 		dinfo = device_get_ivars(sc->dev);
3930 		pci_cfg_restore(sc->dev, dinfo);
3931 
3932 		/* and redo any changes we made to our config space */
3933 		mxge_setup_cfg_space(sc);
3934 
3935 		/* reload f/w */
3936 		err = mxge_load_firmware(sc, 0);
3937 		if (err) {
3938 			device_printf(sc->dev,
3939 				      "Unable to re-load f/w\n");
3940 		}
3941 		if (running) {
3942 			if (!err)
3943 				err = mxge_open(sc);
3944 			/* release all TX locks */
3945 			for (s = 0; s < num_tx_slices; s++) {
3946 				ss = &sc->ss[s];
3947 #ifdef IFNET_BUF_RING
3948 				mxge_start_locked(ss);
3949 #endif
3950 				mtx_unlock(&ss->tx.mtx);
3951 			}
3952 		}
3953 		sc->watchdog_resets++;
3954 	} else {
3955 		device_printf(sc->dev,
3956 			      "NIC did not reboot, not resetting\n");
3957 		err = 0;
3958 	}
3959 	if (err) {
3960 		device_printf(sc->dev, "watchdog reset failed\n");
3961 	} else {
3962 		if (sc->dying == 2)
3963 			sc->dying = 0;
3964 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3965 	}
3966 }
3967 
3968 static void
3969 mxge_watchdog_task(void *arg, int pending)
3970 {
3971 	mxge_softc_t *sc = arg;
3972 
3973 
3974 	mtx_lock(&sc->driver_mtx);
3975 	mxge_watchdog_reset(sc);
3976 	mtx_unlock(&sc->driver_mtx);
3977 }
3978 
3979 static void
3980 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3981 {
3982 	tx = &sc->ss[slice].tx;
3983 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3984 	device_printf(sc->dev,
3985 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3986 		      tx->req, tx->done, tx->queue_active);
3987 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3988 			      tx->activate, tx->deactivate);
3989 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3990 		      tx->pkt_done,
3991 		      be32toh(sc->ss->fw_stats->send_done_count));
3992 }
3993 
3994 static int
3995 mxge_watchdog(mxge_softc_t *sc)
3996 {
3997 	mxge_tx_ring_t *tx;
3998 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3999 	int i, err = 0;
4000 
4001 	/* see if we have outstanding transmits, which
4002 	   have been pending for more than mxge_ticks */
4003 	for (i = 0;
4004 #ifdef IFNET_BUF_RING
4005 	     (i < sc->num_slices) && (err == 0);
4006 #else
4007 	     (i < 1) && (err == 0);
4008 #endif
4009 	     i++) {
4010 		tx = &sc->ss[i].tx;
4011 		if (tx->req != tx->done &&
4012 		    tx->watchdog_req != tx->watchdog_done &&
4013 		    tx->done == tx->watchdog_done) {
4014 			/* check for pause blocking before resetting */
4015 			if (tx->watchdog_rx_pause == rx_pause) {
4016 				mxge_warn_stuck(sc, tx, i);
4017 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4018 				return (ENXIO);
4019 			}
4020 			else
4021 				device_printf(sc->dev, "Flow control blocking "
4022 					      "xmits, check link partner\n");
4023 		}
4024 
4025 		tx->watchdog_req = tx->req;
4026 		tx->watchdog_done = tx->done;
4027 		tx->watchdog_rx_pause = rx_pause;
4028 	}
4029 
4030 	if (sc->need_media_probe)
4031 		mxge_media_probe(sc);
4032 	return (err);
4033 }
4034 
4035 static uint64_t
4036 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4037 {
4038 	struct mxge_softc *sc;
4039 	uint64_t rv;
4040 
4041 	sc = if_getsoftc(ifp);
4042 	rv = 0;
4043 
4044 	switch (cnt) {
4045 	case IFCOUNTER_IPACKETS:
4046 		for (int s = 0; s < sc->num_slices; s++)
4047 			rv += sc->ss[s].ipackets;
4048 		return (rv);
4049 	case IFCOUNTER_OPACKETS:
4050 		for (int s = 0; s < sc->num_slices; s++)
4051 			rv += sc->ss[s].opackets;
4052 		return (rv);
4053 	case IFCOUNTER_OERRORS:
4054 		for (int s = 0; s < sc->num_slices; s++)
4055 			rv += sc->ss[s].oerrors;
4056 		return (rv);
4057 #ifdef IFNET_BUF_RING
4058 	case IFCOUNTER_OBYTES:
4059 		for (int s = 0; s < sc->num_slices; s++)
4060 			rv += sc->ss[s].obytes;
4061 		return (rv);
4062 	case IFCOUNTER_OMCASTS:
4063 		for (int s = 0; s < sc->num_slices; s++)
4064 			rv += sc->ss[s].omcasts;
4065 		return (rv);
4066 	case IFCOUNTER_OQDROPS:
4067 		for (int s = 0; s < sc->num_slices; s++)
4068 			rv += sc->ss[s].tx.br->br_drops;
4069 		return (rv);
4070 #endif
4071 	default:
4072 		return (if_get_counter_default(ifp, cnt));
4073 	}
4074 }
4075 
4076 static void
4077 mxge_tick(void *arg)
4078 {
4079 	mxge_softc_t *sc = arg;
4080 	u_long pkts = 0;
4081 	int err = 0;
4082 	int running, ticks;
4083 	uint16_t cmd;
4084 
4085 	ticks = mxge_ticks;
4086 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4087 	if (running) {
4088 		if (!sc->watchdog_countdown) {
4089 			err = mxge_watchdog(sc);
4090 			sc->watchdog_countdown = 4;
4091 		}
4092 		sc->watchdog_countdown--;
4093 	}
4094 	if (pkts == 0) {
4095 		/* ensure NIC did not suffer h/w fault while idle */
4096 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4097 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4098 			sc->dying = 2;
4099 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4100 			err = ENXIO;
4101 		}
4102 		/* look less often if NIC is idle */
4103 		ticks *= 4;
4104 	}
4105 
4106 	if (err == 0)
4107 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4108 
4109 }
4110 
4111 static int
4112 mxge_media_change(struct ifnet *ifp)
4113 {
4114 	return EINVAL;
4115 }
4116 
4117 static int
4118 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4119 {
4120 	struct ifnet *ifp = sc->ifp;
4121 	int real_mtu, old_mtu;
4122 	int err = 0;
4123 
4124 
4125 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4126 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4127 		return EINVAL;
4128 	mtx_lock(&sc->driver_mtx);
4129 	old_mtu = ifp->if_mtu;
4130 	ifp->if_mtu = mtu;
4131 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4132 		mxge_close(sc, 0);
4133 		err = mxge_open(sc);
4134 		if (err != 0) {
4135 			ifp->if_mtu = old_mtu;
4136 			mxge_close(sc, 0);
4137 			(void) mxge_open(sc);
4138 		}
4139 	}
4140 	mtx_unlock(&sc->driver_mtx);
4141 	return err;
4142 }
4143 
4144 static void
4145 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4146 {
4147 	mxge_softc_t *sc = ifp->if_softc;
4148 
4149 
4150 	if (sc == NULL)
4151 		return;
4152 	ifmr->ifm_status = IFM_AVALID;
4153 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4154 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4155 	ifmr->ifm_active |= sc->current_media;
4156 }
4157 
4158 static int
4159 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4160 {
4161 	mxge_softc_t *sc = ifp->if_softc;
4162 	struct ifreq *ifr = (struct ifreq *)data;
4163 	int err, mask;
4164 
4165 	err = 0;
4166 	switch (command) {
4167 	case SIOCSIFADDR:
4168 	case SIOCGIFADDR:
4169 		err = ether_ioctl(ifp, command, data);
4170 		break;
4171 
4172 	case SIOCSIFMTU:
4173 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4174 		break;
4175 
4176 	case SIOCSIFFLAGS:
4177 		mtx_lock(&sc->driver_mtx);
4178 		if (sc->dying) {
4179 			mtx_unlock(&sc->driver_mtx);
4180 			return EINVAL;
4181 		}
4182 		if (ifp->if_flags & IFF_UP) {
4183 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4184 				err = mxge_open(sc);
4185 			} else {
4186 				/* take care of promis can allmulti
4187 				   flag chages */
4188 				mxge_change_promisc(sc,
4189 						    ifp->if_flags & IFF_PROMISC);
4190 				mxge_set_multicast_list(sc);
4191 			}
4192 		} else {
4193 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4194 				mxge_close(sc, 0);
4195 			}
4196 		}
4197 		mtx_unlock(&sc->driver_mtx);
4198 		break;
4199 
4200 	case SIOCADDMULTI:
4201 	case SIOCDELMULTI:
4202 		mtx_lock(&sc->driver_mtx);
4203 		mxge_set_multicast_list(sc);
4204 		mtx_unlock(&sc->driver_mtx);
4205 		break;
4206 
4207 	case SIOCSIFCAP:
4208 		mtx_lock(&sc->driver_mtx);
4209 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4210 		if (mask & IFCAP_TXCSUM) {
4211 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4212 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4213 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4214 			} else {
4215 				ifp->if_capenable |= IFCAP_TXCSUM;
4216 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4217 			}
4218 		} else if (mask & IFCAP_RXCSUM) {
4219 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4220 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4221 			} else {
4222 				ifp->if_capenable |= IFCAP_RXCSUM;
4223 			}
4224 		}
4225 		if (mask & IFCAP_TSO4) {
4226 			if (IFCAP_TSO4 & ifp->if_capenable) {
4227 				ifp->if_capenable &= ~IFCAP_TSO4;
4228 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4229 				ifp->if_capenable |= IFCAP_TSO4;
4230 				ifp->if_hwassist |= CSUM_TSO;
4231 			} else {
4232 				printf("mxge requires tx checksum offload"
4233 				       " be enabled to use TSO\n");
4234 				err = EINVAL;
4235 			}
4236 		}
4237 #if IFCAP_TSO6
4238 		if (mask & IFCAP_TXCSUM_IPV6) {
4239 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4240 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4241 						       | IFCAP_TSO6);
4242 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4243 						      | CSUM_UDP);
4244 			} else {
4245 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4246 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4247 						     | CSUM_UDP_IPV6);
4248 			}
4249 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4250 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4251 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4252 			} else {
4253 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4254 			}
4255 		}
4256 		if (mask & IFCAP_TSO6) {
4257 			if (IFCAP_TSO6 & ifp->if_capenable) {
4258 				ifp->if_capenable &= ~IFCAP_TSO6;
4259 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4260 				ifp->if_capenable |= IFCAP_TSO6;
4261 				ifp->if_hwassist |= CSUM_TSO;
4262 			} else {
4263 				printf("mxge requires tx checksum offload"
4264 				       " be enabled to use TSO\n");
4265 				err = EINVAL;
4266 			}
4267 		}
4268 #endif /*IFCAP_TSO6 */
4269 
4270 		if (mask & IFCAP_LRO)
4271 			ifp->if_capenable ^= IFCAP_LRO;
4272 		if (mask & IFCAP_VLAN_HWTAGGING)
4273 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4274 		if (mask & IFCAP_VLAN_HWTSO)
4275 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4276 
4277 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4278 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4279 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4280 
4281 		mtx_unlock(&sc->driver_mtx);
4282 		VLAN_CAPABILITIES(ifp);
4283 
4284 		break;
4285 
4286 	case SIOCGIFMEDIA:
4287 		mtx_lock(&sc->driver_mtx);
4288 		mxge_media_probe(sc);
4289 		mtx_unlock(&sc->driver_mtx);
4290 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4291 				    &sc->media, command);
4292 		break;
4293 
4294 	default:
4295 		err = ENOTTY;
4296 	}
4297 	return err;
4298 }
4299 
4300 static void
4301 mxge_fetch_tunables(mxge_softc_t *sc)
4302 {
4303 
4304 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4305 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4306 			  &mxge_flow_control);
4307 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4308 			  &mxge_intr_coal_delay);
4309 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4310 			  &mxge_nvidia_ecrc_enable);
4311 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4312 			  &mxge_force_firmware);
4313 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4314 			  &mxge_deassert_wait);
4315 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4316 			  &mxge_verbose);
4317 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4318 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4319 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4320 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4321 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4322 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4323 
4324 	if (bootverbose)
4325 		mxge_verbose = 1;
4326 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4327 		mxge_intr_coal_delay = 30;
4328 	if (mxge_ticks == 0)
4329 		mxge_ticks = hz / 2;
4330 	sc->pause = mxge_flow_control;
4331 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4332 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4333 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4334 	}
4335 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4336 	    mxge_initial_mtu < ETHER_MIN_LEN)
4337 		mxge_initial_mtu = ETHERMTU_JUMBO;
4338 
4339 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4340 		mxge_throttle = MXGE_MAX_THROTTLE;
4341 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4342 		mxge_throttle = MXGE_MIN_THROTTLE;
4343 	sc->throttle = mxge_throttle;
4344 }
4345 
4346 
4347 static void
4348 mxge_free_slices(mxge_softc_t *sc)
4349 {
4350 	struct mxge_slice_state *ss;
4351 	int i;
4352 
4353 
4354 	if (sc->ss == NULL)
4355 		return;
4356 
4357 	for (i = 0; i < sc->num_slices; i++) {
4358 		ss = &sc->ss[i];
4359 		if (ss->fw_stats != NULL) {
4360 			mxge_dma_free(&ss->fw_stats_dma);
4361 			ss->fw_stats = NULL;
4362 #ifdef IFNET_BUF_RING
4363 			if (ss->tx.br != NULL) {
4364 				drbr_free(ss->tx.br, M_DEVBUF);
4365 				ss->tx.br = NULL;
4366 			}
4367 #endif
4368 			mtx_destroy(&ss->tx.mtx);
4369 		}
4370 		if (ss->rx_done.entry != NULL) {
4371 			mxge_dma_free(&ss->rx_done.dma);
4372 			ss->rx_done.entry = NULL;
4373 		}
4374 	}
4375 	free(sc->ss, M_DEVBUF);
4376 	sc->ss = NULL;
4377 }
4378 
4379 static int
4380 mxge_alloc_slices(mxge_softc_t *sc)
4381 {
4382 	mxge_cmd_t cmd;
4383 	struct mxge_slice_state *ss;
4384 	size_t bytes;
4385 	int err, i, max_intr_slots;
4386 
4387 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4388 	if (err != 0) {
4389 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4390 		return err;
4391 	}
4392 	sc->rx_ring_size = cmd.data0;
4393 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4394 
4395 	bytes = sizeof (*sc->ss) * sc->num_slices;
4396 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4397 	if (sc->ss == NULL)
4398 		return (ENOMEM);
4399 	for (i = 0; i < sc->num_slices; i++) {
4400 		ss = &sc->ss[i];
4401 
4402 		ss->sc = sc;
4403 
4404 		/* allocate per-slice rx interrupt queues */
4405 
4406 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4407 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4408 		if (err != 0)
4409 			goto abort;
4410 		ss->rx_done.entry = ss->rx_done.dma.addr;
4411 		bzero(ss->rx_done.entry, bytes);
4412 
4413 		/*
4414 		 * allocate the per-slice firmware stats; stats
4415 		 * (including tx) are used used only on the first
4416 		 * slice for now
4417 		 */
4418 #ifndef IFNET_BUF_RING
4419 		if (i > 0)
4420 			continue;
4421 #endif
4422 
4423 		bytes = sizeof (*ss->fw_stats);
4424 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4425 				     sizeof (*ss->fw_stats), 64);
4426 		if (err != 0)
4427 			goto abort;
4428 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4429 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4430 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4431 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4432 #ifdef IFNET_BUF_RING
4433 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4434 					   &ss->tx.mtx);
4435 #endif
4436 	}
4437 
4438 	return (0);
4439 
4440 abort:
4441 	mxge_free_slices(sc);
4442 	return (ENOMEM);
4443 }
4444 
4445 static void
4446 mxge_slice_probe(mxge_softc_t *sc)
4447 {
4448 	mxge_cmd_t cmd;
4449 	char *old_fw;
4450 	int msix_cnt, status, max_intr_slots;
4451 
4452 	sc->num_slices = 1;
4453 	/*
4454 	 *  don't enable multiple slices if they are not enabled,
4455 	 *  or if this is not an SMP system
4456 	 */
4457 
4458 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4459 		return;
4460 
4461 	/* see how many MSI-X interrupts are available */
4462 	msix_cnt = pci_msix_count(sc->dev);
4463 	if (msix_cnt < 2)
4464 		return;
4465 
4466 	/* now load the slice aware firmware see what it supports */
4467 	old_fw = sc->fw_name;
4468 	if (old_fw == mxge_fw_aligned)
4469 		sc->fw_name = mxge_fw_rss_aligned;
4470 	else
4471 		sc->fw_name = mxge_fw_rss_unaligned;
4472 	status = mxge_load_firmware(sc, 0);
4473 	if (status != 0) {
4474 		device_printf(sc->dev, "Falling back to a single slice\n");
4475 		return;
4476 	}
4477 
4478 	/* try to send a reset command to the card to see if it
4479 	   is alive */
4480 	memset(&cmd, 0, sizeof (cmd));
4481 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4482 	if (status != 0) {
4483 		device_printf(sc->dev, "failed reset\n");
4484 		goto abort_with_fw;
4485 	}
4486 
4487 	/* get rx ring size */
4488 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4489 	if (status != 0) {
4490 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4491 		goto abort_with_fw;
4492 	}
4493 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4494 
4495 	/* tell it the size of the interrupt queues */
4496 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4497 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4498 	if (status != 0) {
4499 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4500 		goto abort_with_fw;
4501 	}
4502 
4503 	/* ask the maximum number of slices it supports */
4504 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4505 	if (status != 0) {
4506 		device_printf(sc->dev,
4507 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4508 		goto abort_with_fw;
4509 	}
4510 	sc->num_slices = cmd.data0;
4511 	if (sc->num_slices > msix_cnt)
4512 		sc->num_slices = msix_cnt;
4513 
4514 	if (mxge_max_slices == -1) {
4515 		/* cap to number of CPUs in system */
4516 		if (sc->num_slices > mp_ncpus)
4517 			sc->num_slices = mp_ncpus;
4518 	} else {
4519 		if (sc->num_slices > mxge_max_slices)
4520 			sc->num_slices = mxge_max_slices;
4521 	}
4522 	/* make sure it is a power of two */
4523 	while (sc->num_slices & (sc->num_slices - 1))
4524 		sc->num_slices--;
4525 
4526 	if (mxge_verbose)
4527 		device_printf(sc->dev, "using %d slices\n",
4528 			      sc->num_slices);
4529 
4530 	return;
4531 
4532 abort_with_fw:
4533 	sc->fw_name = old_fw;
4534 	(void) mxge_load_firmware(sc, 0);
4535 }
4536 
4537 static int
4538 mxge_add_msix_irqs(mxge_softc_t *sc)
4539 {
4540 	size_t bytes;
4541 	int count, err, i, rid;
4542 
4543 	rid = PCIR_BAR(2);
4544 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4545 						    &rid, RF_ACTIVE);
4546 
4547 	if (sc->msix_table_res == NULL) {
4548 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4549 		return ENXIO;
4550 	}
4551 
4552 	count = sc->num_slices;
4553 	err = pci_alloc_msix(sc->dev, &count);
4554 	if (err != 0) {
4555 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4556 			      "err = %d \n", sc->num_slices, err);
4557 		goto abort_with_msix_table;
4558 	}
4559 	if (count < sc->num_slices) {
4560 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4561 			      count, sc->num_slices);
4562 		device_printf(sc->dev,
4563 			      "Try setting hw.mxge.max_slices to %d\n",
4564 			      count);
4565 		err = ENOSPC;
4566 		goto abort_with_msix;
4567 	}
4568 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4569 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4570 	if (sc->msix_irq_res == NULL) {
4571 		err = ENOMEM;
4572 		goto abort_with_msix;
4573 	}
4574 
4575 	for (i = 0; i < sc->num_slices; i++) {
4576 		rid = i + 1;
4577 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4578 							  SYS_RES_IRQ,
4579 							  &rid, RF_ACTIVE);
4580 		if (sc->msix_irq_res[i] == NULL) {
4581 			device_printf(sc->dev, "couldn't allocate IRQ res"
4582 				      " for message %d\n", i);
4583 			err = ENXIO;
4584 			goto abort_with_res;
4585 		}
4586 	}
4587 
4588 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4589 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4590 
4591 	for (i = 0; i < sc->num_slices; i++) {
4592 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4593 				     INTR_TYPE_NET | INTR_MPSAFE,
4594 #if __FreeBSD_version > 700030
4595 				     NULL,
4596 #endif
4597 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4598 		if (err != 0) {
4599 			device_printf(sc->dev, "couldn't setup intr for "
4600 				      "message %d\n", i);
4601 			goto abort_with_intr;
4602 		}
4603 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4604 				  sc->msix_ih[i], "s%d", i);
4605 	}
4606 
4607 	if (mxge_verbose) {
4608 		device_printf(sc->dev, "using %d msix IRQs:",
4609 			      sc->num_slices);
4610 		for (i = 0; i < sc->num_slices; i++)
4611 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4612 		printf("\n");
4613 	}
4614 	return (0);
4615 
4616 abort_with_intr:
4617 	for (i = 0; i < sc->num_slices; i++) {
4618 		if (sc->msix_ih[i] != NULL) {
4619 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4620 					  sc->msix_ih[i]);
4621 			sc->msix_ih[i] = NULL;
4622 		}
4623 	}
4624 	free(sc->msix_ih, M_DEVBUF);
4625 
4626 
4627 abort_with_res:
4628 	for (i = 0; i < sc->num_slices; i++) {
4629 		rid = i + 1;
4630 		if (sc->msix_irq_res[i] != NULL)
4631 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4632 					     sc->msix_irq_res[i]);
4633 		sc->msix_irq_res[i] = NULL;
4634 	}
4635 	free(sc->msix_irq_res, M_DEVBUF);
4636 
4637 
4638 abort_with_msix:
4639 	pci_release_msi(sc->dev);
4640 
4641 abort_with_msix_table:
4642 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4643 			     sc->msix_table_res);
4644 
4645 	return err;
4646 }
4647 
4648 static int
4649 mxge_add_single_irq(mxge_softc_t *sc)
4650 {
4651 	int count, err, rid;
4652 
4653 	count = pci_msi_count(sc->dev);
4654 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4655 		rid = 1;
4656 	} else {
4657 		rid = 0;
4658 		sc->legacy_irq = 1;
4659 	}
4660 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4661 					     RF_SHAREABLE | RF_ACTIVE);
4662 	if (sc->irq_res == NULL) {
4663 		device_printf(sc->dev, "could not alloc interrupt\n");
4664 		return ENXIO;
4665 	}
4666 	if (mxge_verbose)
4667 		device_printf(sc->dev, "using %s irq %jd\n",
4668 			      sc->legacy_irq ? "INTx" : "MSI",
4669 			      rman_get_start(sc->irq_res));
4670 	err = bus_setup_intr(sc->dev, sc->irq_res,
4671 			     INTR_TYPE_NET | INTR_MPSAFE,
4672 #if __FreeBSD_version > 700030
4673 			     NULL,
4674 #endif
4675 			     mxge_intr, &sc->ss[0], &sc->ih);
4676 	if (err != 0) {
4677 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4678 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4679 		if (!sc->legacy_irq)
4680 			pci_release_msi(sc->dev);
4681 	}
4682 	return err;
4683 }
4684 
4685 static void
4686 mxge_rem_msix_irqs(mxge_softc_t *sc)
4687 {
4688 	int i, rid;
4689 
4690 	for (i = 0; i < sc->num_slices; i++) {
4691 		if (sc->msix_ih[i] != NULL) {
4692 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4693 					  sc->msix_ih[i]);
4694 			sc->msix_ih[i] = NULL;
4695 		}
4696 	}
4697 	free(sc->msix_ih, M_DEVBUF);
4698 
4699 	for (i = 0; i < sc->num_slices; i++) {
4700 		rid = i + 1;
4701 		if (sc->msix_irq_res[i] != NULL)
4702 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4703 					     sc->msix_irq_res[i]);
4704 		sc->msix_irq_res[i] = NULL;
4705 	}
4706 	free(sc->msix_irq_res, M_DEVBUF);
4707 
4708 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4709 			     sc->msix_table_res);
4710 
4711 	pci_release_msi(sc->dev);
4712 	return;
4713 }
4714 
4715 static void
4716 mxge_rem_single_irq(mxge_softc_t *sc)
4717 {
4718 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4719 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4720 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4721 	if (!sc->legacy_irq)
4722 		pci_release_msi(sc->dev);
4723 }
4724 
4725 static void
4726 mxge_rem_irq(mxge_softc_t *sc)
4727 {
4728 	if (sc->num_slices > 1)
4729 		mxge_rem_msix_irqs(sc);
4730 	else
4731 		mxge_rem_single_irq(sc);
4732 }
4733 
4734 static int
4735 mxge_add_irq(mxge_softc_t *sc)
4736 {
4737 	int err;
4738 
4739 	if (sc->num_slices > 1)
4740 		err = mxge_add_msix_irqs(sc);
4741 	else
4742 		err = mxge_add_single_irq(sc);
4743 
4744 	if (0 && err == 0 && sc->num_slices > 1) {
4745 		mxge_rem_msix_irqs(sc);
4746 		err = mxge_add_msix_irqs(sc);
4747 	}
4748 	return err;
4749 }
4750 
4751 
4752 static int
4753 mxge_attach(device_t dev)
4754 {
4755 	mxge_cmd_t cmd;
4756 	mxge_softc_t *sc = device_get_softc(dev);
4757 	struct ifnet *ifp;
4758 	int err, rid;
4759 
4760 	sc->dev = dev;
4761 	mxge_fetch_tunables(sc);
4762 
4763 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4764 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4765 				  taskqueue_thread_enqueue, &sc->tq);
4766 	if (sc->tq == NULL) {
4767 		err = ENOMEM;
4768 		goto abort_with_nothing;
4769 	}
4770 
4771 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4772 				 1,			/* alignment */
4773 				 0,			/* boundary */
4774 				 BUS_SPACE_MAXADDR,	/* low */
4775 				 BUS_SPACE_MAXADDR,	/* high */
4776 				 NULL, NULL,		/* filter */
4777 				 65536 + 256,		/* maxsize */
4778 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4779 				 65536,			/* maxsegsize */
4780 				 0,			/* flags */
4781 				 NULL, NULL,		/* lock */
4782 				 &sc->parent_dmat);	/* tag */
4783 
4784 	if (err != 0) {
4785 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4786 			      err);
4787 		goto abort_with_tq;
4788 	}
4789 
4790 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4791 	if (ifp == NULL) {
4792 		device_printf(dev, "can not if_alloc()\n");
4793 		err = ENOSPC;
4794 		goto abort_with_parent_dmat;
4795 	}
4796 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4797 
4798 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4799 		 device_get_nameunit(dev));
4800 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4801 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4802 		 "%s:drv", device_get_nameunit(dev));
4803 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4804 		 MTX_NETWORK_LOCK, MTX_DEF);
4805 
4806 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4807 
4808 	mxge_setup_cfg_space(sc);
4809 
4810 	/* Map the board into the kernel */
4811 	rid = PCIR_BARS;
4812 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4813 					     RF_ACTIVE);
4814 	if (sc->mem_res == NULL) {
4815 		device_printf(dev, "could not map memory\n");
4816 		err = ENXIO;
4817 		goto abort_with_lock;
4818 	}
4819 	sc->sram = rman_get_virtual(sc->mem_res);
4820 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4821 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4822 		device_printf(dev, "impossible memory region size %jd\n",
4823 			      rman_get_size(sc->mem_res));
4824 		err = ENXIO;
4825 		goto abort_with_mem_res;
4826 	}
4827 
4828 	/* make NULL terminated copy of the EEPROM strings section of
4829 	   lanai SRAM */
4830 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4831 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4832 				rman_get_bushandle(sc->mem_res),
4833 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4834 				sc->eeprom_strings,
4835 				MXGE_EEPROM_STRINGS_SIZE - 2);
4836 	err = mxge_parse_strings(sc);
4837 	if (err != 0)
4838 		goto abort_with_mem_res;
4839 
4840 	/* Enable write combining for efficient use of PCIe bus */
4841 	mxge_enable_wc(sc);
4842 
4843 	/* Allocate the out of band dma memory */
4844 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4845 			     sizeof (mxge_cmd_t), 64);
4846 	if (err != 0)
4847 		goto abort_with_mem_res;
4848 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4849 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4850 	if (err != 0)
4851 		goto abort_with_cmd_dma;
4852 
4853 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4854 	if (err != 0)
4855 		goto abort_with_zeropad_dma;
4856 
4857 	/* select & load the firmware */
4858 	err = mxge_select_firmware(sc);
4859 	if (err != 0)
4860 		goto abort_with_dmabench;
4861 	sc->intr_coal_delay = mxge_intr_coal_delay;
4862 
4863 	mxge_slice_probe(sc);
4864 	err = mxge_alloc_slices(sc);
4865 	if (err != 0)
4866 		goto abort_with_dmabench;
4867 
4868 	err = mxge_reset(sc, 0);
4869 	if (err != 0)
4870 		goto abort_with_slices;
4871 
4872 	err = mxge_alloc_rings(sc);
4873 	if (err != 0) {
4874 		device_printf(sc->dev, "failed to allocate rings\n");
4875 		goto abort_with_slices;
4876 	}
4877 
4878 	err = mxge_add_irq(sc);
4879 	if (err != 0) {
4880 		device_printf(sc->dev, "failed to add irq\n");
4881 		goto abort_with_rings;
4882 	}
4883 
4884 	ifp->if_baudrate = IF_Gbps(10);
4885 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4886 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4887 		IFCAP_RXCSUM_IPV6;
4888 #if defined(INET) || defined(INET6)
4889 	ifp->if_capabilities |= IFCAP_LRO;
4890 #endif
4891 
4892 #ifdef MXGE_NEW_VLAN_API
4893 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4894 
4895 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4896 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4897 	    sc->fw_ver_tiny >= 32)
4898 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4899 #endif
4900 	sc->max_mtu = mxge_max_mtu(sc);
4901 	if (sc->max_mtu >= 9000)
4902 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4903 	else
4904 		device_printf(dev, "MTU limited to %d.  Install "
4905 			      "latest firmware for 9000 byte jumbo support\n",
4906 			      sc->max_mtu - ETHER_HDR_LEN);
4907 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4908 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4909 	/* check to see if f/w supports TSO for IPv6 */
4910 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4911 		if (CSUM_TCP_IPV6)
4912 			ifp->if_capabilities |= IFCAP_TSO6;
4913 		sc->max_tso6_hlen = min(cmd.data0,
4914 					sizeof (sc->ss[0].scratch));
4915 	}
4916 	ifp->if_capenable = ifp->if_capabilities;
4917 	if (sc->lro_cnt == 0)
4918 		ifp->if_capenable &= ~IFCAP_LRO;
4919 	ifp->if_init = mxge_init;
4920 	ifp->if_softc = sc;
4921 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4922 	ifp->if_ioctl = mxge_ioctl;
4923 	ifp->if_start = mxge_start;
4924 	ifp->if_get_counter = mxge_get_counter;
4925 	/* Initialise the ifmedia structure */
4926 	ifmedia_init(&sc->media, 0, mxge_media_change,
4927 		     mxge_media_status);
4928 	mxge_media_init(sc);
4929 	mxge_media_probe(sc);
4930 	sc->dying = 0;
4931 	ether_ifattach(ifp, sc->mac_addr);
4932 	/* ether_ifattach sets mtu to ETHERMTU */
4933 	if (mxge_initial_mtu != ETHERMTU)
4934 		mxge_change_mtu(sc, mxge_initial_mtu);
4935 
4936 	mxge_add_sysctls(sc);
4937 #ifdef IFNET_BUF_RING
4938 	ifp->if_transmit = mxge_transmit;
4939 	ifp->if_qflush = mxge_qflush;
4940 #endif
4941 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4942 				device_get_nameunit(sc->dev));
4943 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4944 	return 0;
4945 
4946 abort_with_rings:
4947 	mxge_free_rings(sc);
4948 abort_with_slices:
4949 	mxge_free_slices(sc);
4950 abort_with_dmabench:
4951 	mxge_dma_free(&sc->dmabench_dma);
4952 abort_with_zeropad_dma:
4953 	mxge_dma_free(&sc->zeropad_dma);
4954 abort_with_cmd_dma:
4955 	mxge_dma_free(&sc->cmd_dma);
4956 abort_with_mem_res:
4957 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4958 abort_with_lock:
4959 	pci_disable_busmaster(dev);
4960 	mtx_destroy(&sc->cmd_mtx);
4961 	mtx_destroy(&sc->driver_mtx);
4962 	if_free(ifp);
4963 abort_with_parent_dmat:
4964 	bus_dma_tag_destroy(sc->parent_dmat);
4965 abort_with_tq:
4966 	if (sc->tq != NULL) {
4967 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4968 		taskqueue_free(sc->tq);
4969 		sc->tq = NULL;
4970 	}
4971 abort_with_nothing:
4972 	return err;
4973 }
4974 
4975 static int
4976 mxge_detach(device_t dev)
4977 {
4978 	mxge_softc_t *sc = device_get_softc(dev);
4979 
4980 	if (mxge_vlans_active(sc)) {
4981 		device_printf(sc->dev,
4982 			      "Detach vlans before removing module\n");
4983 		return EBUSY;
4984 	}
4985 	mtx_lock(&sc->driver_mtx);
4986 	sc->dying = 1;
4987 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4988 		mxge_close(sc, 0);
4989 	mtx_unlock(&sc->driver_mtx);
4990 	ether_ifdetach(sc->ifp);
4991 	if (sc->tq != NULL) {
4992 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4993 		taskqueue_free(sc->tq);
4994 		sc->tq = NULL;
4995 	}
4996 	callout_drain(&sc->co_hdl);
4997 	ifmedia_removeall(&sc->media);
4998 	mxge_dummy_rdma(sc, 0);
4999 	mxge_rem_sysctls(sc);
5000 	mxge_rem_irq(sc);
5001 	mxge_free_rings(sc);
5002 	mxge_free_slices(sc);
5003 	mxge_dma_free(&sc->dmabench_dma);
5004 	mxge_dma_free(&sc->zeropad_dma);
5005 	mxge_dma_free(&sc->cmd_dma);
5006 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5007 	pci_disable_busmaster(dev);
5008 	mtx_destroy(&sc->cmd_mtx);
5009 	mtx_destroy(&sc->driver_mtx);
5010 	if_free(sc->ifp);
5011 	bus_dma_tag_destroy(sc->parent_dmat);
5012 	return 0;
5013 }
5014 
5015 static int
5016 mxge_shutdown(device_t dev)
5017 {
5018 	return 0;
5019 }
5020 
5021 /*
5022   This file uses Myri10GE driver indentation.
5023 
5024   Local Variables:
5025   c-file-style:"linux"
5026   tab-width:8
5027   End:
5028 */
5029