xref: /freebsd/sys/dev/mxge/if_mxge.c (revision dda5b39711dab90ae1c5624bdd6ff7453177df31)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/if_arp.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56 
57 #include <net/bpf.h>
58 
59 #include <net/if_types.h>
60 #include <net/if_vlan_var.h>
61 #include <net/zlib.h>
62 
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip6.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_lro.h>
69 #include <netinet6/ip6_var.h>
70 
71 #include <machine/bus.h>
72 #include <machine/in_cksum.h>
73 #include <machine/resource.h>
74 #include <sys/bus.h>
75 #include <sys/rman.h>
76 #include <sys/smp.h>
77 
78 #include <dev/pci/pcireg.h>
79 #include <dev/pci/pcivar.h>
80 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
81 
82 #include <vm/vm.h>		/* for pmap_mapdev() */
83 #include <vm/pmap.h>
84 
85 #if defined(__i386) || defined(__amd64)
86 #include <machine/specialreg.h>
87 #endif
88 
89 #include <dev/mxge/mxge_mcp.h>
90 #include <dev/mxge/mcp_gen_header.h>
91 /*#define MXGE_FAKE_IFP*/
92 #include <dev/mxge/if_mxge_var.h>
93 #ifdef IFNET_BUF_RING
94 #include <sys/buf_ring.h>
95 #endif
96 
97 #include "opt_inet.h"
98 #include "opt_inet6.h"
99 
/*
 * tunable params
 *
 * Only the knobs whose use is visible next to these declarations are
 * annotated; the remaining ones are consumed elsewhere in the driver.
 */
/* non-zero: attempt to enable ECRC on an upstream Nvidia bridge */
static int mxge_nvidia_ecrc_enable = 1;
/* 0: probe for the best firmware; 1: force aligned; other: force unaligned */
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
/* non-zero: emit additional informational device_printf() output */
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
/* non-zero: mxge_change_promisc() always enables promiscuous mode */
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
/* firmware(9) image names; the "ethp" images are the unaligned variants */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
117 
/* Entry points referenced by the method table below. */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

/* Maps the generic device interface methods onto this driver's handlers. */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};
134 
/* Driver description: name, method table and per-device softc size. */
static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* The driver uses firmware_get()/firmware_put() and zlib's inflate(). */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);
148 
/*
 * Forward declarations.  mxge_load_firmware() and mxge_send_cmd() are
 * called by the firmware probing code before their definitions appear.
 */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
154 
155 static int
156 mxge_probe(device_t dev)
157 {
158 	int rev;
159 
160 
161 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
162 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
163 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
164 		rev = pci_get_revid(dev);
165 		switch (rev) {
166 		case MXGE_PCI_REV_Z8E:
167 			device_set_desc(dev, "Myri10G-PCIE-8A");
168 			break;
169 		case MXGE_PCI_REV_Z8ES:
170 			device_set_desc(dev, "Myri10G-PCIE-8B");
171 			break;
172 		default:
173 			device_set_desc(dev, "Myri10G-PCIE-8??");
174 			device_printf(dev, "Unrecognized rev %d NIC\n",
175 				      rev);
176 			break;
177 		}
178 		return 0;
179 	}
180 	return ENXIO;
181 }
182 
183 static void
184 mxge_enable_wc(mxge_softc_t *sc)
185 {
186 #if defined(__i386) || defined(__amd64)
187 	vm_offset_t len;
188 	int err;
189 
190 	sc->wc = 1;
191 	len = rman_get_size(sc->mem_res);
192 	err = pmap_change_attr((vm_offset_t) sc->sram,
193 			       len, PAT_WRITE_COMBINING);
194 	if (err != 0) {
195 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
196 			      err);
197 		sc->wc = 0;
198 	}
199 #endif
200 }
201 
202 
203 /* callback to get our DMA address */
204 static void
205 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
206 			 int error)
207 {
208 	if (error == 0) {
209 		*(bus_addr_t *) arg = segs->ds_addr;
210 	}
211 }
212 
213 static int
214 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
215 		   bus_size_t alignment)
216 {
217 	int err;
218 	device_t dev = sc->dev;
219 	bus_size_t boundary, maxsegsize;
220 
221 	if (bytes > 4096 && alignment == 4096) {
222 		boundary = 0;
223 		maxsegsize = bytes;
224 	} else {
225 		boundary = 4096;
226 		maxsegsize = 4096;
227 	}
228 
229 	/* allocate DMAable memory tags */
230 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
231 				 alignment,		/* alignment */
232 				 boundary,		/* boundary */
233 				 BUS_SPACE_MAXADDR,	/* low */
234 				 BUS_SPACE_MAXADDR,	/* high */
235 				 NULL, NULL,		/* filter */
236 				 bytes,			/* maxsize */
237 				 1,			/* num segs */
238 				 maxsegsize,		/* maxsegsize */
239 				 BUS_DMA_COHERENT,	/* flags */
240 				 NULL, NULL,		/* lock */
241 				 &dma->dmat);		/* tag */
242 	if (err != 0) {
243 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
244 		return err;
245 	}
246 
247 	/* allocate DMAable memory & map */
248 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
249 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
250 				| BUS_DMA_ZERO),  &dma->map);
251 	if (err != 0) {
252 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
253 		goto abort_with_dmat;
254 	}
255 
256 	/* load the memory */
257 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
258 			      mxge_dmamap_callback,
259 			      (void *)&dma->bus_addr, 0);
260 	if (err != 0) {
261 		device_printf(dev, "couldn't load map (err = %d)\n", err);
262 		goto abort_with_mem;
263 	}
264 	return 0;
265 
266 abort_with_mem:
267 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
268 abort_with_dmat:
269 	(void)bus_dma_tag_destroy(dma->dmat);
270 	return err;
271 }
272 
273 
274 static void
275 mxge_dma_free(mxge_dma_t *dma)
276 {
277 	bus_dmamap_unload(dma->dmat, dma->map);
278 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
279 	(void)bus_dma_tag_destroy(dma->dmat);
280 }
281 
282 /*
283  * The eeprom strings on the lanaiX have the format
284  * SN=x\0
285  * MAC=x:x:x:x:x:x\0
286  * PC=text\0
287  */
288 
289 static int
290 mxge_parse_strings(mxge_softc_t *sc)
291 {
292 	char *ptr;
293 	int i, found_mac, found_sn2;
294 	char *endptr;
295 
296 	ptr = sc->eeprom_strings;
297 	found_mac = 0;
298 	found_sn2 = 0;
299 	while (*ptr != '\0') {
300 		if (strncmp(ptr, "MAC=", 4) == 0) {
301 			ptr += 4;
302 			for (i = 0;;) {
303 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
304 				if (endptr - ptr != 2)
305 					goto abort;
306 				ptr = endptr;
307 				if (++i == 6)
308 					break;
309 				if (*ptr++ != ':')
310 					goto abort;
311 			}
312 			found_mac = 1;
313 		} else if (strncmp(ptr, "PC=", 3) == 0) {
314 			ptr += 3;
315 			strlcpy(sc->product_code_string, ptr,
316 			    sizeof(sc->product_code_string));
317 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
318 			ptr += 3;
319 			strlcpy(sc->serial_number_string, ptr,
320 			    sizeof(sc->serial_number_string));
321 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
322 			/* SN2 takes precedence over SN */
323 			ptr += 4;
324 			found_sn2 = 1;
325 			strlcpy(sc->serial_number_string, ptr,
326 			    sizeof(sc->serial_number_string));
327 		}
328 		while (*ptr++ != '\0') {}
329 	}
330 
331 	if (found_mac)
332 		return 0;
333 
334  abort:
335 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
336 
337 	return ENXIO;
338 }
339 
/*
 * Try to enable ECRC generation on an upstream Nvidia bridge.
 *
 * On x86 the grandparent of the NIC device is examined; if it is an Nvidia
 * (vendor 0x10de) ck804 or mcp55 bridge, its config space is mapped by
 * hand and bit 0x40 of the 32-bit register at config offset 0x178 is set.
 * Every failure path simply returns, leaving the bridge untouched.  The
 * mxge_nvidia_ecrc_enable tunable disables the whole procedure.
 *
 * On other architectures the function only logs a message.
 */
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* the device two levels up is treated as the upstream bridge */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* only Nvidia bridges are handled */
	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	/* unknown bridge, or the mcp55 base could not be determined */
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe0000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* NOTE(review): the BUS_READ_IVAR() return values are not checked */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* 1MB of config window per bus, 4KB per (slot, function) */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	/*
	 * NOTE(review): the message below names pmap_kenter_temporary
	 * although the call that failed is pmap_mapdev().
	 */
	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	/* same register the disabled pci_read_config() path would use */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	/* all ones means the extended register is not reachable */
	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
/* Non-x86 build: nothing can be done, so only log the oddity. */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
473 
474 
475 static int
476 mxge_dma_test(mxge_softc_t *sc, int test_type)
477 {
478 	mxge_cmd_t cmd;
479 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
480 	int status;
481 	uint32_t len;
482 	char *test = " ";
483 
484 
485 	/* Run a small DMA test.
486 	 * The magic multipliers to the length tell the firmware
487 	 * to do DMA read, write, or read+write tests.  The
488 	 * results are returned in cmd.data0.  The upper 16
489 	 * bits of the return is the number of transfers completed.
490 	 * The lower 16 bits is the time in 0.5us ticks that the
491 	 * transfers took to complete.
492 	 */
493 
494 	len = sc->tx_boundary;
495 
496 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
497 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
498 	cmd.data2 = len * 0x10000;
499 	status = mxge_send_cmd(sc, test_type, &cmd);
500 	if (status != 0) {
501 		test = "read";
502 		goto abort;
503 	}
504 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
505 		(cmd.data0 & 0xffff);
506 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
507 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
508 	cmd.data2 = len * 0x1;
509 	status = mxge_send_cmd(sc, test_type, &cmd);
510 	if (status != 0) {
511 		test = "write";
512 		goto abort;
513 	}
514 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
515 		(cmd.data0 & 0xffff);
516 
517 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
518 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
519 	cmd.data2 = len * 0x10001;
520 	status = mxge_send_cmd(sc, test_type, &cmd);
521 	if (status != 0) {
522 		test = "read/write";
523 		goto abort;
524 	}
525 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
526 		(cmd.data0 & 0xffff);
527 
528 abort:
529 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
530 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
531 			      test, status);
532 
533 	return status;
534 }
535 
536 /*
537  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
538  * when the PCI-E Completion packets are aligned on an 8-byte
539  * boundary.  Some PCI-E chip sets always align Completion packets; on
540  * the ones that do not, the alignment can be enforced by enabling
541  * ECRC generation (if supported).
542  *
543  * When PCI-E Completion packets are not aligned, it is actually more
544  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
545  *
546  * If the driver can neither enable ECRC nor verify that it has
547  * already been enabled, then it must use a firmware image which works
548  * around unaligned completion packets (ethp_z8e.dat), and it should
549  * also ensure that it never gives the device a Read-DMA which is
550  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
551  * enabled, then the driver should use the aligned (eth_z8e.dat)
552  * firmware image, and set tx_boundary to 4KB.
553  */
554 
555 static int
556 mxge_firmware_probe(mxge_softc_t *sc)
557 {
558 	device_t dev = sc->dev;
559 	int reg, status;
560 	uint16_t pectl;
561 
562 	sc->tx_boundary = 4096;
563 	/*
564 	 * Verify the max read request size was set to 4KB
565 	 * before trying the test with 4KB.
566 	 */
567 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
568 		pectl = pci_read_config(dev, reg + 0x8, 2);
569 		if ((pectl & (5 << 12)) != (5 << 12)) {
570 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
571 				      pectl);
572 			sc->tx_boundary = 2048;
573 		}
574 	}
575 
576 	/*
577 	 * load the optimized firmware (which assumes aligned PCIe
578 	 * completions) in order to see if it works on this host.
579 	 */
580 	sc->fw_name = mxge_fw_aligned;
581 	status = mxge_load_firmware(sc, 1);
582 	if (status != 0) {
583 		return status;
584 	}
585 
586 	/*
587 	 * Enable ECRC if possible
588 	 */
589 	mxge_enable_nvidia_ecrc(sc);
590 
591 	/*
592 	 * Run a DMA test which watches for unaligned completions and
593 	 * aborts on the first one seen.  Not required on Z8ES or newer.
594 	 */
595 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
596 		return 0;
597 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
598 	if (status == 0)
599 		return 0; /* keep the aligned firmware */
600 
601 	if (status != E2BIG)
602 		device_printf(dev, "DMA test failed: %d\n", status);
603 	if (status == ENOSYS)
604 		device_printf(dev, "Falling back to ethp! "
605 			      "Please install up to date fw\n");
606 	return status;
607 }
608 
609 static int
610 mxge_select_firmware(mxge_softc_t *sc)
611 {
612 	int aligned = 0;
613 	int force_firmware = mxge_force_firmware;
614 
615 	if (sc->throttle)
616 		force_firmware = sc->throttle;
617 
618 	if (force_firmware != 0) {
619 		if (force_firmware == 1)
620 			aligned = 1;
621 		else
622 			aligned = 0;
623 		if (mxge_verbose)
624 			device_printf(sc->dev,
625 				      "Assuming %s completions (forced)\n",
626 				      aligned ? "aligned" : "unaligned");
627 		goto abort;
628 	}
629 
630 	/* if the PCIe link width is 4 or less, we can use the aligned
631 	   firmware and skip any checks */
632 	if (sc->link_width != 0 && sc->link_width <= 4) {
633 		device_printf(sc->dev,
634 			      "PCIe x%d Link, expect reduced performance\n",
635 			      sc->link_width);
636 		aligned = 1;
637 		goto abort;
638 	}
639 
640 	if (0 == mxge_firmware_probe(sc))
641 		return 0;
642 
643 abort:
644 	if (aligned) {
645 		sc->fw_name = mxge_fw_aligned;
646 		sc->tx_boundary = 4096;
647 	} else {
648 		sc->fw_name = mxge_fw_unaligned;
649 		sc->tx_boundary = 2048;
650 	}
651 	return (mxge_load_firmware(sc, 0));
652 }
653 
654 static int
655 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
656 {
657 
658 
659 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
660 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
661 			      be32toh(hdr->mcp_type));
662 		return EIO;
663 	}
664 
665 	/* save firmware version for sysctl */
666 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
667 	if (mxge_verbose)
668 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
669 
670 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
671 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
672 
673 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
674 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
675 		device_printf(sc->dev, "Found firmware version %s\n",
676 			      sc->fw_version);
677 		device_printf(sc->dev, "Driver needs %d.%d\n",
678 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
679 		return EINVAL;
680 	}
681 	return 0;
682 
683 }
684 
685 static void *
686 z_alloc(void *nil, u_int items, u_int size)
687 {
688         void *ptr;
689 
690         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
691         return ptr;
692 }
693 
694 static void
695 z_free(void *nil, void *ptr)
696 {
697         free(ptr, M_TEMP);
698 }
699 
700 
701 static int
702 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
703 {
704 	z_stream zs;
705 	char *inflate_buffer;
706 	const struct firmware *fw;
707 	const mcp_gen_header_t *hdr;
708 	unsigned hdr_offset;
709 	int status;
710 	unsigned int i;
711 	char dummy;
712 	size_t fw_len;
713 
714 	fw = firmware_get(sc->fw_name);
715 	if (fw == NULL) {
716 		device_printf(sc->dev, "Could not find firmware image %s\n",
717 			      sc->fw_name);
718 		return ENOENT;
719 	}
720 
721 
722 
723 	/* setup zlib and decompress f/w */
724 	bzero(&zs, sizeof (zs));
725 	zs.zalloc = z_alloc;
726 	zs.zfree = z_free;
727 	status = inflateInit(&zs);
728 	if (status != Z_OK) {
729 		status = EIO;
730 		goto abort_with_fw;
731 	}
732 
733 	/* the uncompressed size is stored as the firmware version,
734 	   which would otherwise go unused */
735 	fw_len = (size_t) fw->version;
736 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
737 	if (inflate_buffer == NULL)
738 		goto abort_with_zs;
739 	zs.avail_in = fw->datasize;
740 	zs.next_in = __DECONST(char *, fw->data);
741 	zs.avail_out = fw_len;
742 	zs.next_out = inflate_buffer;
743 	status = inflate(&zs, Z_FINISH);
744 	if (status != Z_STREAM_END) {
745 		device_printf(sc->dev, "zlib %d\n", status);
746 		status = EIO;
747 		goto abort_with_buffer;
748 	}
749 
750 	/* check id */
751 	hdr_offset = htobe32(*(const uint32_t *)
752 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
753 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
754 		device_printf(sc->dev, "Bad firmware file");
755 		status = EIO;
756 		goto abort_with_buffer;
757 	}
758 	hdr = (const void*)(inflate_buffer + hdr_offset);
759 
760 	status = mxge_validate_firmware(sc, hdr);
761 	if (status != 0)
762 		goto abort_with_buffer;
763 
764 	/* Copy the inflated firmware to NIC SRAM. */
765 	for (i = 0; i < fw_len; i += 256) {
766 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
767 			      inflate_buffer + i,
768 			      min(256U, (unsigned)(fw_len - i)));
769 		wmb();
770 		dummy = *sc->sram;
771 		wmb();
772 	}
773 
774 	*limit = fw_len;
775 	status = 0;
776 abort_with_buffer:
777 	free(inflate_buffer, M_TEMP);
778 abort_with_zs:
779 	inflateEnd(&zs);
780 abort_with_fw:
781 	firmware_put(fw, FIRMWARE_UNLOAD);
782 	return status;
783 }
784 
785 /*
786  * Enable or disable periodic RDMAs from the host to make certain
787  * chipsets resend dropped PCIe messages
788  */
789 
790 static void
791 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
792 {
793 	char buf_bytes[72];
794 	volatile uint32_t *confirm;
795 	volatile char *submit;
796 	uint32_t *buf, dma_low, dma_high;
797 	int i;
798 
799 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
800 
801 	/* clear confirmation addr */
802 	confirm = (volatile uint32_t *)sc->cmd;
803 	*confirm = 0;
804 	wmb();
805 
806 	/* send an rdma command to the PCIe engine, and wait for the
807 	   response in the confirmation address.  The firmware should
808 	   write a -1 there to indicate it is alive and well
809 	*/
810 
811 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
812 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
813 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
814 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
815 	buf[2] = htobe32(0xffffffff);		/* confirm data */
816 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
817 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
818 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
819 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
820 	buf[5] = htobe32(enable);			/* enable? */
821 
822 
823 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
824 
825 	mxge_pio_copy(submit, buf, 64);
826 	wmb();
827 	DELAY(1000);
828 	wmb();
829 	i = 0;
830 	while (*confirm != 0xffffffff && i < 20) {
831 		DELAY(1000);
832 		i++;
833 	}
834 	if (*confirm != 0xffffffff) {
835 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
836 			      (enable ? "enable" : "disable"), confirm,
837 			      *confirm);
838 	}
839 	return;
840 }
841 
842 static int
843 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
844 {
845 	mcp_cmd_t *buf;
846 	char buf_bytes[sizeof(*buf) + 8];
847 	volatile mcp_cmd_response_t *response = sc->cmd;
848 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
849 	uint32_t dma_low, dma_high;
850 	int err, sleep_total = 0;
851 
852 	/* ensure buf is aligned to 8 bytes */
853 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
854 
855 	buf->data0 = htobe32(data->data0);
856 	buf->data1 = htobe32(data->data1);
857 	buf->data2 = htobe32(data->data2);
858 	buf->cmd = htobe32(cmd);
859 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
860 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
861 
862 	buf->response_addr.low = htobe32(dma_low);
863 	buf->response_addr.high = htobe32(dma_high);
864 	mtx_lock(&sc->cmd_mtx);
865 	response->result = 0xffffffff;
866 	wmb();
867 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
868 
869 	/* wait up to 20ms */
870 	err = EAGAIN;
871 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
872 		bus_dmamap_sync(sc->cmd_dma.dmat,
873 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
874 		wmb();
875 		switch (be32toh(response->result)) {
876 		case 0:
877 			data->data0 = be32toh(response->data);
878 			err = 0;
879 			break;
880 		case 0xffffffff:
881 			DELAY(1000);
882 			break;
883 		case MXGEFW_CMD_UNKNOWN:
884 			err = ENOSYS;
885 			break;
886 		case MXGEFW_CMD_ERROR_UNALIGNED:
887 			err = E2BIG;
888 			break;
889 		case MXGEFW_CMD_ERROR_BUSY:
890 			err = EBUSY;
891 			break;
892 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
893 			err = ENXIO;
894 			break;
895 		default:
896 			device_printf(sc->dev,
897 				      "mxge: command %d "
898 				      "failed, result = %d\n",
899 				      cmd, be32toh(response->result));
900 			err = ENXIO;
901 			break;
902 		}
903 		if (err != EAGAIN)
904 			break;
905 	}
906 	if (err == EAGAIN)
907 		device_printf(sc->dev, "mxge: command %d timed out"
908 			      "result = %d\n",
909 			      cmd, be32toh(response->result));
910 	mtx_unlock(&sc->cmd_mtx);
911 	return err;
912 }
913 
914 static int
915 mxge_adopt_running_firmware(mxge_softc_t *sc)
916 {
917 	struct mcp_gen_header *hdr;
918 	const size_t bytes = sizeof (struct mcp_gen_header);
919 	size_t hdr_offset;
920 	int status;
921 
922 	/* find running firmware header */
923 	hdr_offset = htobe32(*(volatile uint32_t *)
924 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
925 
926 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
927 		device_printf(sc->dev,
928 			      "Running firmware has bad header offset (%d)\n",
929 			      (int)hdr_offset);
930 		return EIO;
931 	}
932 
933 	/* copy header of running firmware from SRAM to host memory to
934 	 * validate firmware */
935 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
936 	if (hdr == NULL) {
937 		device_printf(sc->dev, "could not malloc firmware hdr\n");
938 		return ENOMEM;
939 	}
940 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
941 				rman_get_bushandle(sc->mem_res),
942 				hdr_offset, (char *)hdr, bytes);
943 	status = mxge_validate_firmware(sc, hdr);
944 	free(hdr, M_DEVBUF);
945 
946 	/*
947 	 * check to see if adopted firmware has bug where adopting
948 	 * it will cause broadcasts to be filtered unless the NIC
949 	 * is kept in ALLMULTI mode
950 	 */
951 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
952 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
953 		sc->adopted_rx_filter_bug = 1;
954 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
955 			      "working around rx filter bug\n",
956 			      sc->fw_ver_major, sc->fw_ver_minor,
957 			      sc->fw_ver_tiny);
958 	}
959 
960 	return status;
961 }
962 
963 
964 static int
965 mxge_load_firmware(mxge_softc_t *sc, int adopt)
966 {
967 	volatile uint32_t *confirm;
968 	volatile char *submit;
969 	char buf_bytes[72];
970 	uint32_t *buf, size, dma_low, dma_high;
971 	int status, i;
972 
973 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
974 
975 	size = sc->sram_size;
976 	status = mxge_load_firmware_helper(sc, &size);
977 	if (status) {
978 		if (!adopt)
979 			return status;
980 		/* Try to use the currently running firmware, if
981 		   it is new enough */
982 		status = mxge_adopt_running_firmware(sc);
983 		if (status) {
984 			device_printf(sc->dev,
985 				      "failed to adopt running firmware\n");
986 			return status;
987 		}
988 		device_printf(sc->dev,
989 			      "Successfully adopted running firmware\n");
990 		if (sc->tx_boundary == 4096) {
991 			device_printf(sc->dev,
992 				"Using firmware currently running on NIC"
993 				 ".  For optimal\n");
994 			device_printf(sc->dev,
995 				 "performance consider loading optimized "
996 				 "firmware\n");
997 		}
998 		sc->fw_name = mxge_fw_unaligned;
999 		sc->tx_boundary = 2048;
1000 		return 0;
1001 	}
1002 	/* clear confirmation addr */
1003 	confirm = (volatile uint32_t *)sc->cmd;
1004 	*confirm = 0;
1005 	wmb();
1006 	/* send a reload command to the bootstrap MCP, and wait for the
1007 	   response in the confirmation address.  The firmware should
1008 	   write a -1 there to indicate it is alive and well
1009 	*/
1010 
1011 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1012 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1013 
1014 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1015 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1016 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1017 
1018 	/* FIX: All newest firmware should un-protect the bottom of
1019 	   the sram before handoff. However, the very first interfaces
1020 	   do not. Therefore the handoff copy must skip the first 8 bytes
1021 	*/
1022 					/* where the code starts*/
1023 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1024 	buf[4] = htobe32(size - 8); 	/* length of code */
1025 	buf[5] = htobe32(8);		/* where to copy to */
1026 	buf[6] = htobe32(0);		/* where to jump to */
1027 
1028 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1029 	mxge_pio_copy(submit, buf, 64);
1030 	wmb();
1031 	DELAY(1000);
1032 	wmb();
1033 	i = 0;
1034 	while (*confirm != 0xffffffff && i < 20) {
1035 		DELAY(1000*10);
1036 		i++;
1037 		bus_dmamap_sync(sc->cmd_dma.dmat,
1038 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1039 	}
1040 	if (*confirm != 0xffffffff) {
1041 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1042 			confirm, *confirm);
1043 
1044 		return ENXIO;
1045 	}
1046 	return 0;
1047 }
1048 
1049 static int
1050 mxge_update_mac_address(mxge_softc_t *sc)
1051 {
1052 	mxge_cmd_t cmd;
1053 	uint8_t *addr = sc->mac_addr;
1054 	int status;
1055 
1056 
1057 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1058 		     | (addr[2] << 8) | addr[3]);
1059 
1060 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1061 
1062 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1063 	return status;
1064 }
1065 
1066 static int
1067 mxge_change_pause(mxge_softc_t *sc, int pause)
1068 {
1069 	mxge_cmd_t cmd;
1070 	int status;
1071 
1072 	if (pause)
1073 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1074 				       &cmd);
1075 	else
1076 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1077 				       &cmd);
1078 
1079 	if (status) {
1080 		device_printf(sc->dev, "Failed to set flow control mode\n");
1081 		return ENXIO;
1082 	}
1083 	sc->pause = pause;
1084 	return 0;
1085 }
1086 
1087 static void
1088 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1089 {
1090 	mxge_cmd_t cmd;
1091 	int status;
1092 
1093 	if (mxge_always_promisc)
1094 		promisc = 1;
1095 
1096 	if (promisc)
1097 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1098 				       &cmd);
1099 	else
1100 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1101 				       &cmd);
1102 
1103 	if (status) {
1104 		device_printf(sc->dev, "Failed to set promisc mode\n");
1105 	}
1106 }
1107 
1108 static void
1109 mxge_set_multicast_list(mxge_softc_t *sc)
1110 {
1111 	mxge_cmd_t cmd;
1112 	struct ifmultiaddr *ifma;
1113 	struct ifnet *ifp = sc->ifp;
1114 	int err;
1115 
1116 	/* This firmware is known to not support multicast */
1117 	if (!sc->fw_multicast_support)
1118 		return;
1119 
1120 	/* Disable multicast filtering while we play with the lists*/
1121 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1122 	if (err != 0) {
1123 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1124 		       " error status: %d\n", err);
1125 		return;
1126 	}
1127 
1128 	if (sc->adopted_rx_filter_bug)
1129 		return;
1130 
1131 	if (ifp->if_flags & IFF_ALLMULTI)
1132 		/* request to disable multicast filtering, so quit here */
1133 		return;
1134 
1135 	/* Flush all the filters */
1136 
1137 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1138 	if (err != 0) {
1139 		device_printf(sc->dev,
1140 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1141 			      ", error status: %d\n", err);
1142 		return;
1143 	}
1144 
1145 	/* Walk the multicast list, and add each address */
1146 
1147 	if_maddr_rlock(ifp);
1148 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1149 		if (ifma->ifma_addr->sa_family != AF_LINK)
1150 			continue;
1151 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1152 		      &cmd.data0, 4);
1153 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1154 		      &cmd.data1, 2);
1155 		cmd.data0 = htonl(cmd.data0);
1156 		cmd.data1 = htonl(cmd.data1);
1157 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1158 		if (err != 0) {
1159 			device_printf(sc->dev, "Failed "
1160 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1161 			       "%d\t", err);
1162 			/* abort, leaving multicast filtering off */
1163 			if_maddr_runlock(ifp);
1164 			return;
1165 		}
1166 	}
1167 	if_maddr_runlock(ifp);
1168 	/* Enable multicast filtering */
1169 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1170 	if (err != 0) {
1171 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1172 		       ", error status: %d\n", err);
1173 	}
1174 }
1175 
1176 static int
1177 mxge_max_mtu(mxge_softc_t *sc)
1178 {
1179 	mxge_cmd_t cmd;
1180 	int status;
1181 
1182 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1183 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1184 
1185 	/* try to set nbufs to see if it we can
1186 	   use virtually contiguous jumbos */
1187 	cmd.data0 = 0;
1188 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1189 			       &cmd);
1190 	if (status == 0)
1191 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1192 
1193 	/* otherwise, we're limited to MJUMPAGESIZE */
1194 	return MJUMPAGESIZE - MXGEFW_PAD;
1195 }
1196 
1197 static int
1198 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1199 {
1200 	struct mxge_slice_state *ss;
1201 	mxge_rx_done_t *rx_done;
1202 	volatile uint32_t *irq_claim;
1203 	mxge_cmd_t cmd;
1204 	int slice, status;
1205 
1206 	/* try to send a reset command to the card to see if it
1207 	   is alive */
1208 	memset(&cmd, 0, sizeof (cmd));
1209 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1210 	if (status != 0) {
1211 		device_printf(sc->dev, "failed reset\n");
1212 		return ENXIO;
1213 	}
1214 
1215 	mxge_dummy_rdma(sc, 1);
1216 
1217 
1218 	/* set the intrq size */
1219 	cmd.data0 = sc->rx_ring_size;
1220 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1221 
1222 	/*
1223 	 * Even though we already know how many slices are supported
1224 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1225 	 * has magic side effects, and must be called after a reset.
1226 	 * It must be called prior to calling any RSS related cmds,
1227 	 * including assigning an interrupt queue for anything but
1228 	 * slice 0.  It must also be called *after*
1229 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1230 	 * the firmware to compute offsets.
1231 	 */
1232 
1233 	if (sc->num_slices > 1) {
1234 		/* ask the maximum number of slices it supports */
1235 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1236 					   &cmd);
1237 		if (status != 0) {
1238 			device_printf(sc->dev,
1239 				      "failed to get number of slices\n");
1240 			return status;
1241 		}
1242 		/*
1243 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1244 		 * to setting up the interrupt queue DMA
1245 		 */
1246 		cmd.data0 = sc->num_slices;
1247 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1248 #ifdef IFNET_BUF_RING
1249 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1250 #endif
1251 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1252 					   &cmd);
1253 		if (status != 0) {
1254 			device_printf(sc->dev,
1255 				      "failed to set number of slices\n");
1256 			return status;
1257 		}
1258 	}
1259 
1260 
1261 	if (interrupts_setup) {
1262 		/* Now exchange information about interrupts  */
1263 		for (slice = 0; slice < sc->num_slices; slice++) {
1264 			rx_done = &sc->ss[slice].rx_done;
1265 			memset(rx_done->entry, 0, sc->rx_ring_size);
1266 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1267 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1268 			cmd.data2 = slice;
1269 			status |= mxge_send_cmd(sc,
1270 						MXGEFW_CMD_SET_INTRQ_DMA,
1271 						&cmd);
1272 		}
1273 	}
1274 
1275 	status |= mxge_send_cmd(sc,
1276 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1277 
1278 
1279 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1280 
1281 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1282 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1283 
1284 
1285 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1286 				&cmd);
1287 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1288 	if (status != 0) {
1289 		device_printf(sc->dev, "failed set interrupt parameters\n");
1290 		return status;
1291 	}
1292 
1293 
1294 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1295 
1296 
1297 	/* run a DMA benchmark */
1298 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1299 
1300 	for (slice = 0; slice < sc->num_slices; slice++) {
1301 		ss = &sc->ss[slice];
1302 
1303 		ss->irq_claim = irq_claim + (2 * slice);
1304 		/* reset mcp/driver shared state back to 0 */
1305 		ss->rx_done.idx = 0;
1306 		ss->rx_done.cnt = 0;
1307 		ss->tx.req = 0;
1308 		ss->tx.done = 0;
1309 		ss->tx.pkt_done = 0;
1310 		ss->tx.queue_active = 0;
1311 		ss->tx.activate = 0;
1312 		ss->tx.deactivate = 0;
1313 		ss->tx.wake = 0;
1314 		ss->tx.defrag = 0;
1315 		ss->tx.stall = 0;
1316 		ss->rx_big.cnt = 0;
1317 		ss->rx_small.cnt = 0;
1318 		ss->lc.lro_bad_csum = 0;
1319 		ss->lc.lro_queued = 0;
1320 		ss->lc.lro_flushed = 0;
1321 		if (ss->fw_stats != NULL) {
1322 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1323 		}
1324 	}
1325 	sc->rdma_tags_available = 15;
1326 	status = mxge_update_mac_address(sc);
1327 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1328 	mxge_change_pause(sc, sc->pause);
1329 	mxge_set_multicast_list(sc);
1330 	if (sc->throttle) {
1331 		cmd.data0 = sc->throttle;
1332 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1333 				  &cmd)) {
1334 			device_printf(sc->dev,
1335 				      "can't enable throttle\n");
1336 		}
1337 	}
1338 	return status;
1339 }
1340 
1341 static int
1342 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1343 {
1344 	mxge_cmd_t cmd;
1345 	mxge_softc_t *sc;
1346 	int err;
1347 	unsigned int throttle;
1348 
1349 	sc = arg1;
1350 	throttle = sc->throttle;
1351 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1352         if (err != 0) {
1353                 return err;
1354         }
1355 
1356 	if (throttle == sc->throttle)
1357 		return 0;
1358 
1359         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1360                 return EINVAL;
1361 
1362 	mtx_lock(&sc->driver_mtx);
1363 	cmd.data0 = throttle;
1364 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1365 	if (err == 0)
1366 		sc->throttle = throttle;
1367 	mtx_unlock(&sc->driver_mtx);
1368 	return err;
1369 }
1370 
1371 static int
1372 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1373 {
1374         mxge_softc_t *sc;
1375         unsigned int intr_coal_delay;
1376         int err;
1377 
1378         sc = arg1;
1379         intr_coal_delay = sc->intr_coal_delay;
1380         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1381         if (err != 0) {
1382                 return err;
1383         }
1384         if (intr_coal_delay == sc->intr_coal_delay)
1385                 return 0;
1386 
1387         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1388                 return EINVAL;
1389 
1390 	mtx_lock(&sc->driver_mtx);
1391 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1392 	sc->intr_coal_delay = intr_coal_delay;
1393 
1394 	mtx_unlock(&sc->driver_mtx);
1395         return err;
1396 }
1397 
1398 static int
1399 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1400 {
1401         mxge_softc_t *sc;
1402         unsigned int enabled;
1403         int err;
1404 
1405         sc = arg1;
1406         enabled = sc->pause;
1407         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1408         if (err != 0) {
1409                 return err;
1410         }
1411         if (enabled == sc->pause)
1412                 return 0;
1413 
1414 	mtx_lock(&sc->driver_mtx);
1415 	err = mxge_change_pause(sc, enabled);
1416 	mtx_unlock(&sc->driver_mtx);
1417         return err;
1418 }
1419 
1420 static int
1421 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1422 {
1423         int err;
1424 
1425         if (arg1 == NULL)
1426                 return EFAULT;
1427         arg2 = be32toh(*(int *)arg1);
1428         arg1 = NULL;
1429         err = sysctl_handle_int(oidp, arg1, arg2, req);
1430 
1431         return err;
1432 }
1433 
1434 static void
1435 mxge_rem_sysctls(mxge_softc_t *sc)
1436 {
1437 	struct mxge_slice_state *ss;
1438 	int slice;
1439 
1440 	if (sc->slice_sysctl_tree == NULL)
1441 		return;
1442 
1443 	for (slice = 0; slice < sc->num_slices; slice++) {
1444 		ss = &sc->ss[slice];
1445 		if (ss == NULL || ss->sysctl_tree == NULL)
1446 			continue;
1447 		sysctl_ctx_free(&ss->sysctl_ctx);
1448 		ss->sysctl_tree = NULL;
1449 	}
1450 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1451 	sc->slice_sysctl_tree = NULL;
1452 }
1453 
1454 static void
1455 mxge_add_sysctls(mxge_softc_t *sc)
1456 {
1457 	struct sysctl_ctx_list *ctx;
1458 	struct sysctl_oid_list *children;
1459 	mcp_irq_data_t *fw;
1460 	struct mxge_slice_state *ss;
1461 	int slice;
1462 	char slice_num[8];
1463 
1464 	ctx = device_get_sysctl_ctx(sc->dev);
1465 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1466 	fw = sc->ss[0].fw_stats;
1467 
1468 	/* random information */
1469 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1470 		       "firmware_version",
1471 		       CTLFLAG_RD, &sc->fw_version,
1472 		       0, "firmware version");
1473 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1474 		       "serial_number",
1475 		       CTLFLAG_RD, &sc->serial_number_string,
1476 		       0, "serial number");
1477 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1478 		       "product_code",
1479 		       CTLFLAG_RD, &sc->product_code_string,
1480 		       0, "product_code");
1481 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1482 		       "pcie_link_width",
1483 		       CTLFLAG_RD, &sc->link_width,
1484 		       0, "tx_boundary");
1485 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486 		       "tx_boundary",
1487 		       CTLFLAG_RD, &sc->tx_boundary,
1488 		       0, "tx_boundary");
1489 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490 		       "write_combine",
1491 		       CTLFLAG_RD, &sc->wc,
1492 		       0, "write combining PIO?");
1493 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1494 		       "read_dma_MBs",
1495 		       CTLFLAG_RD, &sc->read_dma,
1496 		       0, "DMA Read speed in MB/s");
1497 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1498 		       "write_dma_MBs",
1499 		       CTLFLAG_RD, &sc->write_dma,
1500 		       0, "DMA Write speed in MB/s");
1501 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1502 		       "read_write_dma_MBs",
1503 		       CTLFLAG_RD, &sc->read_write_dma,
1504 		       0, "DMA concurrent Read/Write speed in MB/s");
1505 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1506 		       "watchdog_resets",
1507 		       CTLFLAG_RD, &sc->watchdog_resets,
1508 		       0, "Number of times NIC was reset");
1509 
1510 
1511 	/* performance related tunables */
1512 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513 			"intr_coal_delay",
1514 			CTLTYPE_INT|CTLFLAG_RW, sc,
1515 			0, mxge_change_intr_coal,
1516 			"I", "interrupt coalescing delay in usecs");
1517 
1518 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1519 			"throttle",
1520 			CTLTYPE_INT|CTLFLAG_RW, sc,
1521 			0, mxge_change_throttle,
1522 			"I", "transmit throttling");
1523 
1524 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525 			"flow_control_enabled",
1526 			CTLTYPE_INT|CTLFLAG_RW, sc,
1527 			0, mxge_change_flow_control,
1528 			"I", "interrupt coalescing delay in usecs");
1529 
1530 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 		       "deassert_wait",
1532 		       CTLFLAG_RW, &mxge_deassert_wait,
1533 		       0, "Wait for IRQ line to go low in ihandler");
1534 
1535 	/* stats block from firmware is in network byte order.
1536 	   Need to swap it */
1537 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 			"link_up",
1539 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1540 			0, mxge_handle_be32,
1541 			"I", "link up");
1542 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 			"rdma_tags_available",
1544 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1545 			0, mxge_handle_be32,
1546 			"I", "rdma_tags_available");
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"dropped_bad_crc32",
1549 			CTLTYPE_INT|CTLFLAG_RD,
1550 			&fw->dropped_bad_crc32,
1551 			0, mxge_handle_be32,
1552 			"I", "dropped_bad_crc32");
1553 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 			"dropped_bad_phy",
1555 			CTLTYPE_INT|CTLFLAG_RD,
1556 			&fw->dropped_bad_phy,
1557 			0, mxge_handle_be32,
1558 			"I", "dropped_bad_phy");
1559 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 			"dropped_link_error_or_filtered",
1561 			CTLTYPE_INT|CTLFLAG_RD,
1562 			&fw->dropped_link_error_or_filtered,
1563 			0, mxge_handle_be32,
1564 			"I", "dropped_link_error_or_filtered");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 			"dropped_link_overflow",
1567 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1568 			0, mxge_handle_be32,
1569 			"I", "dropped_link_overflow");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 			"dropped_multicast_filtered",
1572 			CTLTYPE_INT|CTLFLAG_RD,
1573 			&fw->dropped_multicast_filtered,
1574 			0, mxge_handle_be32,
1575 			"I", "dropped_multicast_filtered");
1576 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 			"dropped_no_big_buffer",
1578 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1579 			0, mxge_handle_be32,
1580 			"I", "dropped_no_big_buffer");
1581 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 			"dropped_no_small_buffer",
1583 			CTLTYPE_INT|CTLFLAG_RD,
1584 			&fw->dropped_no_small_buffer,
1585 			0, mxge_handle_be32,
1586 			"I", "dropped_no_small_buffer");
1587 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 			"dropped_overrun",
1589 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1590 			0, mxge_handle_be32,
1591 			"I", "dropped_overrun");
1592 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593 			"dropped_pause",
1594 			CTLTYPE_INT|CTLFLAG_RD,
1595 			&fw->dropped_pause,
1596 			0, mxge_handle_be32,
1597 			"I", "dropped_pause");
1598 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599 			"dropped_runt",
1600 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1601 			0, mxge_handle_be32,
1602 			"I", "dropped_runt");
1603 
1604 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 			"dropped_unicast_filtered",
1606 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1607 			0, mxge_handle_be32,
1608 			"I", "dropped_unicast_filtered");
1609 
1610 	/* verbose printing? */
1611 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612 		       "verbose",
1613 		       CTLFLAG_RW, &mxge_verbose,
1614 		       0, "verbose printing");
1615 
1616 	/* add counters exported for debugging from all slices */
1617 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1618 	sc->slice_sysctl_tree =
1619 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1620 				"slice", CTLFLAG_RD, 0, "");
1621 
1622 	for (slice = 0; slice < sc->num_slices; slice++) {
1623 		ss = &sc->ss[slice];
1624 		sysctl_ctx_init(&ss->sysctl_ctx);
1625 		ctx = &ss->sysctl_ctx;
1626 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1627 		sprintf(slice_num, "%d", slice);
1628 		ss->sysctl_tree =
1629 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1630 					CTLFLAG_RD, 0, "");
1631 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1632 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 			       "rx_small_cnt",
1634 			       CTLFLAG_RD, &ss->rx_small.cnt,
1635 			       0, "rx_small_cnt");
1636 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 			       "rx_big_cnt",
1638 			       CTLFLAG_RD, &ss->rx_big.cnt,
1639 			       0, "rx_small_cnt");
1640 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1641 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1642 			       0, "number of lro merge queues flushed");
1643 
1644 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1645 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1646 			       0, "number of bad csums preventing LRO");
1647 
1648 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1649 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1650 			       0, "number of frames appended to lro merge"
1651 			       "queues");
1652 
1653 #ifndef IFNET_BUF_RING
1654 		/* only transmit from slice 0 for now */
1655 		if (slice > 0)
1656 			continue;
1657 #endif
1658 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 			       "tx_req",
1660 			       CTLFLAG_RD, &ss->tx.req,
1661 			       0, "tx_req");
1662 
1663 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 			       "tx_done",
1665 			       CTLFLAG_RD, &ss->tx.done,
1666 			       0, "tx_done");
1667 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 			       "tx_pkt_done",
1669 			       CTLFLAG_RD, &ss->tx.pkt_done,
1670 			       0, "tx_done");
1671 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 			       "tx_stall",
1673 			       CTLFLAG_RD, &ss->tx.stall,
1674 			       0, "tx_stall");
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "tx_wake",
1677 			       CTLFLAG_RD, &ss->tx.wake,
1678 			       0, "tx_wake");
1679 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680 			       "tx_defrag",
1681 			       CTLFLAG_RD, &ss->tx.defrag,
1682 			       0, "tx_defrag");
1683 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 			       "tx_queue_active",
1685 			       CTLFLAG_RD, &ss->tx.queue_active,
1686 			       0, "tx_queue_active");
1687 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688 			       "tx_activate",
1689 			       CTLFLAG_RD, &ss->tx.activate,
1690 			       0, "tx_activate");
1691 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692 			       "tx_deactivate",
1693 			       CTLFLAG_RD, &ss->tx.deactivate,
1694 			       0, "tx_deactivate");
1695 	}
1696 }
1697 
1698 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1699    backwards one at a time and handle ring wraps */
1700 
1701 static inline void
1702 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1703 			    mcp_kreq_ether_send_t *src, int cnt)
1704 {
1705         int idx, starting_slot;
1706         starting_slot = tx->req;
1707         while (cnt > 1) {
1708                 cnt--;
1709                 idx = (starting_slot + cnt) & tx->mask;
1710                 mxge_pio_copy(&tx->lanai[idx],
1711 			      &src[cnt], sizeof(*src));
1712                 wmb();
1713         }
1714 }
1715 
1716 /*
1717  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1718  * at most 32 bytes at a time, so as to avoid involving the software
1719  * pio handler in the nic.   We re-write the first segment's flags
1720  * to mark them valid only after writing the entire chain
1721  */
1722 
static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
	/*
	 * tx:  transmit ring; requests are written to tx->lanai[] starting
	 *      at slot (tx->req & tx->mask), and tx->req is advanced by cnt.
	 * src: array of cnt requests.  src[0].flags is cleared for the
	 *      duration of the copy and restored before returning.
	 * cnt: number of requests in src.
	 */
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* stash the first request's flags and copy it with flags == 0 */
	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy the requests two at a time */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the one request still outstanding: the first
                   request after the backwards copy (srcp/dstp were not
                   advanced), or the trailing odd request after the
                   pairwise copy */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
	/* word index 3 is the fourth 32-bit word of the first request */
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        wmb();
}
1771 
1772 static int
1773 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1774     struct mxge_pkt_info *pi)
1775 {
1776 	struct ether_vlan_header *eh;
1777 	uint16_t etype;
1778 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1779 #if IFCAP_TSO6 && defined(INET6)
1780 	int nxt;
1781 #endif
1782 
1783 	eh = mtod(m, struct ether_vlan_header *);
1784 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1785 		etype = ntohs(eh->evl_proto);
1786 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1787 	} else {
1788 		etype = ntohs(eh->evl_encap_proto);
1789 		pi->ip_off = ETHER_HDR_LEN;
1790 	}
1791 
1792 	switch (etype) {
1793 	case ETHERTYPE_IP:
1794 		/*
1795 		 * ensure ip header is in first mbuf, copy it to a
1796 		 * scratch buffer if not
1797 		 */
1798 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1799 		pi->ip6 = NULL;
1800 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1801 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1802 			    ss->scratch);
1803 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1804 		}
1805 		pi->ip_hlen = pi->ip->ip_hl << 2;
1806 		if (!tso)
1807 			return 0;
1808 
1809 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1810 		    sizeof(struct tcphdr))) {
1811 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1812 			    sizeof(struct tcphdr), ss->scratch);
1813 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1814 		}
1815 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1816 		break;
1817 #if IFCAP_TSO6 && defined(INET6)
1818 	case ETHERTYPE_IPV6:
1819 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1820 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1821 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1822 			    ss->scratch);
1823 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1824 		}
1825 		nxt = 0;
1826 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1827 		pi->ip_hlen -= pi->ip_off;
1828 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1829 			return EINVAL;
1830 
1831 		if (!tso)
1832 			return 0;
1833 
1834 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1835 			return EINVAL;
1836 
1837 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1838 		    sizeof(struct tcphdr))) {
1839 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1840 			    sizeof(struct tcphdr), ss->scratch);
1841 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1842 		}
1843 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1844 		break;
1845 #endif
1846 	default:
1847 		return EINVAL;
1848 	}
1849 	return 0;
1850 }
1851 
1852 #if IFCAP_TSO4
1853 
/*
 * Build and submit the chain of send requests for one TSO packet.
 *
 * ss:             slice whose tx ring (ss->tx) is used.
 * m:              the packet; it is freed here only on the drop path.
 * busdma_seg_cnt: number of valid entries in tx->seg_list.
 * pi:             header layout of the packet (ip_off, ip_hlen, tcp and
 *                 ip/ip6 are read).
 *
 * Every busdma segment is turned into one or more requests in
 * tx->req_list; a segment that contains the end of the headers is cut
 * at that point.  The finished chain is handed to mxge_submit_req().
 * If more than tx->max_desc requests would be needed, the DMA map is
 * unloaded, the mbuf is freed, ss->oerrors is incremented and a
 * diagnostic is printed the first time this happens.
 */
static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;	/* limits the drop diagnostic to one print */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	/* i.e. minus the combined length of all headers up to the payload */
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));

	/* TSO implies checksum offload on this hardware */
	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
		/*
		 * If packet has full TCP csum, replace it with pseudo hdr
		 * sum that the NIC expects, otherwise the NIC will emit
		 * packets with bad TCP checksums.
		 */
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (pi->ip6) {
#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
			sum = in6_cksum_pseudo(pi->ip6,
			    m->m_pkthdr.len - cksum_offset,
			    IPPROTO_TCP, 0);
#endif
		} else {
#ifdef INET
			m->m_pkthdr.csum_flags |= CSUM_TCP;
			sum = in_pseudo(pi->ip->ip_src.s_addr,
			    pi->ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
				    cksum_offset)));
#endif
		}
		/* store the pseudo header sum in the packet's th_sum field */
		m_copyback(m, offsetof(struct tcphdr, th_sum) +
		    cksum_offset, sizeof(sum), (caddr_t)&sum);
	}
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	if (pi->ip6) {
		/*
		 * for IPv6 TSO, the "checksum offset" is re-purposed
		 * to store the TCP header len
		 */
		cksum_offset = (pi->tcp->th_off << 2);
	}

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* back-fill the count in the request that opened
			   the current run */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* -(1) is all ones, so the OR forces
				   rdma_count to -1 when either condition
				   holds and leaves it alone otherwise */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				/* cut this piece at the end of the headers */
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* for IPv4, reduce the checksum offset by the bytes
			   just consumed (for IPv6 it holds the TCP header
			   length and is left alone) */
			if (cksum_offset != 0 && !pi->ip6) {
				if (__predict_false(cksum_offset > seglen))
					cksum_offset -= seglen;
				else
					cksum_offset = 0;
			}
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* back-fill the count for the final run of requests */
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards marking requests with TSO_LAST, stopping after
	   the first one that carries TSO_CHOP or FIRST */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	/* set the flag on the ring slot of the packet's last request */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	/* too many descriptors: release the DMA mapping and the mbuf */
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	ss->oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		/* the middle value is a byte distance, not a segment index */
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}
2037 
2038 #endif /* IFCAP_TSO4 */
2039 
2040 #ifdef MXGE_NEW_VLAN_API
2041 /*
2042  * We reproduce the software vlan tag insertion from
2043  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2044  * vlan tag insertion. We need to advertise this in order to have the
2045  * vlan interface respect our csum offload flags.
2046  */
2047 static struct mbuf *
2048 mxge_vlan_tag_insert(struct mbuf *m)
2049 {
2050 	struct ether_vlan_header *evl;
2051 
2052 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2053 	if (__predict_false(m == NULL))
2054 		return NULL;
2055 	if (m->m_len < sizeof(*evl)) {
2056 		m = m_pullup(m, sizeof(*evl));
2057 		if (__predict_false(m == NULL))
2058 			return NULL;
2059 	}
2060 	/*
2061 	 * Transform the Ethernet header into an Ethernet header
2062 	 * with 802.1Q encapsulation.
2063 	 */
2064 	evl = mtod(m, struct ether_vlan_header *);
2065 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2066 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2067 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2068 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2069 	m->m_flags &= ~M_VLANTAG;
2070 	return m;
2071 }
2072 #endif /* MXGE_NEW_VLAN_API */
2073 
/*
 * Hand one outgoing frame to the NIC on slice "ss".
 *
 * The mbuf chain is (optionally) vlan-tagged in software, DMA-mapped
 * into tx->seg_list, converted into a list of send requests in
 * tx->req_list and pushed out with mxge_submit_req().  Frames marked
 * CSUM_TSO are mapped here but encoded by mxge_encap_tso().
 *
 * On any failure the frame is dropped: the mbuf is freed (unless it
 * is already gone) and ss->oerrors is incremented.  Nothing is
 * returned to the caller.
 *
 * NOTE(review): "ifp" is assigned below but not otherwise referenced
 * in this function.
 */
2074 static void
2075 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2076 {
2077 	struct mxge_pkt_info pi = {0,0,0,0};
2078 	mxge_softc_t *sc;
2079 	mcp_kreq_ether_send_t *req;
2080 	bus_dma_segment_t *seg;
2081 	struct mbuf *m_tmp;
2082 	struct ifnet *ifp;
2083 	mxge_tx_ring_t *tx;
2084 	int cnt, cum_len, err, i, idx, odd_flag;
2085 	uint16_t pseudo_hdr_offset;
2086         uint8_t flags, cksum_offset;
2087 
2088 
2089 	sc = ss->sc;
2090 	ifp = sc->ifp;
2091 	tx = &ss->tx;
2092 
2093 #ifdef MXGE_NEW_VLAN_API
2094 	if (m->m_flags & M_VLANTAG) {
2095 		m = mxge_vlan_tag_insert(m);
		/* m is NULL here, so skip the m_freem() at "drop" */
2096 		if (__predict_false(m == NULL))
2097 			goto drop_without_m;
2098 	}
2099 #endif
	/* header offsets in "pi" are only needed for TSO / csum offload */
2100 	if (m->m_pkthdr.csum_flags &
2101 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2102 		if (mxge_parse_tx(ss, m, &pi))
2103 			goto drop;
2104 	}
2105 
2106 	/* (try to) map the frame for DMA */
2107 	idx = tx->req & tx->mask;
2108 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2109 				      m, tx->seg_list, &cnt,
2110 				      BUS_DMA_NOWAIT);
2111 	if (__predict_false(err == EFBIG)) {
2112 		/* Too many segments in the chain.  Try
2113 		   to defrag */
2114 		m_tmp = m_defrag(m, M_NOWAIT);
		/* on failure "m" is still ours and is freed at "drop" */
2115 		if (m_tmp == NULL) {
2116 			goto drop;
2117 		}
2118 		ss->tx.defrag++;
2119 		m = m_tmp;
2120 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2121 					      tx->info[idx].map,
2122 					      m, tx->seg_list, &cnt,
2123 					      BUS_DMA_NOWAIT);
2124 	}
2125 	if (__predict_false(err != 0)) {
2126 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2127 			      " packet len = %d\n", err, m->m_pkthdr.len);
2128 		goto drop;
2129 	}
2130 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2131 			BUS_DMASYNC_PREWRITE);
	/* remember the mbuf on the first slot so mxge_tx_done() can free it */
2132 	tx->info[idx].m = m;
2133 
2134 #if IFCAP_TSO4
2135 	/* TSO is different enough, we handle it in another routine */
2136 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2137 		mxge_encap_tso(ss, m, cnt, &pi);
2138 		return;
2139 	}
2140 #endif
2141 
2142 	req = tx->req_list;
2143 	cksum_offset = 0;
2144 	pseudo_hdr_offset = 0;
2145 	flags = MXGEFW_FLAGS_NO_TSO;
2146 
2147 	/* checksum offloading? */
2148 	if (m->m_pkthdr.csum_flags &
2149 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2150 		/* ensure ip header is in first mbuf, copy
2151 		   it to a scratch buffer if not */
		/*
		 * NOTE(review): nothing in this branch copies a header;
		 * the offsets used here come from "pi", which was passed
		 * to mxge_parse_tx() above.
		 */
2152 		cksum_offset = pi.ip_off + pi.ip_hlen;
2153 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2154 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2155 		req->cksum_offset = cksum_offset;
2156 		flags |= MXGEFW_FLAGS_CKSUM;
2157 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2158 	} else {
2159 		odd_flag = 0;
2160 	}
2161 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2162 		flags |= MXGEFW_FLAGS_SMALL;
2163 
2164 	/* convert segments into a request list */
2165 	cum_len = 0;
2166 	seg = tx->seg_list;
	/* only the first request carries FIRST; later ones start from 0 */
2167 	req->flags = MXGEFW_FLAGS_FIRST;
2168 	for (i = 0; i < cnt; i++) {
2169 		req->addr_low =
2170 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2171 		req->addr_high =
2172 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2173 		req->length = htobe16(seg->ds_len);
2174 		req->cksum_offset = cksum_offset;
		/* offset is relative to this segment; consume it as we go */
2175 		if (cksum_offset > seg->ds_len)
2176 			cksum_offset -= seg->ds_len;
2177 		else
2178 			cksum_offset = 0;
2179 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2180 		req->pad = 0; /* complete solid 16-byte block */
2181 		req->rdma_count = 1;
		/* ALIGN_ODD is set when this segment starts at an odd offset */
2182 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2183 		cum_len += seg->ds_len;
2184 		seg++;
2185 		req++;
		/* pre-clear the next request so the "|=" above starts clean */
2186 		req->flags = 0;
2187 	}
2188 	req--;
2189 	/* pad runts to 60 bytes */
2190 	if (cum_len < 60) {
		/* extra request pointing at the driver's zero-filled DMA pad */
2191 		req++;
2192 		req->addr_low =
2193 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2194 		req->addr_high =
2195 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2196 		req->length = htobe16(60 - cum_len);
2197 		req->cksum_offset = 0;
2198 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2199 		req->pad = 0; /* complete solid 16-byte block */
2200 		req->rdma_count = 1;
2201 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2202 		cnt++;
2203 	}
2204 
	/* the first request advertises the total number of requests */
2205 	tx->req_list[0].rdma_count = cnt;
2206 #if 0
2207 	/* print what the firmware will see */
2208 	for (i = 0; i < cnt; i++) {
2209 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2210 		    "cso:%d, flags:0x%x, rdma:%d\n",
2211 		    i, (int)ntohl(tx->req_list[i].addr_high),
2212 		    (int)ntohl(tx->req_list[i].addr_low),
2213 		    (int)ntohs(tx->req_list[i].length),
2214 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2215 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2216 		    tx->req_list[i].rdma_count);
2217 	}
2218 	printf("--------------\n");
2219 #endif
	/* flag the last slot of this packet; mxge_tx_done() counts packets by it */
2220 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2221 	mxge_submit_req(tx, tx->req_list, cnt);
2222 #ifdef IFNET_BUF_RING
2223 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2224 		/* tell the NIC to start polling this slice */
2225 		*tx->send_go = 1;
2226 		tx->queue_active = 1;
2227 		tx->activate++;
2228 		wmb();
2229 	}
2230 #endif
2231 	return;
2232 
2233 drop:
2234 	m_freem(m);
2235 drop_without_m:
2236 	ss->oerrors++;
2237 	return;
2238 }
2239 
2240 #ifdef IFNET_BUF_RING
2241 static void
2242 mxge_qflush(struct ifnet *ifp)
2243 {
2244 	mxge_softc_t *sc = ifp->if_softc;
2245 	mxge_tx_ring_t *tx;
2246 	struct mbuf *m;
2247 	int slice;
2248 
2249 	for (slice = 0; slice < sc->num_slices; slice++) {
2250 		tx = &sc->ss[slice].tx;
2251 		mtx_lock(&tx->mtx);
2252 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253 			m_freem(m);
2254 		mtx_unlock(&tx->mtx);
2255 	}
2256 	if_qflush(ifp);
2257 }
2258 
2259 static inline void
2260 mxge_start_locked(struct mxge_slice_state *ss)
2261 {
2262 	mxge_softc_t *sc;
2263 	struct mbuf *m;
2264 	struct ifnet *ifp;
2265 	mxge_tx_ring_t *tx;
2266 
2267 	sc = ss->sc;
2268 	ifp = sc->ifp;
2269 	tx = &ss->tx;
2270 
2271 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272 		m = drbr_dequeue(ifp, tx->br);
2273 		if (m == NULL) {
2274 			return;
2275 		}
2276 		/* let BPF see it */
2277 		BPF_MTAP(ifp, m);
2278 
2279 		/* give it to the nic */
2280 		mxge_encap(ss, m);
2281 	}
2282 	/* ran out of transmit slots */
2283 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284 	    && (!drbr_empty(ifp, tx->br))) {
2285 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286 		tx->stall++;
2287 	}
2288 }
2289 
2290 static int
2291 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292 {
2293 	mxge_softc_t *sc;
2294 	struct ifnet *ifp;
2295 	mxge_tx_ring_t *tx;
2296 	int err;
2297 
2298 	sc = ss->sc;
2299 	ifp = sc->ifp;
2300 	tx = &ss->tx;
2301 
2302 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303 	    IFF_DRV_RUNNING) {
2304 		err = drbr_enqueue(ifp, tx->br, m);
2305 		return (err);
2306 	}
2307 
2308 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2309 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310 		/* let BPF see it */
2311 		BPF_MTAP(ifp, m);
2312 		/* give it to the nic */
2313 		mxge_encap(ss, m);
2314 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315 		return (err);
2316 	}
2317 	if (!drbr_empty(ifp, tx->br))
2318 		mxge_start_locked(ss);
2319 	return (0);
2320 }
2321 
2322 static int
2323 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324 {
2325 	mxge_softc_t *sc = ifp->if_softc;
2326 	struct mxge_slice_state *ss;
2327 	mxge_tx_ring_t *tx;
2328 	int err = 0;
2329 	int slice;
2330 
2331 	slice = m->m_pkthdr.flowid;
2332 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333 
2334 	ss = &sc->ss[slice];
2335 	tx = &ss->tx;
2336 
2337 	if (mtx_trylock(&tx->mtx)) {
2338 		err = mxge_transmit_locked(ss, m);
2339 		mtx_unlock(&tx->mtx);
2340 	} else {
2341 		err = drbr_enqueue(ifp, tx->br, m);
2342 	}
2343 
2344 	return (err);
2345 }
2346 
2347 #else
2348 
2349 static inline void
2350 mxge_start_locked(struct mxge_slice_state *ss)
2351 {
2352 	mxge_softc_t *sc;
2353 	struct mbuf *m;
2354 	struct ifnet *ifp;
2355 	mxge_tx_ring_t *tx;
2356 
2357 	sc = ss->sc;
2358 	ifp = sc->ifp;
2359 	tx = &ss->tx;
2360 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362 		if (m == NULL) {
2363 			return;
2364 		}
2365 		/* let BPF see it */
2366 		BPF_MTAP(ifp, m);
2367 
2368 		/* give it to the nic */
2369 		mxge_encap(ss, m);
2370 	}
2371 	/* ran out of transmit slots */
2372 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374 		tx->stall++;
2375 	}
2376 }
2377 #endif
2378 static void
2379 mxge_start(struct ifnet *ifp)
2380 {
2381 	mxge_softc_t *sc = ifp->if_softc;
2382 	struct mxge_slice_state *ss;
2383 
2384 	/* only use the first slice for now */
2385 	ss = &sc->ss[0];
2386 	mtx_lock(&ss->tx.mtx);
2387 	mxge_start_locked(ss);
2388 	mtx_unlock(&ss->tx.mtx);
2389 }
2390 
2391 /*
2392  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2393  * at most 32 bytes at a time, so as to avoid involving the software
2394  * pio handler in the nic.   We re-write the first segment's low
2395  * DMA address to mark it valid only after we write the entire chunk
2396  * in a burst
2397  */
/*
 * dst: destination array of 8 receive descriptors (callers pass a
 *      pointer into rx->lanai).
 * src: host-side copy of the same 8 descriptors (callers pass a pointer
 *      into rx->shadow); src[0].addr_low is temporarily overwritten and
 *      restored before returning.
 */
2398 static inline void
2399 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2400 		mcp_kreq_ether_recv_t *src)
2401 {
2402 	uint32_t low;
2403 
	/* stash the real address and publish a placeholder in its place */
2404 	low = src->addr_low;
2405 	src->addr_low = 0xffffffff;
	/* two copies of 4 descriptors each, with a barrier after each */
2406 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2407 	wmb();
2408 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2409 	wmb();
	/* everything is written: restore the shadow and validate the chunk */
2410 	src->addr_low = low;
2411 	dst->addr_low = low;
2412 	wmb();
2413 }
2414 
2415 static int
2416 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2417 {
2418 	bus_dma_segment_t seg;
2419 	struct mbuf *m;
2420 	mxge_rx_ring_t *rx = &ss->rx_small;
2421 	int cnt, err;
2422 
2423 	m = m_gethdr(M_NOWAIT, MT_DATA);
2424 	if (m == NULL) {
2425 		rx->alloc_fail++;
2426 		err = ENOBUFS;
2427 		goto done;
2428 	}
2429 	m->m_len = MHLEN;
2430 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2431 				      &seg, &cnt, BUS_DMA_NOWAIT);
2432 	if (err != 0) {
2433 		m_free(m);
2434 		goto done;
2435 	}
2436 	rx->info[idx].m = m;
2437 	rx->shadow[idx].addr_low =
2438 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2439 	rx->shadow[idx].addr_high =
2440 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2441 
2442 done:
2443 	if ((idx & 7) == 7)
2444 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2445 	return err;
2446 }
2447 
2448 static int
2449 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2450 {
2451 	bus_dma_segment_t seg[3];
2452 	struct mbuf *m;
2453 	mxge_rx_ring_t *rx = &ss->rx_big;
2454 	int cnt, err, i;
2455 
2456 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2457 	if (m == NULL) {
2458 		rx->alloc_fail++;
2459 		err = ENOBUFS;
2460 		goto done;
2461 	}
2462 	m->m_len = rx->mlen;
2463 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2464 				      seg, &cnt, BUS_DMA_NOWAIT);
2465 	if (err != 0) {
2466 		m_free(m);
2467 		goto done;
2468 	}
2469 	rx->info[idx].m = m;
2470 	rx->shadow[idx].addr_low =
2471 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2472 	rx->shadow[idx].addr_high =
2473 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2474 
2475 #if MXGE_VIRT_JUMBOS
2476 	for (i = 1; i < cnt; i++) {
2477 		rx->shadow[idx + i].addr_low =
2478 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2479 		rx->shadow[idx + i].addr_high =
2480 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2481        }
2482 #endif
2483 
2484 done:
2485        for (i = 0; i < rx->nbufs; i++) {
2486 		if ((idx & 7) == 7) {
2487 			mxge_submit_8rx(&rx->lanai[idx - 7],
2488 					&rx->shadow[idx - 7]);
2489 		}
2490 		idx++;
2491 	}
2492 	return err;
2493 }
2494 
2495 #ifdef INET6
2496 
/*
 * 16-bit one's complement sum of "len" bytes starting at "raw".
 *
 * The data is consumed one 16-bit word at a time, so an odd "len" is
 * effectively rounded up to the next even value.  The words are added
 * as stored (no byte swapping) and the 32-bit accumulator is folded
 * twice so the carries end up in the low 16 bits.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t sum = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 2)
		sum += *raw++;

	/* fold the carries back in; a second pass catches a new carry */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}
2513 
2514 static inline uint16_t
2515 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2516 {
2517 	uint32_t partial;
2518 	int nxt, cksum_offset;
2519 	struct ip6_hdr *ip6 = p;
2520 	uint16_t c;
2521 
2522 	nxt = ip6->ip6_nxt;
2523 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2524 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2525 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2526 					   IPPROTO_IPV6, &nxt);
2527 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2528 			return (1);
2529 	}
2530 
2531 	/*
2532 	 * IPv6 headers do not contain a checksum, and hence
2533 	 * do not checksum to zero, so they don't "fall out"
2534 	 * of the partial checksum calculation like IPv4
2535 	 * headers do.  We need to fix the partial checksum by
2536 	 * subtracting the checksum of the IPv6 header.
2537 	 */
2538 
2539 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2540 				    ETHER_HDR_LEN);
2541 	csum += ~partial;
2542 	csum +=	 (csum < ~partial);
2543 	csum = (csum >> 16) + (csum & 0xFFFF);
2544 	csum = (csum >> 16) + (csum & 0xFFFF);
2545 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2546 			     csum);
2547 	c ^= 0xffff;
2548 	return (c);
2549 }
2550 #endif /* INET6 */
2551 /*
2552  *  Myri10GE hardware checksums are not valid if the sender
2553  *  padded the frame with non-zero padding.  This is because
2554  *  the firmware just does a simple 16-bit 1s complement
2555  *  checksum across the entire frame, excluding the first 14
2556  *  bytes.  It is best to simply to check the checksum and
2557  *  tell the stack about it only if the checksum is good
2558  */
2559 
2560 static inline uint16_t
2561 mxge_rx_csum(struct mbuf *m, int csum)
2562 {
2563 	struct ether_header *eh;
2564 #ifdef INET
2565 	struct ip *ip;
2566 #endif
2567 #if defined(INET) || defined(INET6)
2568 	int cap = m->m_pkthdr.rcvif->if_capenable;
2569 #endif
2570 	uint16_t c, etype;
2571 
2572 
2573 	eh = mtod(m, struct ether_header *);
2574 	etype = ntohs(eh->ether_type);
2575 	switch (etype) {
2576 #ifdef INET
2577 	case ETHERTYPE_IP:
2578 		if ((cap & IFCAP_RXCSUM) == 0)
2579 			return (1);
2580 		ip = (struct ip *)(eh + 1);
2581 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2582 			return (1);
2583 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2584 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2585 				    (ip->ip_hl << 2) + ip->ip_p));
2586 		c ^= 0xffff;
2587 		break;
2588 #endif
2589 #ifdef INET6
2590 	case ETHERTYPE_IPV6:
2591 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2592 			return (1);
2593 		c = mxge_rx_csum6((eh + 1), m, csum);
2594 		break;
2595 #endif
2596 	default:
2597 		c = 1;
2598 	}
2599 	return (c);
2600 }
2601 
2602 static void
2603 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2604 {
2605 	struct ether_vlan_header *evl;
2606 	struct ether_header *eh;
2607 	uint32_t partial;
2608 
2609 	evl = mtod(m, struct ether_vlan_header *);
2610 	eh = mtod(m, struct ether_header *);
2611 
2612 	/*
2613 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2614 	 * after what the firmware thought was the end of the ethernet
2615 	 * header.
2616 	 */
2617 
2618 	/* put checksum into host byte order */
2619 	*csum = ntohs(*csum);
2620 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2621 	(*csum) += ~partial;
2622 	(*csum) +=  ((*csum) < ~partial);
2623 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625 
2626 	/* restore checksum to network byte order;
2627 	   later consumers expect this */
2628 	*csum = htons(*csum);
2629 
2630 	/* save the tag */
2631 #ifdef MXGE_NEW_VLAN_API
2632 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2633 #else
2634 	{
2635 		struct m_tag *mtag;
2636 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2637 				   M_NOWAIT);
2638 		if (mtag == NULL)
2639 			return;
2640 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2641 		m_tag_prepend(m, mtag);
2642 	}
2643 
2644 #endif
2645 	m->m_flags |= M_VLANTAG;
2646 
2647 	/*
2648 	 * Remove the 802.1q header by copying the Ethernet
2649 	 * addresses over it and adjusting the beginning of
2650 	 * the data in the mbuf.  The encapsulated Ethernet
2651 	 * type field is already in place.
2652 	 */
2653 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2654 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2655 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2656 }
2657 
2658 
2659 static inline void
2660 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2661 		 uint32_t csum, int lro)
2662 {
2663 	mxge_softc_t *sc;
2664 	struct ifnet *ifp;
2665 	struct mbuf *m;
2666 	struct ether_header *eh;
2667 	mxge_rx_ring_t *rx;
2668 	bus_dmamap_t old_map;
2669 	int idx;
2670 
2671 	sc = ss->sc;
2672 	ifp = sc->ifp;
2673 	rx = &ss->rx_big;
2674 	idx = rx->cnt & rx->mask;
2675 	rx->cnt += rx->nbufs;
2676 	/* save a pointer to the received mbuf */
2677 	m = rx->info[idx].m;
2678 	/* try to replace the received mbuf */
2679 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2680 		/* drop the frame -- the old mbuf is re-cycled */
2681 		ifp->if_ierrors++;
2682 		return;
2683 	}
2684 
2685 	/* unmap the received buffer */
2686 	old_map = rx->info[idx].map;
2687 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2688 	bus_dmamap_unload(rx->dmat, old_map);
2689 
2690 	/* swap the bus_dmamap_t's */
2691 	rx->info[idx].map = rx->extra_map;
2692 	rx->extra_map = old_map;
2693 
2694 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2695 	 * aligned */
2696 	m->m_data += MXGEFW_PAD;
2697 
2698 	m->m_pkthdr.rcvif = ifp;
2699 	m->m_len = m->m_pkthdr.len = len;
2700 	ss->ipackets++;
2701 	eh = mtod(m, struct ether_header *);
2702 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2703 		mxge_vlan_tag_remove(m, &csum);
2704 	}
2705 	/* if the checksum is valid, mark it in the mbuf header */
2706 
2707 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2708 	    (0 == mxge_rx_csum(m, csum))) {
2709 		/* Tell the stack that the  checksum is good */
2710 		m->m_pkthdr.csum_data = 0xffff;
2711 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2712 			CSUM_DATA_VALID;
2713 
2714 #if defined(INET) || defined (INET6)
2715 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2716 			return;
2717 #endif
2718 	}
2719 	/* flowid only valid if RSS hashing is enabled */
2720 	if (sc->num_slices > 1) {
2721 		m->m_pkthdr.flowid = (ss - sc->ss);
2722 		m->m_flags |= M_FLOWID;
2723 	}
2724 	/* pass the frame up the stack */
2725 	(*ifp->if_input)(ifp, m);
2726 }
2727 
2728 static inline void
2729 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2730 		   uint32_t csum, int lro)
2731 {
2732 	mxge_softc_t *sc;
2733 	struct ifnet *ifp;
2734 	struct ether_header *eh;
2735 	struct mbuf *m;
2736 	mxge_rx_ring_t *rx;
2737 	bus_dmamap_t old_map;
2738 	int idx;
2739 
2740 	sc = ss->sc;
2741 	ifp = sc->ifp;
2742 	rx = &ss->rx_small;
2743 	idx = rx->cnt & rx->mask;
2744 	rx->cnt++;
2745 	/* save a pointer to the received mbuf */
2746 	m = rx->info[idx].m;
2747 	/* try to replace the received mbuf */
2748 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2749 		/* drop the frame -- the old mbuf is re-cycled */
2750 		ifp->if_ierrors++;
2751 		return;
2752 	}
2753 
2754 	/* unmap the received buffer */
2755 	old_map = rx->info[idx].map;
2756 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2757 	bus_dmamap_unload(rx->dmat, old_map);
2758 
2759 	/* swap the bus_dmamap_t's */
2760 	rx->info[idx].map = rx->extra_map;
2761 	rx->extra_map = old_map;
2762 
2763 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2764 	 * aligned */
2765 	m->m_data += MXGEFW_PAD;
2766 
2767 	m->m_pkthdr.rcvif = ifp;
2768 	m->m_len = m->m_pkthdr.len = len;
2769 	ss->ipackets++;
2770 	eh = mtod(m, struct ether_header *);
2771 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2772 		mxge_vlan_tag_remove(m, &csum);
2773 	}
2774 	/* if the checksum is valid, mark it in the mbuf header */
2775 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2776 	    (0 == mxge_rx_csum(m, csum))) {
2777 		/* Tell the stack that the  checksum is good */
2778 		m->m_pkthdr.csum_data = 0xffff;
2779 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2780 			CSUM_DATA_VALID;
2781 
2782 #if defined(INET) || defined (INET6)
2783 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2784 			return;
2785 #endif
2786 	}
2787 	/* flowid only valid if RSS hashing is enabled */
2788 	if (sc->num_slices > 1) {
2789 		m->m_pkthdr.flowid = (ss - sc->ss);
2790 		m->m_flags |= M_FLOWID;
2791 	}
2792 	/* pass the frame up the stack */
2793 	(*ifp->if_input)(ifp, m);
2794 }
2795 
2796 static inline void
2797 mxge_clean_rx_done(struct mxge_slice_state *ss)
2798 {
2799 	mxge_rx_done_t *rx_done = &ss->rx_done;
2800 	int limit = 0;
2801 	uint16_t length;
2802 	uint16_t checksum;
2803 	int lro;
2804 
2805 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806 	while (rx_done->entry[rx_done->idx].length != 0) {
2807 		length = ntohs(rx_done->entry[rx_done->idx].length);
2808 		rx_done->entry[rx_done->idx].length = 0;
2809 		checksum = rx_done->entry[rx_done->idx].checksum;
2810 		if (length <= (MHLEN - MXGEFW_PAD))
2811 			mxge_rx_done_small(ss, length, checksum, lro);
2812 		else
2813 			mxge_rx_done_big(ss, length, checksum, lro);
2814 		rx_done->cnt++;
2815 		rx_done->idx = rx_done->cnt & rx_done->mask;
2816 
2817 		/* limit potential for livelock */
2818 		if (__predict_false(++limit > rx_done->mask / 2))
2819 			break;
2820 	}
2821 #if defined(INET)  || defined (INET6)
2822 	while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2823 		struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2824 		SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2825 		tcp_lro_flush(&ss->lc, lro);
2826 	}
2827 #endif
2828 }
2829 
2830 
2831 static inline void
2832 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2833 {
2834 	struct ifnet *ifp;
2835 	mxge_tx_ring_t *tx;
2836 	struct mbuf *m;
2837 	bus_dmamap_t map;
2838 	int idx;
2839 	int *flags;
2840 
2841 	tx = &ss->tx;
2842 	ifp = ss->sc->ifp;
2843 	while (tx->pkt_done != mcp_idx) {
2844 		idx = tx->done & tx->mask;
2845 		tx->done++;
2846 		m = tx->info[idx].m;
2847 		/* mbuf and DMA map only attached to the first
2848 		   segment per-mbuf */
2849 		if (m != NULL) {
2850 			ss->obytes += m->m_pkthdr.len;
2851 			if (m->m_flags & M_MCAST)
2852 				ss->omcasts++;
2853 			ss->opackets++;
2854 			tx->info[idx].m = NULL;
2855 			map = tx->info[idx].map;
2856 			bus_dmamap_unload(tx->dmat, map);
2857 			m_freem(m);
2858 		}
2859 		if (tx->info[idx].flag) {
2860 			tx->info[idx].flag = 0;
2861 			tx->pkt_done++;
2862 		}
2863 	}
2864 
2865 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2866            its OK to send packets */
2867 #ifdef IFNET_BUF_RING
2868 	flags = &ss->if_drv_flags;
2869 #else
2870 	flags = &ifp->if_drv_flags;
2871 #endif
2872 	mtx_lock(&ss->tx.mtx);
2873 	if ((*flags) & IFF_DRV_OACTIVE &&
2874 	    tx->req - tx->done < (tx->mask + 1)/4) {
2875 		*(flags) &= ~IFF_DRV_OACTIVE;
2876 		ss->tx.wake++;
2877 		mxge_start_locked(ss);
2878 	}
2879 #ifdef IFNET_BUF_RING
2880 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2881 		/* let the NIC stop polling this queue, since there
2882 		 * are no more transmits pending */
2883 		if (tx->req == tx->done) {
2884 			*tx->send_stop = 1;
2885 			tx->queue_active = 0;
2886 			tx->deactivate++;
2887 			wmb();
2888 		}
2889 	}
2890 #endif
2891 	mtx_unlock(&ss->tx.mtx);
2892 
2893 }
2894 
2895 static struct mxge_media_type mxge_xfp_media_types[] =
2896 {
2897 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2898 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2899 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2900 	{0,		(1 << 5),	"10GBASE-ER"},
2901 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2902 	{0,		(1 << 3),	"10GBASE-SW"},
2903 	{0,		(1 << 2),	"10GBASE-LW"},
2904 	{0,		(1 << 1),	"10GBASE-EW"},
2905 	{0,		(1 << 0),	"Reserved"}
2906 };
2907 static struct mxge_media_type mxge_sfp_media_types[] =
2908 {
2909 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2910 	{0,		(1 << 7),	"Reserved"},
2911 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2912 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2913 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2914 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2915 };
2916 
2917 static void
2918 mxge_media_set(mxge_softc_t *sc, int media_type)
2919 {
2920 
2921 
2922 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2923 		    0, NULL);
2924 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2925 	sc->current_media = media_type;
2926 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2927 }
2928 
2929 static void
2930 mxge_media_init(mxge_softc_t *sc)
2931 {
2932 	char *ptr;
2933 	int i;
2934 
2935 	ifmedia_removeall(&sc->media);
2936 	mxge_media_set(sc, IFM_AUTO);
2937 
2938 	/*
2939 	 * parse the product code to deterimine the interface type
2940 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2941 	 * after the 3rd dash in the driver's cached copy of the
2942 	 * EEPROM's product code string.
2943 	 */
2944 	ptr = sc->product_code_string;
2945 	if (ptr == NULL) {
2946 		device_printf(sc->dev, "Missing product code\n");
2947 		return;
2948 	}
2949 
2950 	for (i = 0; i < 3; i++, ptr++) {
2951 		ptr = strchr(ptr, '-');
2952 		if (ptr == NULL) {
2953 			device_printf(sc->dev,
2954 				      "only %d dashes in PC?!?\n", i);
2955 			return;
2956 		}
2957 	}
2958 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2959 		/* -C is CX4 */
2960 		sc->connector = MXGE_CX4;
2961 		mxge_media_set(sc, IFM_10G_CX4);
2962 	} else if (*ptr == 'Q') {
2963 		/* -Q is Quad Ribbon Fiber */
2964 		sc->connector = MXGE_QRF;
2965 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2966 		/* FreeBSD has no media type for Quad ribbon fiber */
2967 	} else if (*ptr == 'R') {
2968 		/* -R is XFP */
2969 		sc->connector = MXGE_XFP;
2970 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2971 		/* -S or -2S is SFP+ */
2972 		sc->connector = MXGE_SFP;
2973 	} else {
2974 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2975 	}
2976 }
2977 
2978 /*
2979  * Determine the media type for a NIC.  Some XFPs will identify
2980  * themselves only when their link is up, so this is initiated via a
2981  * link up interrupt.  However, this can potentially take up to
2982  * several milliseconds, so it is run via the watchdog routine, rather
2983  * than in the interrupt handler itself.
2984  */
2985 static void
2986 mxge_media_probe(mxge_softc_t *sc)
2987 {
2988 	mxge_cmd_t cmd;
2989 	char *cage_type;
2990 
2991 	struct mxge_media_type *mxge_media_types = NULL;
2992 	int i, err, ms, mxge_media_type_entries;
2993 	uint32_t byte;
2994 
2995 	sc->need_media_probe = 0;
2996 
2997 	if (sc->connector == MXGE_XFP) {
2998 		/* -R is XFP */
2999 		mxge_media_types = mxge_xfp_media_types;
3000 		mxge_media_type_entries =
3001 			sizeof (mxge_xfp_media_types) /
3002 			sizeof (mxge_xfp_media_types[0]);
3003 		byte = MXGE_XFP_COMPLIANCE_BYTE;
3004 		cage_type = "XFP";
3005 	} else 	if (sc->connector == MXGE_SFP) {
3006 		/* -S or -2S is SFP+ */
3007 		mxge_media_types = mxge_sfp_media_types;
3008 		mxge_media_type_entries =
3009 			sizeof (mxge_sfp_media_types) /
3010 			sizeof (mxge_sfp_media_types[0]);
3011 		cage_type = "SFP+";
3012 		byte = 3;
3013 	} else {
3014 		/* nothing to do; media type cannot change */
3015 		return;
3016 	}
3017 
3018 	/*
3019 	 * At this point we know the NIC has an XFP cage, so now we
3020 	 * try to determine what is in the cage by using the
3021 	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
3022 	 * register.  We read just one byte, which may take over
3023 	 * a millisecond
3024 	 */
3025 
3026 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3027 	cmd.data1 = byte;
3028 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3029 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3030 		device_printf(sc->dev, "failed to read XFP\n");
3031 	}
3032 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3033 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3034 	}
3035 	if (err != MXGEFW_CMD_OK) {
3036 		return;
3037 	}
3038 
3039 	/* now we wait for the data to be cached */
3040 	cmd.data0 = byte;
3041 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3042 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3043 		DELAY(1000);
3044 		cmd.data0 = byte;
3045 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3046 	}
3047 	if (err != MXGEFW_CMD_OK) {
3048 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3049 			      cage_type, err, ms);
3050 		return;
3051 	}
3052 
3053 	if (cmd.data0 == mxge_media_types[0].bitmask) {
3054 		if (mxge_verbose)
3055 			device_printf(sc->dev, "%s:%s\n", cage_type,
3056 				      mxge_media_types[0].name);
3057 		if (sc->current_media != mxge_media_types[0].flag) {
3058 			mxge_media_init(sc);
3059 			mxge_media_set(sc, mxge_media_types[0].flag);
3060 		}
3061 		return;
3062 	}
3063 	for (i = 1; i < mxge_media_type_entries; i++) {
3064 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3065 			if (mxge_verbose)
3066 				device_printf(sc->dev, "%s:%s\n",
3067 					      cage_type,
3068 					      mxge_media_types[i].name);
3069 
3070 			if (sc->current_media != mxge_media_types[i].flag) {
3071 				mxge_media_init(sc);
3072 				mxge_media_set(sc, mxge_media_types[i].flag);
3073 			}
3074 			return;
3075 		}
3076 	}
3077 	if (mxge_verbose)
3078 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3079 			      cage_type, cmd.data0);
3080 
3081 	return;
3082 }
3083 
/*
 * Interrupt handler for one slice ("arg" is the slice state).
 *
 * Processes transmit completions and received frames for the slice
 * until the firmware stats block no longer reads as valid, updates
 * link state from the stats block on the first slice, and finally
 * writes the irq claim words.
 */
3084 static void
3085 mxge_intr(void *arg)
3086 {
3087 	struct mxge_slice_state *ss = arg;
3088 	mxge_softc_t *sc = ss->sc;
3089 	mcp_irq_data_t *stats = ss->fw_stats;
3090 	mxge_tx_ring_t *tx = &ss->tx;
3091 	mxge_rx_done_t *rx_done = &ss->rx_done;
3092 	uint32_t send_done_count;
3093 	uint8_t valid;
3094 
3095 
3096 #ifndef IFNET_BUF_RING
3097 	/* an interrupt on a non-zero slice is implicitly valid
3098 	   since MSI-X irqs are not shared */
3099 	if (ss != sc->ss) {
3100 		mxge_clean_rx_done(ss);
3101 		*ss->irq_claim = be32toh(3);
3102 		return;
3103 	}
3104 #endif
3105 
3106 	/* make sure the DMA has finished */
3107 	if (!stats->valid) {
3108 		return;
3109 	}
	/* keep a copy; stats->valid is cleared below and tested at the end */
3110 	valid = stats->valid;
3111 
3112 	if (sc->legacy_irq) {
3113 		/* lower legacy IRQ  */
3114 		*sc->irq_deassert = 0;
3115 		if (!mxge_deassert_wait)
3116 			/* don't wait for conf. that irq is low */
3117 			stats->valid = 0;
3118 	} else {
3119 		stats->valid = 0;
3120 	}
3121 
3122 	/* loop while waiting for legacy irq deassertion */
	/*
	 * stats->valid was only left set above for a legacy irq with
	 * mxge_deassert_wait; the driver never clears it in that case, so
	 * presumably the NIC does -- confirm against the firmware interface.
	 */
3123 	do {
3124 		/* check for transmit completes and receives */
3125 		send_done_count = be32toh(stats->send_done_count);
3126 		while ((send_done_count != tx->pkt_done) ||
3127 		       (rx_done->entry[rx_done->idx].length != 0)) {
3128 			if (send_done_count != tx->pkt_done)
3129 				mxge_tx_done(ss, (int)send_done_count);
3130 			mxge_clean_rx_done(ss);
			/* re-read: more sends may have completed meanwhile */
3131 			send_done_count = be32toh(stats->send_done_count);
3132 		}
3133 		if (sc->legacy_irq && mxge_deassert_wait)
3134 			wmb();
3135 	} while (*((volatile uint8_t *) &stats->valid));
3136 
3137 	/* fw link & error stats meaningful only on the first slice */
3138 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3139 		if (sc->link_state != stats->link_up) {
3140 			sc->link_state = stats->link_up;
3141 			if (sc->link_state) {
3142 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3143 				if (mxge_verbose)
3144 					device_printf(sc->dev, "link up\n");
3145 			} else {
3146 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3147 				if (mxge_verbose)
3148 					device_printf(sc->dev, "link down\n");
3149 			}
			/* any link change schedules a media re-probe */
3150 			sc->need_media_probe = 1;
3151 		}
3152 		if (sc->rdma_tags_available !=
3153 		    be32toh(stats->rdma_tags_available)) {
3154 			sc->rdma_tags_available =
3155 				be32toh(stats->rdma_tags_available);
3156 			device_printf(sc->dev, "RDMA timed out! %d tags "
3157 				      "left\n", sc->rdma_tags_available);
3158 		}
3159 
3160 		if (stats->link_down) {
3161 			sc->down_cnt += stats->link_down;
3162 			sc->link_state = 0;
3163 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3164 		}
3165 	}
3166 
3167 	/* check to see if we have rx token to pass back */
	/*
	 * Only the first claim word is conditional on bit 0 of "valid";
	 * the write to irq_claim + 1 below always happens.
	 */
3168 	if (valid & 0x1)
3169 	    *ss->irq_claim = be32toh(3);
3170 	*(ss->irq_claim + 1) = be32toh(3);
3171 }
3172 
3173 static void
3174 mxge_init(void *arg)
3175 {
3176 	mxge_softc_t *sc = arg;
3177 	struct ifnet *ifp = sc->ifp;
3178 
3179 
3180 	mtx_lock(&sc->driver_mtx);
3181 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3182 		(void) mxge_open(sc);
3183 	mtx_unlock(&sc->driver_mtx);
3184 }
3185 
3186 
3187 
3188 static void
3189 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3190 {
3191 	int i;
3192 
3193 #if defined(INET) || defined(INET6)
3194 	tcp_lro_free(&ss->lc);
3195 #endif
3196 	for (i = 0; i <= ss->rx_big.mask; i++) {
3197 		if (ss->rx_big.info[i].m == NULL)
3198 			continue;
3199 		bus_dmamap_unload(ss->rx_big.dmat,
3200 				  ss->rx_big.info[i].map);
3201 		m_freem(ss->rx_big.info[i].m);
3202 		ss->rx_big.info[i].m = NULL;
3203 	}
3204 
3205 	for (i = 0; i <= ss->rx_small.mask; i++) {
3206 		if (ss->rx_small.info[i].m == NULL)
3207 			continue;
3208 		bus_dmamap_unload(ss->rx_small.dmat,
3209 				  ss->rx_small.info[i].map);
3210 		m_freem(ss->rx_small.info[i].m);
3211 		ss->rx_small.info[i].m = NULL;
3212 	}
3213 
3214 	/* transmit ring used only on the first slice */
3215 	if (ss->tx.info == NULL)
3216 		return;
3217 
3218 	for (i = 0; i <= ss->tx.mask; i++) {
3219 		ss->tx.info[i].flag = 0;
3220 		if (ss->tx.info[i].m == NULL)
3221 			continue;
3222 		bus_dmamap_unload(ss->tx.dmat,
3223 				  ss->tx.info[i].map);
3224 		m_freem(ss->tx.info[i].m);
3225 		ss->tx.info[i].m = NULL;
3226 	}
3227 }
3228 
3229 static void
3230 mxge_free_mbufs(mxge_softc_t *sc)
3231 {
3232 	int slice;
3233 
3234 	for (slice = 0; slice < sc->num_slices; slice++)
3235 		mxge_free_slice_mbufs(&sc->ss[slice]);
3236 }
3237 
3238 static void
3239 mxge_free_slice_rings(struct mxge_slice_state *ss)
3240 {
3241 	int i;
3242 
3243 
3244 	if (ss->rx_done.entry != NULL)
3245 		mxge_dma_free(&ss->rx_done.dma);
3246 	ss->rx_done.entry = NULL;
3247 
3248 	if (ss->tx.req_bytes != NULL)
3249 		free(ss->tx.req_bytes, M_DEVBUF);
3250 	ss->tx.req_bytes = NULL;
3251 
3252 	if (ss->tx.seg_list != NULL)
3253 		free(ss->tx.seg_list, M_DEVBUF);
3254 	ss->tx.seg_list = NULL;
3255 
3256 	if (ss->rx_small.shadow != NULL)
3257 		free(ss->rx_small.shadow, M_DEVBUF);
3258 	ss->rx_small.shadow = NULL;
3259 
3260 	if (ss->rx_big.shadow != NULL)
3261 		free(ss->rx_big.shadow, M_DEVBUF);
3262 	ss->rx_big.shadow = NULL;
3263 
3264 	if (ss->tx.info != NULL) {
3265 		if (ss->tx.dmat != NULL) {
3266 			for (i = 0; i <= ss->tx.mask; i++) {
3267 				bus_dmamap_destroy(ss->tx.dmat,
3268 						   ss->tx.info[i].map);
3269 			}
3270 			bus_dma_tag_destroy(ss->tx.dmat);
3271 		}
3272 		free(ss->tx.info, M_DEVBUF);
3273 	}
3274 	ss->tx.info = NULL;
3275 
3276 	if (ss->rx_small.info != NULL) {
3277 		if (ss->rx_small.dmat != NULL) {
3278 			for (i = 0; i <= ss->rx_small.mask; i++) {
3279 				bus_dmamap_destroy(ss->rx_small.dmat,
3280 						   ss->rx_small.info[i].map);
3281 			}
3282 			bus_dmamap_destroy(ss->rx_small.dmat,
3283 					   ss->rx_small.extra_map);
3284 			bus_dma_tag_destroy(ss->rx_small.dmat);
3285 		}
3286 		free(ss->rx_small.info, M_DEVBUF);
3287 	}
3288 	ss->rx_small.info = NULL;
3289 
3290 	if (ss->rx_big.info != NULL) {
3291 		if (ss->rx_big.dmat != NULL) {
3292 			for (i = 0; i <= ss->rx_big.mask; i++) {
3293 				bus_dmamap_destroy(ss->rx_big.dmat,
3294 						   ss->rx_big.info[i].map);
3295 			}
3296 			bus_dmamap_destroy(ss->rx_big.dmat,
3297 					   ss->rx_big.extra_map);
3298 			bus_dma_tag_destroy(ss->rx_big.dmat);
3299 		}
3300 		free(ss->rx_big.info, M_DEVBUF);
3301 	}
3302 	ss->rx_big.info = NULL;
3303 }
3304 
3305 static void
3306 mxge_free_rings(mxge_softc_t *sc)
3307 {
3308 	int slice;
3309 
3310 	for (slice = 0; slice < sc->num_slices; slice++)
3311 		mxge_free_slice_rings(&sc->ss[slice]);
3312 }
3313 
/*
 * Allocate the host-side ring state for one slice.
 *
 * For the receive side this sets the ring masks, allocates the zeroed
 * shadow and info arrays for the small and big rings, creates one DMA tag
 * per ring and one DMA map per slot plus a spare "extra" map.  Transmit
 * resources (request copy block, segment list, info array, DMA tag and
 * per-slot maps) are allocated next; when IFNET_BUF_RING is not defined
 * that is done for the first slice only.
 *
 * rx_ring_entries / tx_ring_entries are ring sizes in slots; the masks
 * are computed as entries - 1.
 *
 * Returns 0 on success or the busdma error code.  Nothing is released
 * here on failure; mxge_alloc_rings() calls mxge_free_rings() for that.
 */
static int
mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
		       int tx_ring_entries)
{
	mxge_softc_t *sc = ss->sc;
	size_t bytes;
	int err, i;

	/* allocate per-slice receive resources */

	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	/* the completion ring mask covers twice as many entries as one
	   receive ring */
	ss->rx_done.mask = (2 * rx_ring_entries) - 1;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the rx host info rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the rx busdma resources */
	/* small buffers: a single segment of at most MHLEN bytes */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		return err;
	}

	/* big buffers: with MXGE_VIRT_JUMBOS up to three 4096-byte
	   segments, otherwise one segment of up to MJUM9BYTES */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
#if MXGE_VIRT_JUMBOS
				 4096,			/* boundary */
#else
				 0,			/* boundary */
#endif
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 3*4096,		/* maxsize */
#if MXGE_VIRT_JUMBOS
				 3,			/* num segs */
				 4096,			/* maxsegsize*/
#else
				 1,			/* num segs */
				 MJUM9BYTES,		/* maxsegsize*/
#endif
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		return err;
	}
	/* one map per small-ring slot, plus one spare */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		err = bus_dmamap_create(ss->rx_small.dmat, 0,
					&ss->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_small.dmat, 0,
				&ss->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
		return err;
	}

	/* one map per big-ring slot, plus one spare */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		err = bus_dmamap_create(ss->rx_big.dmat, 0,
					&ss->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_big.dmat, 0,
				&ss->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
		return err;
	}

	/* now allocate TX resources */

#ifndef IFNET_BUF_RING
	/* only use a single TX ring for now */
	if (ss != ss->sc->ss)
		return 0;
#endif

	ss->tx.mask = tx_ring_entries - 1;
	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);


	/* allocate the tx request copy block */
	/* the 8 extra bytes leave room to round req_list up to an
	   8-byte boundary below; note this block is not zeroed */
	bytes = 8 +
		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	/* ensure req_list entries are aligned to 8 bytes */
	ss->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
	ss->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);

	/* allocate the tx host info ring */
	bytes = tx_ring_entries * sizeof (*ss->tx.info);
	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the tx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx_boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 ss->tx.max_desc - 2,	/* num segs */
				 sc->tx_boundary,	/* maxsegsz */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		return err;
	}

	/* now use these tags to setup dmamaps for each slot
	   in the ring */
	for (i = 0; i <= ss->tx.mask; i++) {
		err = bus_dmamap_create(ss->tx.dmat, 0,
					&ss->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
				      err);
			return err;
		}
	}
	return 0;

}
3483 
3484 static int
3485 mxge_alloc_rings(mxge_softc_t *sc)
3486 {
3487 	mxge_cmd_t cmd;
3488 	int tx_ring_size;
3489 	int tx_ring_entries, rx_ring_entries;
3490 	int err, slice;
3491 
3492 	/* get ring sizes */
3493 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3494 	tx_ring_size = cmd.data0;
3495 	if (err != 0) {
3496 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3497 		goto abort;
3498 	}
3499 
3500 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3501 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3502 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3503 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3504 	IFQ_SET_READY(&sc->ifp->if_snd);
3505 
3506 	for (slice = 0; slice < sc->num_slices; slice++) {
3507 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3508 					     rx_ring_entries,
3509 					     tx_ring_entries);
3510 		if (err != 0)
3511 			goto abort;
3512 	}
3513 	return 0;
3514 
3515 abort:
3516 	mxge_free_rings(sc);
3517 	return err;
3518 
3519 }
3520 
3521 
3522 static void
3523 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3524 {
3525 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3526 
3527 	if (bufsize < MCLBYTES) {
3528 		/* easy, everything fits in a single buffer */
3529 		*big_buf_size = MCLBYTES;
3530 		*cl_size = MCLBYTES;
3531 		*nbufs = 1;
3532 		return;
3533 	}
3534 
3535 	if (bufsize < MJUMPAGESIZE) {
3536 		/* still easy, everything still fits in a single buffer */
3537 		*big_buf_size = MJUMPAGESIZE;
3538 		*cl_size = MJUMPAGESIZE;
3539 		*nbufs = 1;
3540 		return;
3541 	}
3542 #if MXGE_VIRT_JUMBOS
3543 	/* now we need to use virtually contiguous buffers */
3544 	*cl_size = MJUM9BYTES;
3545 	*big_buf_size = 4096;
3546 	*nbufs = mtu / 4096 + 1;
3547 	/* needs to be a power of two, so round up */
3548 	if (*nbufs == 3)
3549 		*nbufs = 4;
3550 #else
3551 	*cl_size = MJUM9BYTES;
3552 	*big_buf_size = MJUM9BYTES;
3553 	*nbufs = 1;
3554 #endif
3555 }
3556 
/*
 * Prepare one slice for operation.
 *
 * Initialises the slice's LRO state (when INET/INET6 is configured),
 * issues the MXGEFW_CMD_GET_*_OFFSET commands and turns the returned
 * offsets into pointers into sc->sram for the send ring and the small and
 * big receive rings, then stocks both receive rings with buffers.
 *
 * nbufs and cl_size are stored in ss->rx_big and nbufs is also the stride
 * used when stocking the big ring.
 *
 * Returns 0 on success, EIO if any of the offset commands failed, or
 * ENOMEM if a receive buffer could not be obtained.  Buffers stocked
 * before a failure are not released here; mxge_open() calls
 * mxge_free_mbufs() on its abort path.
 */
static int
mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
{
	mxge_softc_t *sc;
	mxge_cmd_t cmd;
	bus_dmamap_t map;
	int err, i, slice;


	sc = ss->sc;
	/* index of this slice within the sc->ss array */
	slice = ss - sc->ss;

#if defined(INET) || defined(INET6)
	(void)tcp_lro_init(&ss->lc);
#endif
	ss->lc.ifp = sc->ifp;

	/* get the lanai pointers to the send and receive rings */

	err = 0;
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	if (slice == 0) {
#endif
		cmd.data0 = slice;
		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
		ss->tx.lanai =
			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
		/* the go/stop words for successive slices are 64 bytes
		   apart */
		ss->tx.send_go = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING
	}
#endif
	/* errors are OR-ed together and checked once below, so the
	   pointers are computed even if a command failed */
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc,
			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	ss->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		map = ss->rx_small.info[i].map;
		err = mxge_get_buf_small(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, ss->rx_small.mask + 1);
			return ENOMEM;
		}
	}
	/* preset every big shadow entry to all-ones before stocking;
	   only every nbufs-th slot is handed to mxge_get_buf_big() */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}
3637 
/*
 * Bring the NIC up.
 *
 * Steps, in order: copy the interface MAC address into the softc, reset
 * the NIC, program the RSS indirection table when more than one slice is
 * in use, tell the firmware the MTU and buffer sizes, hand it the stats
 * block address(es), open every slice, issue MXGEFW_CMD_ETHERNET_UP and
 * finally mark the interface (and, with IFNET_BUF_RING, every slice)
 * running.
 *
 * Returns 0 on success.  Failures before any buffers are stocked return
 * directly; later failures go through the abort label, which frees the
 * mbufs of all slices.  Callers in this file hold sc->driver_mtx.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, big_bytes, nbufs, slice, cl_size, i;
	bus_addr_t bus;
	volatile uint8_t *itable;
	struct mxge_slice_state *ss;

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	if (sc->num_slices > 1) {
		/* setup the indirection table */
		cmd.data0 = sc->num_slices;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
				    &cmd);

		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
				     &cmd);
		if (err != 0) {
			device_printf(sc->dev,
				      "failed to setup rss tables\n");
			return err;
		}

		/* just enable an identity mapping */
		itable = sc->sram + cmd.data0;
		for (i = 0; i < sc->num_slices; i++)
			itable[i] = (uint8_t)i;

		cmd.data0 = 1;
		cmd.data1 = mxge_rss_hash_type;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "failed to enable slices\n");
			return err;
		}
	}


	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);

	cmd.data0 = nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set alway-use-n to %d\n",
			      nbufs);
		return EIO;
	}
	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	/* err is known to be zero here, so the |= below accumulates
	   only the failures of the stats commands */
	for (slice = 0;
#ifdef IFNET_BUF_RING
	     slice < sc->num_slices;
#else
	     slice < 1;
#endif
	     slice++) {
		ss = &sc->ss[slice];
		cmd.data0 =
			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data1 =
			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
		/* data2 carries the block size in its low bits and the
		   slice number shifted left by 16 */
		cmd.data2 = sizeof(struct mcp_irq_data);
		cmd.data2 |= (slice << 16);
		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
	}

	if (err != 0) {
		/* fall back to the obsolete command, which is given the
		   address of the send_done_count field of the first
		   slice's stats block */
		bus = sc->ss->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
		if (err != 0) {
			device_printf(sc->dev, "couldn't open slice %d\n",
				      slice);
			goto abort;
		}
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags |= IFF_DRV_RUNNING;
		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
	}
#endif
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	/* release any receive buffers the slices managed to stock */
	mxge_free_mbufs(sc);

	return err;
}
3784 
/*
 * Stop the interface and release all ring mbufs.
 *
 * The running flag is cleared on the interface (and, with IFNET_BUF_RING,
 * on every slice).  When "down" is zero the firmware is sent
 * MXGEFW_CMD_ETHERNET_DOWN and the function waits briefly for
 * sc->down_cnt to change, which mxge_intr() does when the stats block
 * reports link_down.  A non-zero "down" skips the command and the wait;
 * the watchdog reset path uses that after the NIC has rebooted.
 *
 * Always returns 0; failures are only logged.
 */
static int
mxge_close(mxge_softc_t *sc, int down)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;
#ifdef IFNET_BUF_RING
	struct mxge_slice_state *ss;
	int slice;
#endif

#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
	}
#endif
	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	if (!down) {
		/* remember the counter so a change can be detected */
		old_down_cnt = sc->down_cnt;
		wmb();
		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
		if (err) {
			device_printf(sc->dev,
				      "Couldn't bring down link\n");
		}
		if (old_down_cnt == sc->down_cnt) {
			/* wait for down irq */
			DELAY(10 * sc->intr_coal_delay);
		}
		wmb();
		if (old_down_cnt == sc->down_cnt) {
			device_printf(sc->dev, "never got down irq\n");
		}
	}
	mxge_free_mbufs(sc);

	return 0;
}
3823 
3824 static void
3825 mxge_setup_cfg_space(mxge_softc_t *sc)
3826 {
3827 	device_t dev = sc->dev;
3828 	int reg;
3829 	uint16_t lnk, pectl;
3830 
3831 	/* find the PCIe link width and set max read request to 4KB*/
3832 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3833 		lnk = pci_read_config(dev, reg + 0x12, 2);
3834 		sc->link_width = (lnk >> 4) & 0x3f;
3835 
3836 		if (sc->pectl == 0) {
3837 			pectl = pci_read_config(dev, reg + 0x8, 2);
3838 			pectl = (pectl & ~0x7000) | (5 << 12);
3839 			pci_write_config(dev, reg + 0x8, pectl, 2);
3840 			sc->pectl = pectl;
3841 		} else {
3842 			/* restore saved pectl after watchdog reset */
3843 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3844 		}
3845 	}
3846 
3847 	/* Enable DMA and Memory space access */
3848 	pci_enable_busmaster(dev);
3849 }
3850 
3851 static uint32_t
3852 mxge_read_reboot(mxge_softc_t *sc)
3853 {
3854 	device_t dev = sc->dev;
3855 	uint32_t vs;
3856 
3857 	/* find the vendor specific offset */
3858 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3859 		device_printf(sc->dev,
3860 			      "could not find vendor specific offset\n");
3861 		return (uint32_t)-1;
3862 	}
3863 	/* enable read32 mode */
3864 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3865 	/* tell NIC which register to read */
3866 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3867 	return (pci_read_config(dev, vs + 0x14, 4));
3868 }
3869 
/*
 * Try to recover the NIC after the watchdog has fired.
 *
 * The PCI command register is read to decide whether the NIC rebooted
 * (bus-master enable bit clear).  If it did, the interface is quiesced
 * when it was running (link marked down, all TX locks taken, mxge_close()
 * called with down=1), PCI config space is restored, the driver's own
 * config-space settings are re-applied, the firmware is reloaded and the
 * interface is reopened.  If the bus-master bit is still set nothing is
 * reset.
 *
 * On success (including the "did not reboot" case) sc->dying is cleared
 * if it was 2 and the periodic mxge_tick() callout is re-armed; on failure
 * only a message is logged and the callout is not re-armed.
 *
 * Called from mxge_watchdog_task() with sc->driver_mtx held.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	struct pci_devinfo *dinfo;
	struct mxge_slice_state *ss;
	int err, running, s, num_tx_slices = 1;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			/*
			 * NOTE(review): execution continues with cmd ==
			 * 0xffff, which has every bit set, so the test
			 * below presumably takes the "did not reboot"
			 * branch -- confirm this is intended.
			 */
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
		if (running) {

			/*
			 * quiesce NIC so that TX routines will not try to
			 * xmit after restoration of BAR
			 */

			/* Mark the link as down */
			if (sc->link_state) {
				sc->link_state = 0;
				if_link_state_change(sc->ifp,
						     LINK_STATE_DOWN);
			}
#ifdef IFNET_BUF_RING
			num_tx_slices = sc->num_slices;
#endif
			/* grab all TX locks to ensure no tx  */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
				mtx_lock(&ss->tx.mtx);
			}
			/* down=1: skip the ETHERNET_DOWN command */
			mxge_close(sc, 1);
		}
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reload f/w */
		err = mxge_load_firmware(sc, 0);
		if (err) {
			device_printf(sc->dev,
				      "Unable to re-load f/w\n");
		}
		if (running) {
			if (!err)
				err = mxge_open(sc);
			/* release all TX locks */
			/* the locks taken above are dropped whether or
			   not the reload/open succeeded */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
#ifdef IFNET_BUF_RING
				mxge_start_locked(ss);
#endif
				mtx_unlock(&ss->tx.mtx);
			}
		}
		sc->watchdog_resets++;
	} else {
		device_printf(sc->dev,
			      "NIC did not reboot, not resetting\n");
		err = 0;
	}
	if (err) {
		device_printf(sc->dev, "watchdog reset failed\n");
	} else {
		/* mxge_tick() sets dying to 2 when it finds the
		   bus-master bit clear while idle */
		if (sc->dying == 2)
			sc->dying = 0;
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
}
3971 
3972 static void
3973 mxge_watchdog_task(void *arg, int pending)
3974 {
3975 	mxge_softc_t *sc = arg;
3976 
3977 
3978 	mtx_lock(&sc->driver_mtx);
3979 	mxge_watchdog_reset(sc);
3980 	mtx_unlock(&sc->driver_mtx);
3981 }
3982 
3983 static void
3984 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3985 {
3986 	tx = &sc->ss[slice].tx;
3987 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3988 	device_printf(sc->dev,
3989 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3990 		      tx->req, tx->done, tx->queue_active);
3991 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3992 			      tx->activate, tx->deactivate);
3993 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3994 		      tx->pkt_done,
3995 		      be32toh(sc->ss->fw_stats->send_done_count));
3996 }
3997 
3998 static int
3999 mxge_watchdog(mxge_softc_t *sc)
4000 {
4001 	mxge_tx_ring_t *tx;
4002 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
4003 	int i, err = 0;
4004 
4005 	/* see if we have outstanding transmits, which
4006 	   have been pending for more than mxge_ticks */
4007 	for (i = 0;
4008 #ifdef IFNET_BUF_RING
4009 	     (i < sc->num_slices) && (err == 0);
4010 #else
4011 	     (i < 1) && (err == 0);
4012 #endif
4013 	     i++) {
4014 		tx = &sc->ss[i].tx;
4015 		if (tx->req != tx->done &&
4016 		    tx->watchdog_req != tx->watchdog_done &&
4017 		    tx->done == tx->watchdog_done) {
4018 			/* check for pause blocking before resetting */
4019 			if (tx->watchdog_rx_pause == rx_pause) {
4020 				mxge_warn_stuck(sc, tx, i);
4021 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4022 				return (ENXIO);
4023 			}
4024 			else
4025 				device_printf(sc->dev, "Flow control blocking "
4026 					      "xmits, check link partner\n");
4027 		}
4028 
4029 		tx->watchdog_req = tx->req;
4030 		tx->watchdog_done = tx->done;
4031 		tx->watchdog_rx_pause = rx_pause;
4032 	}
4033 
4034 	if (sc->need_media_probe)
4035 		mxge_media_probe(sc);
4036 	return (err);
4037 }
4038 
4039 static u_long
4040 mxge_update_stats(mxge_softc_t *sc)
4041 {
4042 	struct mxge_slice_state *ss;
4043 	u_long pkts = 0;
4044 	u_long ipackets = 0;
4045 	u_long opackets = 0;
4046 #ifdef IFNET_BUF_RING
4047 	u_long obytes = 0;
4048 	u_long omcasts = 0;
4049 	u_long odrops = 0;
4050 #endif
4051 	u_long oerrors = 0;
4052 	int slice;
4053 
4054 	for (slice = 0; slice < sc->num_slices; slice++) {
4055 		ss = &sc->ss[slice];
4056 		ipackets += ss->ipackets;
4057 		opackets += ss->opackets;
4058 #ifdef IFNET_BUF_RING
4059 		obytes += ss->obytes;
4060 		omcasts += ss->omcasts;
4061 		odrops += ss->tx.br->br_drops;
4062 #endif
4063 		oerrors += ss->oerrors;
4064 	}
4065 	pkts = (ipackets - sc->ifp->if_ipackets);
4066 	pkts += (opackets - sc->ifp->if_opackets);
4067 	sc->ifp->if_ipackets = ipackets;
4068 	sc->ifp->if_opackets = opackets;
4069 #ifdef IFNET_BUF_RING
4070 	sc->ifp->if_obytes = obytes;
4071 	sc->ifp->if_omcasts = omcasts;
4072 	sc->ifp->if_snd.ifq_drops = odrops;
4073 #endif
4074 	sc->ifp->if_oerrors = oerrors;
4075 	return pkts;
4076 }
4077 
4078 static void
4079 mxge_tick(void *arg)
4080 {
4081 	mxge_softc_t *sc = arg;
4082 	u_long pkts = 0;
4083 	int err = 0;
4084 	int running, ticks;
4085 	uint16_t cmd;
4086 
4087 	ticks = mxge_ticks;
4088 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4089 	if (running) {
4090 		/* aggregate stats from different slices */
4091 		pkts = mxge_update_stats(sc);
4092 		if (!sc->watchdog_countdown) {
4093 			err = mxge_watchdog(sc);
4094 			sc->watchdog_countdown = 4;
4095 		}
4096 		sc->watchdog_countdown--;
4097 	}
4098 	if (pkts == 0) {
4099 		/* ensure NIC did not suffer h/w fault while idle */
4100 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4101 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4102 			sc->dying = 2;
4103 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4104 			err = ENXIO;
4105 		}
4106 		/* look less often if NIC is idle */
4107 		ticks *= 4;
4108 	}
4109 
4110 	if (err == 0)
4111 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4112 
4113 }
4114 
4115 static int
4116 mxge_media_change(struct ifnet *ifp)
4117 {
4118 	return EINVAL;
4119 }
4120 
4121 static int
4122 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4123 {
4124 	struct ifnet *ifp = sc->ifp;
4125 	int real_mtu, old_mtu;
4126 	int err = 0;
4127 
4128 
4129 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4130 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4131 		return EINVAL;
4132 	mtx_lock(&sc->driver_mtx);
4133 	old_mtu = ifp->if_mtu;
4134 	ifp->if_mtu = mtu;
4135 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4136 		mxge_close(sc, 0);
4137 		err = mxge_open(sc);
4138 		if (err != 0) {
4139 			ifp->if_mtu = old_mtu;
4140 			mxge_close(sc, 0);
4141 			(void) mxge_open(sc);
4142 		}
4143 	}
4144 	mtx_unlock(&sc->driver_mtx);
4145 	return err;
4146 }
4147 
4148 static void
4149 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4150 {
4151 	mxge_softc_t *sc = ifp->if_softc;
4152 
4153 
4154 	if (sc == NULL)
4155 		return;
4156 	ifmr->ifm_status = IFM_AVALID;
4157 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4158 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4159 	ifmr->ifm_active |= sc->current_media;
4160 }
4161 
4162 static int
4163 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4164 {
4165 	mxge_softc_t *sc = ifp->if_softc;
4166 	struct ifreq *ifr = (struct ifreq *)data;
4167 	int err, mask;
4168 
4169 	err = 0;
4170 	switch (command) {
4171 	case SIOCSIFADDR:
4172 	case SIOCGIFADDR:
4173 		err = ether_ioctl(ifp, command, data);
4174 		break;
4175 
4176 	case SIOCSIFMTU:
4177 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4178 		break;
4179 
4180 	case SIOCSIFFLAGS:
4181 		mtx_lock(&sc->driver_mtx);
4182 		if (sc->dying) {
4183 			mtx_unlock(&sc->driver_mtx);
4184 			return EINVAL;
4185 		}
4186 		if (ifp->if_flags & IFF_UP) {
4187 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4188 				err = mxge_open(sc);
4189 			} else {
4190 				/* take care of promis can allmulti
4191 				   flag chages */
4192 				mxge_change_promisc(sc,
4193 						    ifp->if_flags & IFF_PROMISC);
4194 				mxge_set_multicast_list(sc);
4195 			}
4196 		} else {
4197 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4198 				mxge_close(sc, 0);
4199 			}
4200 		}
4201 		mtx_unlock(&sc->driver_mtx);
4202 		break;
4203 
4204 	case SIOCADDMULTI:
4205 	case SIOCDELMULTI:
4206 		mtx_lock(&sc->driver_mtx);
4207 		mxge_set_multicast_list(sc);
4208 		mtx_unlock(&sc->driver_mtx);
4209 		break;
4210 
4211 	case SIOCSIFCAP:
4212 		mtx_lock(&sc->driver_mtx);
4213 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4214 		if (mask & IFCAP_TXCSUM) {
4215 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4216 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4217 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4218 			} else {
4219 				ifp->if_capenable |= IFCAP_TXCSUM;
4220 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4221 			}
4222 		} else if (mask & IFCAP_RXCSUM) {
4223 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4224 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4225 			} else {
4226 				ifp->if_capenable |= IFCAP_RXCSUM;
4227 			}
4228 		}
4229 		if (mask & IFCAP_TSO4) {
4230 			if (IFCAP_TSO4 & ifp->if_capenable) {
4231 				ifp->if_capenable &= ~IFCAP_TSO4;
4232 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4233 				ifp->if_capenable |= IFCAP_TSO4;
4234 				ifp->if_hwassist |= CSUM_TSO;
4235 			} else {
4236 				printf("mxge requires tx checksum offload"
4237 				       " be enabled to use TSO\n");
4238 				err = EINVAL;
4239 			}
4240 		}
4241 #if IFCAP_TSO6
4242 		if (mask & IFCAP_TXCSUM_IPV6) {
4243 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4244 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4245 						       | IFCAP_TSO6);
4246 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4247 						      | CSUM_UDP);
4248 			} else {
4249 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4250 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4251 						     | CSUM_UDP_IPV6);
4252 			}
4253 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4254 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4255 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4256 			} else {
4257 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4258 			}
4259 		}
4260 		if (mask & IFCAP_TSO6) {
4261 			if (IFCAP_TSO6 & ifp->if_capenable) {
4262 				ifp->if_capenable &= ~IFCAP_TSO6;
4263 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4264 				ifp->if_capenable |= IFCAP_TSO6;
4265 				ifp->if_hwassist |= CSUM_TSO;
4266 			} else {
4267 				printf("mxge requires tx checksum offload"
4268 				       " be enabled to use TSO\n");
4269 				err = EINVAL;
4270 			}
4271 		}
4272 #endif /*IFCAP_TSO6 */
4273 
4274 		if (mask & IFCAP_LRO)
4275 			ifp->if_capenable ^= IFCAP_LRO;
4276 		if (mask & IFCAP_VLAN_HWTAGGING)
4277 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4278 		if (mask & IFCAP_VLAN_HWTSO)
4279 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4280 
4281 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4282 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4283 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4284 
4285 		mtx_unlock(&sc->driver_mtx);
4286 		VLAN_CAPABILITIES(ifp);
4287 
4288 		break;
4289 
4290 	case SIOCGIFMEDIA:
4291 		mtx_lock(&sc->driver_mtx);
4292 		mxge_media_probe(sc);
4293 		mtx_unlock(&sc->driver_mtx);
4294 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4295 				    &sc->media, command);
4296                 break;
4297 
4298 	default:
4299 		err = ENOTTY;
4300         }
4301 	return err;
4302 }
4303 
4304 static void
4305 mxge_fetch_tunables(mxge_softc_t *sc)
4306 {
4307 
4308 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4309 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4310 			  &mxge_flow_control);
4311 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4312 			  &mxge_intr_coal_delay);
4313 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4314 			  &mxge_nvidia_ecrc_enable);
4315 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4316 			  &mxge_force_firmware);
4317 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4318 			  &mxge_deassert_wait);
4319 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4320 			  &mxge_verbose);
4321 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4322 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4323 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4324 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4325 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4326 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4327 
4328 	if (bootverbose)
4329 		mxge_verbose = 1;
4330 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4331 		mxge_intr_coal_delay = 30;
4332 	if (mxge_ticks == 0)
4333 		mxge_ticks = hz / 2;
4334 	sc->pause = mxge_flow_control;
4335 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4336 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4337 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4338 	}
4339 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4340 	    mxge_initial_mtu < ETHER_MIN_LEN)
4341 		mxge_initial_mtu = ETHERMTU_JUMBO;
4342 
4343 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4344 		mxge_throttle = MXGE_MAX_THROTTLE;
4345 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4346 		mxge_throttle = MXGE_MIN_THROTTLE;
4347 	sc->throttle = mxge_throttle;
4348 }
4349 
4350 
4351 static void
4352 mxge_free_slices(mxge_softc_t *sc)
4353 {
4354 	struct mxge_slice_state *ss;
4355 	int i;
4356 
4357 
4358 	if (sc->ss == NULL)
4359 		return;
4360 
4361 	for (i = 0; i < sc->num_slices; i++) {
4362 		ss = &sc->ss[i];
4363 		if (ss->fw_stats != NULL) {
4364 			mxge_dma_free(&ss->fw_stats_dma);
4365 			ss->fw_stats = NULL;
4366 #ifdef IFNET_BUF_RING
4367 			if (ss->tx.br != NULL) {
4368 				drbr_free(ss->tx.br, M_DEVBUF);
4369 				ss->tx.br = NULL;
4370 			}
4371 #endif
4372 			mtx_destroy(&ss->tx.mtx);
4373 		}
4374 		if (ss->rx_done.entry != NULL) {
4375 			mxge_dma_free(&ss->rx_done.dma);
4376 			ss->rx_done.entry = NULL;
4377 		}
4378 	}
4379 	free(sc->ss, M_DEVBUF);
4380 	sc->ss = NULL;
4381 }
4382 
4383 static int
4384 mxge_alloc_slices(mxge_softc_t *sc)
4385 {
4386 	mxge_cmd_t cmd;
4387 	struct mxge_slice_state *ss;
4388 	size_t bytes;
4389 	int err, i, max_intr_slots;
4390 
4391 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4392 	if (err != 0) {
4393 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4394 		return err;
4395 	}
4396 	sc->rx_ring_size = cmd.data0;
4397 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4398 
4399 	bytes = sizeof (*sc->ss) * sc->num_slices;
4400 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4401 	if (sc->ss == NULL)
4402 		return (ENOMEM);
4403 	for (i = 0; i < sc->num_slices; i++) {
4404 		ss = &sc->ss[i];
4405 
4406 		ss->sc = sc;
4407 
4408 		/* allocate per-slice rx interrupt queues */
4409 
4410 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4411 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4412 		if (err != 0)
4413 			goto abort;
4414 		ss->rx_done.entry = ss->rx_done.dma.addr;
4415 		bzero(ss->rx_done.entry, bytes);
4416 
4417 		/*
4418 		 * allocate the per-slice firmware stats; stats
4419 		 * (including tx) are used used only on the first
4420 		 * slice for now
4421 		 */
4422 #ifndef IFNET_BUF_RING
4423 		if (i > 0)
4424 			continue;
4425 #endif
4426 
4427 		bytes = sizeof (*ss->fw_stats);
4428 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4429 				     sizeof (*ss->fw_stats), 64);
4430 		if (err != 0)
4431 			goto abort;
4432 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4433 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4434 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4435 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4436 #ifdef IFNET_BUF_RING
4437 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4438 					   &ss->tx.mtx);
4439 #endif
4440 	}
4441 
4442 	return (0);
4443 
4444 abort:
4445 	mxge_free_slices(sc);
4446 	return (ENOMEM);
4447 }
4448 
4449 static void
4450 mxge_slice_probe(mxge_softc_t *sc)
4451 {
4452 	mxge_cmd_t cmd;
4453 	char *old_fw;
4454 	int msix_cnt, status, max_intr_slots;
4455 
4456 	sc->num_slices = 1;
4457 	/*
4458 	 *  don't enable multiple slices if they are not enabled,
4459 	 *  or if this is not an SMP system
4460 	 */
4461 
4462 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4463 		return;
4464 
4465 	/* see how many MSI-X interrupts are available */
4466 	msix_cnt = pci_msix_count(sc->dev);
4467 	if (msix_cnt < 2)
4468 		return;
4469 
4470 	/* now load the slice aware firmware see what it supports */
4471 	old_fw = sc->fw_name;
4472 	if (old_fw == mxge_fw_aligned)
4473 		sc->fw_name = mxge_fw_rss_aligned;
4474 	else
4475 		sc->fw_name = mxge_fw_rss_unaligned;
4476 	status = mxge_load_firmware(sc, 0);
4477 	if (status != 0) {
4478 		device_printf(sc->dev, "Falling back to a single slice\n");
4479 		return;
4480 	}
4481 
4482 	/* try to send a reset command to the card to see if it
4483 	   is alive */
4484 	memset(&cmd, 0, sizeof (cmd));
4485 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4486 	if (status != 0) {
4487 		device_printf(sc->dev, "failed reset\n");
4488 		goto abort_with_fw;
4489 	}
4490 
4491 	/* get rx ring size */
4492 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4493 	if (status != 0) {
4494 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4495 		goto abort_with_fw;
4496 	}
4497 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4498 
4499 	/* tell it the size of the interrupt queues */
4500 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4501 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4502 	if (status != 0) {
4503 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4504 		goto abort_with_fw;
4505 	}
4506 
4507 	/* ask the maximum number of slices it supports */
4508 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4509 	if (status != 0) {
4510 		device_printf(sc->dev,
4511 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4512 		goto abort_with_fw;
4513 	}
4514 	sc->num_slices = cmd.data0;
4515 	if (sc->num_slices > msix_cnt)
4516 		sc->num_slices = msix_cnt;
4517 
4518 	if (mxge_max_slices == -1) {
4519 		/* cap to number of CPUs in system */
4520 		if (sc->num_slices > mp_ncpus)
4521 			sc->num_slices = mp_ncpus;
4522 	} else {
4523 		if (sc->num_slices > mxge_max_slices)
4524 			sc->num_slices = mxge_max_slices;
4525 	}
4526 	/* make sure it is a power of two */
4527 	while (sc->num_slices & (sc->num_slices - 1))
4528 		sc->num_slices--;
4529 
4530 	if (mxge_verbose)
4531 		device_printf(sc->dev, "using %d slices\n",
4532 			      sc->num_slices);
4533 
4534 	return;
4535 
4536 abort_with_fw:
4537 	sc->fw_name = old_fw;
4538 	(void) mxge_load_firmware(sc, 0);
4539 }
4540 
4541 static int
4542 mxge_add_msix_irqs(mxge_softc_t *sc)
4543 {
4544 	size_t bytes;
4545 	int count, err, i, rid;
4546 
4547 	rid = PCIR_BAR(2);
4548 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4549 						    &rid, RF_ACTIVE);
4550 
4551 	if (sc->msix_table_res == NULL) {
4552 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4553 		return ENXIO;
4554 	}
4555 
4556 	count = sc->num_slices;
4557 	err = pci_alloc_msix(sc->dev, &count);
4558 	if (err != 0) {
4559 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4560 			      "err = %d \n", sc->num_slices, err);
4561 		goto abort_with_msix_table;
4562 	}
4563 	if (count < sc->num_slices) {
4564 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4565 			      count, sc->num_slices);
4566 		device_printf(sc->dev,
4567 			      "Try setting hw.mxge.max_slices to %d\n",
4568 			      count);
4569 		err = ENOSPC;
4570 		goto abort_with_msix;
4571 	}
4572 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4573 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4574 	if (sc->msix_irq_res == NULL) {
4575 		err = ENOMEM;
4576 		goto abort_with_msix;
4577 	}
4578 
4579 	for (i = 0; i < sc->num_slices; i++) {
4580 		rid = i + 1;
4581 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4582 							  SYS_RES_IRQ,
4583 							  &rid, RF_ACTIVE);
4584 		if (sc->msix_irq_res[i] == NULL) {
4585 			device_printf(sc->dev, "couldn't allocate IRQ res"
4586 				      " for message %d\n", i);
4587 			err = ENXIO;
4588 			goto abort_with_res;
4589 		}
4590 	}
4591 
4592 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4593 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4594 
4595 	for (i = 0; i < sc->num_slices; i++) {
4596 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4597 				     INTR_TYPE_NET | INTR_MPSAFE,
4598 #if __FreeBSD_version > 700030
4599 				     NULL,
4600 #endif
4601 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4602 		if (err != 0) {
4603 			device_printf(sc->dev, "couldn't setup intr for "
4604 				      "message %d\n", i);
4605 			goto abort_with_intr;
4606 		}
4607 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4608 				  sc->msix_ih[i], "s%d", i);
4609 	}
4610 
4611 	if (mxge_verbose) {
4612 		device_printf(sc->dev, "using %d msix IRQs:",
4613 			      sc->num_slices);
4614 		for (i = 0; i < sc->num_slices; i++)
4615 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4616 		printf("\n");
4617 	}
4618 	return (0);
4619 
4620 abort_with_intr:
4621 	for (i = 0; i < sc->num_slices; i++) {
4622 		if (sc->msix_ih[i] != NULL) {
4623 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4624 					  sc->msix_ih[i]);
4625 			sc->msix_ih[i] = NULL;
4626 		}
4627 	}
4628 	free(sc->msix_ih, M_DEVBUF);
4629 
4630 
4631 abort_with_res:
4632 	for (i = 0; i < sc->num_slices; i++) {
4633 		rid = i + 1;
4634 		if (sc->msix_irq_res[i] != NULL)
4635 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4636 					     sc->msix_irq_res[i]);
4637 		sc->msix_irq_res[i] = NULL;
4638 	}
4639 	free(sc->msix_irq_res, M_DEVBUF);
4640 
4641 
4642 abort_with_msix:
4643 	pci_release_msi(sc->dev);
4644 
4645 abort_with_msix_table:
4646 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4647 			     sc->msix_table_res);
4648 
4649 	return err;
4650 }
4651 
4652 static int
4653 mxge_add_single_irq(mxge_softc_t *sc)
4654 {
4655 	int count, err, rid;
4656 
4657 	count = pci_msi_count(sc->dev);
4658 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4659 		rid = 1;
4660 	} else {
4661 		rid = 0;
4662 		sc->legacy_irq = 1;
4663 	}
4664 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4665 					 1, RF_SHAREABLE | RF_ACTIVE);
4666 	if (sc->irq_res == NULL) {
4667 		device_printf(sc->dev, "could not alloc interrupt\n");
4668 		return ENXIO;
4669 	}
4670 	if (mxge_verbose)
4671 		device_printf(sc->dev, "using %s irq %ld\n",
4672 			      sc->legacy_irq ? "INTx" : "MSI",
4673 			      rman_get_start(sc->irq_res));
4674 	err = bus_setup_intr(sc->dev, sc->irq_res,
4675 			     INTR_TYPE_NET | INTR_MPSAFE,
4676 #if __FreeBSD_version > 700030
4677 			     NULL,
4678 #endif
4679 			     mxge_intr, &sc->ss[0], &sc->ih);
4680 	if (err != 0) {
4681 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4682 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4683 		if (!sc->legacy_irq)
4684 			pci_release_msi(sc->dev);
4685 	}
4686 	return err;
4687 }
4688 
4689 static void
4690 mxge_rem_msix_irqs(mxge_softc_t *sc)
4691 {
4692 	int i, rid;
4693 
4694 	for (i = 0; i < sc->num_slices; i++) {
4695 		if (sc->msix_ih[i] != NULL) {
4696 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4697 					  sc->msix_ih[i]);
4698 			sc->msix_ih[i] = NULL;
4699 		}
4700 	}
4701 	free(sc->msix_ih, M_DEVBUF);
4702 
4703 	for (i = 0; i < sc->num_slices; i++) {
4704 		rid = i + 1;
4705 		if (sc->msix_irq_res[i] != NULL)
4706 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4707 					     sc->msix_irq_res[i]);
4708 		sc->msix_irq_res[i] = NULL;
4709 	}
4710 	free(sc->msix_irq_res, M_DEVBUF);
4711 
4712 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4713 			     sc->msix_table_res);
4714 
4715 	pci_release_msi(sc->dev);
4716 	return;
4717 }
4718 
4719 static void
4720 mxge_rem_single_irq(mxge_softc_t *sc)
4721 {
4722 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4723 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4724 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4725 	if (!sc->legacy_irq)
4726 		pci_release_msi(sc->dev);
4727 }
4728 
4729 static void
4730 mxge_rem_irq(mxge_softc_t *sc)
4731 {
4732 	if (sc->num_slices > 1)
4733 		mxge_rem_msix_irqs(sc);
4734 	else
4735 		mxge_rem_single_irq(sc);
4736 }
4737 
4738 static int
4739 mxge_add_irq(mxge_softc_t *sc)
4740 {
4741 	int err;
4742 
4743 	if (sc->num_slices > 1)
4744 		err = mxge_add_msix_irqs(sc);
4745 	else
4746 		err = mxge_add_single_irq(sc);
4747 
4748 	if (0 && err == 0 && sc->num_slices > 1) {
4749 		mxge_rem_msix_irqs(sc);
4750 		err = mxge_add_msix_irqs(sc);
4751 	}
4752 	return err;
4753 }
4754 
4755 
/*
 * Device attach.
 *
 * In order: reads the tunables, creates the watchdog taskqueue, the
 * parent DMA tag, the ifnet and the driver mutexes, maps the board
 * memory, copies and parses the EEPROM strings, allocates the
 * out-of-band DMA areas, selects and loads firmware, probes and
 * allocates slices, resets the NIC, allocates rings and interrupts,
 * fills in the ifnet capabilities and methods, attaches the ethernet
 * interface, and finally starts the taskqueue thread and tick callout.
 *
 * Returns 0 on success.  On failure returns an errno value after
 * releasing, via the abort_* labels at the bottom, whatever had been
 * acquired up to the point of failure.
 */
static int
mxge_attach(device_t dev)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	int err, rid;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	/* watchdog task and its taskqueue; the thread is started last */
	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
				  taskqueue_thread_enqueue, &sc->tq);
	if (sc->tq == NULL) {
		err = ENOMEM;
		goto abort_with_nothing;
	}

	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
				 1,			/* alignment */
				 0,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 65536,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_tq;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/* command and driver mutexes, named after the device unit */
	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	/* the tick callout is associated with the driver mutex */
	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	/*
	 * NOTE(review): the constants below are not explained in this
	 * function; the result is only checked against the size of the
	 * mapped region.
	 */
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	/*
	 * The buffer is zeroed first and only SIZE - 2 bytes are read,
	 * so the copy always ends in two zero bytes.
	 */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_zeropad_dma;

	/* select & load the firmware */
	err = mxge_select_firmware(sc);
	if (err != 0)
		goto abort_with_dmabench;
	sc->intr_coal_delay = mxge_intr_coal_delay;

	/* decide on the slice count, then allocate per-slice state */
	mxge_slice_probe(sc);
	err = mxge_alloc_slices(sc);
	if (err != 0)
		goto abort_with_dmabench;

	err = mxge_reset(sc, 0);
	if (err != 0)
		goto abort_with_slices;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_slices;
	}

	err = mxge_add_irq(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to add irq\n");
		goto abort_with_rings;
	}

	/* nothing below this point can fail, so no irq unwind label exists */
	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
		IFCAP_RXCSUM_IPV6;
#if defined(INET) || defined(INET6)
	ifp->if_capabilities |= IFCAP_LRO;
#endif

#ifdef MXGE_NEW_VLAN_API
	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;

	/* Only FW 1.4.32 and newer can do TSO over vlans */
	/*
	 * NOTE(review): the test below matches only 1.4.x with x >= 32;
	 * confirm whether other major/minor versions should qualify.
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 32)
		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
#endif
	sc->max_mtu = mxge_max_mtu(sc);
	if (sc->max_mtu >= 9000)
		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
	else
		device_printf(dev, "MTU limited to %d.  Install "
			      "latest firmware for 9000 byte jumbo support\n",
			      sc->max_mtu - ETHER_HDR_LEN);
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
	/* check to see if f/w supports TSO for IPv6 */
	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
		if (CSUM_TCP_IPV6)
			ifp->if_capabilities |= IFCAP_TSO6;
		/* header length is bounded by the per-slice scratch buffer */
		sc->max_tso6_hlen = min(cmd.data0,
					sizeof (sc->ss[0].scratch));
	}
	/* start with every capability enabled, except LRO when lro_cnt is 0 */
	ifp->if_capenable = ifp->if_capabilities;
	if (sc->lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	mxge_media_init(sc);
	mxge_media_probe(sc);
	sc->dying = 0;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to ETHERMTU */
	if (mxge_initial_mtu != ETHERMTU)
		mxge_change_mtu(sc, mxge_initial_mtu);

	mxge_add_sysctls(sc);
#ifdef IFNET_BUF_RING
	ifp->if_transmit = mxge_transmit;
	ifp->if_qflush = mxge_qflush;
#endif
	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
				device_get_nameunit(sc->dev));
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	return 0;

	/*
	 * Error unwinding: each label releases what was acquired just
	 * before the corresponding failure point and falls through to
	 * the labels for everything acquired earlier.
	 */
abort_with_rings:
	mxge_free_rings(sc);
abort_with_slices:
	mxge_free_slices(sc);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);
abort_with_tq:
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
abort_with_nothing:
	return err;
}
4977 
4978 static int
4979 mxge_detach(device_t dev)
4980 {
4981 	mxge_softc_t *sc = device_get_softc(dev);
4982 
4983 	if (mxge_vlans_active(sc)) {
4984 		device_printf(sc->dev,
4985 			      "Detach vlans before removing module\n");
4986 		return EBUSY;
4987 	}
4988 	mtx_lock(&sc->driver_mtx);
4989 	sc->dying = 1;
4990 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4991 		mxge_close(sc, 0);
4992 	mtx_unlock(&sc->driver_mtx);
4993 	ether_ifdetach(sc->ifp);
4994 	if (sc->tq != NULL) {
4995 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4996 		taskqueue_free(sc->tq);
4997 		sc->tq = NULL;
4998 	}
4999 	callout_drain(&sc->co_hdl);
5000 	ifmedia_removeall(&sc->media);
5001 	mxge_dummy_rdma(sc, 0);
5002 	mxge_rem_sysctls(sc);
5003 	mxge_rem_irq(sc);
5004 	mxge_free_rings(sc);
5005 	mxge_free_slices(sc);
5006 	mxge_dma_free(&sc->dmabench_dma);
5007 	mxge_dma_free(&sc->zeropad_dma);
5008 	mxge_dma_free(&sc->cmd_dma);
5009 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5010 	pci_disable_busmaster(dev);
5011 	mtx_destroy(&sc->cmd_mtx);
5012 	mtx_destroy(&sc->driver_mtx);
5013 	if_free(sc->ifp);
5014 	bus_dma_tag_destroy(sc->parent_dmat);
5015 	return 0;
5016 }
5017 
5018 static int
5019 mxge_shutdown(device_t dev)
5020 {
5021 	return 0;
5022 }
5023 
5024 /*
5025   This file uses Myri10GE driver indentation.
5026 
5027   Local Variables:
5028   c-file-style:"linux"
5029   tab-width:8
5030   End:
5031 */
5032