xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 193d9e768ba63fcfb187cfd17f461f7d41345048)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 #include <sys/zlib.h>
50 
51 #include <net/if.h>
52 #include <net/if_var.h>
53 #include <net/if_arp.h>
54 #include <net/ethernet.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57 
58 #include <net/bpf.h>
59 
60 #include <net/if_types.h>
61 #include <net/if_vlan_var.h>
62 
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip6.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_lro.h>
69 #include <netinet6/ip6_var.h>
70 
71 #include <machine/bus.h>
72 #include <machine/in_cksum.h>
73 #include <machine/resource.h>
74 #include <sys/bus.h>
75 #include <sys/rman.h>
76 #include <sys/smp.h>
77 
78 #include <dev/pci/pcireg.h>
79 #include <dev/pci/pcivar.h>
80 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
81 
82 #include <vm/vm.h>		/* for pmap_mapdev() */
83 #include <vm/pmap.h>
84 
85 #if defined(__i386) || defined(__amd64)
86 #include <machine/specialreg.h>
87 #endif
88 
89 #include <dev/mxge/mxge_mcp.h>
90 #include <dev/mxge/mcp_gen_header.h>
91 /*#define MXGE_FAKE_IFP*/
92 #include <dev/mxge/if_mxge_var.h>
93 #ifdef IFNET_BUF_RING
94 #include <sys/buf_ring.h>
95 #endif
96 
97 #include "opt_inet.h"
98 #include "opt_inet6.h"
99 
/*
 * tunable params
 *
 * File-scope defaults consulted by the code below.
 * NOTE(review): presumably these can be overridden at boot via loader
 * tunables registered elsewhere in the file -- confirm.
 */
/* non-zero lets mxge_enable_nvidia_ecrc() attempt to turn on ECRC */
static int mxge_nvidia_ecrc_enable = 1;
/*
 * read by mxge_select_firmware(): 0 = probe the host, 1 = assume aligned
 * completions, any other non-zero value = assume unaligned completions
 */
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
/* non-zero enables additional informational device_printf() output */
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
/* non-zero makes mxge_change_promisc() always enable promiscuous mode */
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
/* firmware image names handed to firmware_get() via sc->fw_name */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
117 
/* Prototypes for the device methods listed below and the interrupt handler */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

/* Method table referenced by mxge_driver */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};
134 
/* Driver description: name, method table, and size of the per-device softc */
static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* The firmware loader below uses firmware_get() and zlib's inflate() */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);
148 
/* Forward declarations for functions used before their definitions */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
154 
155 static int
156 mxge_probe(device_t dev)
157 {
158 	int rev;
159 
160 
161 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
162 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
163 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
164 		rev = pci_get_revid(dev);
165 		switch (rev) {
166 		case MXGE_PCI_REV_Z8E:
167 			device_set_desc(dev, "Myri10G-PCIE-8A");
168 			break;
169 		case MXGE_PCI_REV_Z8ES:
170 			device_set_desc(dev, "Myri10G-PCIE-8B");
171 			break;
172 		default:
173 			device_set_desc(dev, "Myri10G-PCIE-8??");
174 			device_printf(dev, "Unrecognized rev %d NIC\n",
175 				      rev);
176 			break;
177 		}
178 		return 0;
179 	}
180 	return ENXIO;
181 }
182 
183 static void
184 mxge_enable_wc(mxge_softc_t *sc)
185 {
186 #if defined(__i386) || defined(__amd64)
187 	vm_offset_t len;
188 	int err;
189 
190 	sc->wc = 1;
191 	len = rman_get_size(sc->mem_res);
192 	err = pmap_change_attr((vm_offset_t) sc->sram,
193 			       len, PAT_WRITE_COMBINING);
194 	if (err != 0) {
195 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
196 			      err);
197 		sc->wc = 0;
198 	}
199 #endif
200 }
201 
202 
203 /* callback to get our DMA address */
204 static void
205 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
206 			 int error)
207 {
208 	if (error == 0) {
209 		*(bus_addr_t *) arg = segs->ds_addr;
210 	}
211 }
212 
213 static int
214 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
215 		   bus_size_t alignment)
216 {
217 	int err;
218 	device_t dev = sc->dev;
219 	bus_size_t boundary, maxsegsize;
220 
221 	if (bytes > 4096 && alignment == 4096) {
222 		boundary = 0;
223 		maxsegsize = bytes;
224 	} else {
225 		boundary = 4096;
226 		maxsegsize = 4096;
227 	}
228 
229 	/* allocate DMAable memory tags */
230 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
231 				 alignment,		/* alignment */
232 				 boundary,		/* boundary */
233 				 BUS_SPACE_MAXADDR,	/* low */
234 				 BUS_SPACE_MAXADDR,	/* high */
235 				 NULL, NULL,		/* filter */
236 				 bytes,			/* maxsize */
237 				 1,			/* num segs */
238 				 maxsegsize,		/* maxsegsize */
239 				 BUS_DMA_COHERENT,	/* flags */
240 				 NULL, NULL,		/* lock */
241 				 &dma->dmat);		/* tag */
242 	if (err != 0) {
243 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
244 		return err;
245 	}
246 
247 	/* allocate DMAable memory & map */
248 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
249 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
250 				| BUS_DMA_ZERO),  &dma->map);
251 	if (err != 0) {
252 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
253 		goto abort_with_dmat;
254 	}
255 
256 	/* load the memory */
257 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
258 			      mxge_dmamap_callback,
259 			      (void *)&dma->bus_addr, 0);
260 	if (err != 0) {
261 		device_printf(dev, "couldn't load map (err = %d)\n", err);
262 		goto abort_with_mem;
263 	}
264 	return 0;
265 
266 abort_with_mem:
267 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
268 abort_with_dmat:
269 	(void)bus_dma_tag_destroy(dma->dmat);
270 	return err;
271 }
272 
273 
274 static void
275 mxge_dma_free(mxge_dma_t *dma)
276 {
277 	bus_dmamap_unload(dma->dmat, dma->map);
278 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
279 	(void)bus_dma_tag_destroy(dma->dmat);
280 }
281 
282 /*
283  * The eeprom strings on the lanaiX have the format
284  * SN=x\0
285  * MAC=x:x:x:x:x:x\0
286  * PC=text\0
287  */
288 
289 static int
290 mxge_parse_strings(mxge_softc_t *sc)
291 {
292 	char *ptr;
293 	int i, found_mac, found_sn2;
294 	char *endptr;
295 
296 	ptr = sc->eeprom_strings;
297 	found_mac = 0;
298 	found_sn2 = 0;
299 	while (*ptr != '\0') {
300 		if (strncmp(ptr, "MAC=", 4) == 0) {
301 			ptr += 4;
302 			for (i = 0;;) {
303 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
304 				if (endptr - ptr != 2)
305 					goto abort;
306 				ptr = endptr;
307 				if (++i == 6)
308 					break;
309 				if (*ptr++ != ':')
310 					goto abort;
311 			}
312 			found_mac = 1;
313 		} else if (strncmp(ptr, "PC=", 3) == 0) {
314 			ptr += 3;
315 			strlcpy(sc->product_code_string, ptr,
316 			    sizeof(sc->product_code_string));
317 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
318 			ptr += 3;
319 			strlcpy(sc->serial_number_string, ptr,
320 			    sizeof(sc->serial_number_string));
321 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
322 			/* SN2 takes precedence over SN */
323 			ptr += 4;
324 			found_sn2 = 1;
325 			strlcpy(sc->serial_number_string, ptr,
326 			    sizeof(sc->serial_number_string));
327 		}
328 		while (*ptr++ != '\0') {}
329 	}
330 
331 	if (found_mac)
332 		return 0;
333 
334  abort:
335 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
336 
337 	return ENXIO;
338 }
339 
340 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
341 static void
342 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
343 {
344 	uint32_t val;
345 	unsigned long base, off;
346 	char *va, *cfgptr;
347 	device_t pdev, mcp55;
348 	uint16_t vendor_id, device_id, word;
349 	uintptr_t bus, slot, func, ivend, idev;
350 	uint32_t *ptr32;
351 
352 
353 	if (!mxge_nvidia_ecrc_enable)
354 		return;
355 
356 	pdev = device_get_parent(device_get_parent(sc->dev));
357 	if (pdev == NULL) {
358 		device_printf(sc->dev, "could not find parent?\n");
359 		return;
360 	}
361 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
362 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
363 
364 	if (vendor_id != 0x10de)
365 		return;
366 
367 	base = 0;
368 
369 	if (device_id == 0x005d) {
370 		/* ck804, base address is magic */
371 		base = 0xe0000000UL;
372 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
373 		/* mcp55, base address stored in chipset */
374 		mcp55 = pci_find_bsf(0, 0, 0);
375 		if (mcp55 &&
376 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
377 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
378 			word = pci_read_config(mcp55, 0x90, 2);
379 			base = ((unsigned long)word & 0x7ffeU) << 25;
380 		}
381 	}
382 	if (!base)
383 		return;
384 
385 	/* XXXX
386 	   Test below is commented because it is believed that doing
387 	   config read/write beyond 0xff will access the config space
388 	   for the next larger function.  Uncomment this and remove
389 	   the hacky pmap_mapdev() way of accessing config space when
390 	   FreeBSD grows support for extended pcie config space access
391 	*/
392 #if 0
393 	/* See if we can, by some miracle, access the extended
394 	   config space */
395 	val = pci_read_config(pdev, 0x178, 4);
396 	if (val != 0xffffffff) {
397 		val |= 0x40;
398 		pci_write_config(pdev, 0x178, val, 4);
399 		return;
400 	}
401 #endif
402 	/* Rather than using normal pci config space writes, we must
403 	 * map the Nvidia config space ourselves.  This is because on
404 	 * opteron/nvidia class machine the 0xe000000 mapping is
405 	 * handled by the nvidia chipset, that means the internal PCI
406 	 * device (the on-chip northbridge), or the amd-8131 bridge
407 	 * and things behind them are not visible by this method.
408 	 */
409 
410 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
411 		      PCI_IVAR_BUS, &bus);
412 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
413 		      PCI_IVAR_SLOT, &slot);
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_FUNCTION, &func);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_VENDOR, &ivend);
418 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 		      PCI_IVAR_DEVICE, &idev);
420 
421 	off =  base
422 		+ 0x00100000UL * (unsigned long)bus
423 		+ 0x00001000UL * (unsigned long)(func
424 						 + 8 * slot);
425 
426 	/* map it into the kernel */
427 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
428 
429 
430 	if (va == NULL) {
431 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
432 		return;
433 	}
434 	/* get a pointer to the config space mapped into the kernel */
435 	cfgptr = va + (off & PAGE_MASK);
436 
437 	/* make sure that we can really access it */
438 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
439 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
440 	if (! (vendor_id == ivend && device_id == idev)) {
441 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
442 			      vendor_id, device_id);
443 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
444 		return;
445 	}
446 
447 	ptr32 = (uint32_t*)(cfgptr + 0x178);
448 	val = *ptr32;
449 
450 	if (val == 0xffffffff) {
451 		device_printf(sc->dev, "extended mapping failed\n");
452 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
453 		return;
454 	}
455 	*ptr32 = val | 0x40;
456 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
457 	if (mxge_verbose)
458 		device_printf(sc->dev,
459 			      "Enabled ECRC on upstream Nvidia bridge "
460 			      "at %d:%d:%d\n",
461 			      (int)bus, (int)slot, (int)func);
462 	return;
463 }
464 #else
465 static void
466 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
467 {
468 	device_printf(sc->dev,
469 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
470 	return;
471 }
472 #endif
473 
474 
475 static int
476 mxge_dma_test(mxge_softc_t *sc, int test_type)
477 {
478 	mxge_cmd_t cmd;
479 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
480 	int status;
481 	uint32_t len;
482 	char *test = " ";
483 
484 
485 	/* Run a small DMA test.
486 	 * The magic multipliers to the length tell the firmware
487 	 * to do DMA read, write, or read+write tests.  The
488 	 * results are returned in cmd.data0.  The upper 16
489 	 * bits of the return is the number of transfers completed.
490 	 * The lower 16 bits is the time in 0.5us ticks that the
491 	 * transfers took to complete.
492 	 */
493 
494 	len = sc->tx_boundary;
495 
496 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
497 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
498 	cmd.data2 = len * 0x10000;
499 	status = mxge_send_cmd(sc, test_type, &cmd);
500 	if (status != 0) {
501 		test = "read";
502 		goto abort;
503 	}
504 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
505 		(cmd.data0 & 0xffff);
506 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
507 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
508 	cmd.data2 = len * 0x1;
509 	status = mxge_send_cmd(sc, test_type, &cmd);
510 	if (status != 0) {
511 		test = "write";
512 		goto abort;
513 	}
514 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
515 		(cmd.data0 & 0xffff);
516 
517 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
518 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
519 	cmd.data2 = len * 0x10001;
520 	status = mxge_send_cmd(sc, test_type, &cmd);
521 	if (status != 0) {
522 		test = "read/write";
523 		goto abort;
524 	}
525 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
526 		(cmd.data0 & 0xffff);
527 
528 abort:
529 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
530 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
531 			      test, status);
532 
533 	return status;
534 }
535 
536 /*
537  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
538  * when the PCI-E Completion packets are aligned on an 8-byte
539  * boundary.  Some PCI-E chip sets always align Completion packets; on
540  * the ones that do not, the alignment can be enforced by enabling
541  * ECRC generation (if supported).
542  *
543  * When PCI-E Completion packets are not aligned, it is actually more
544  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
545  *
546  * If the driver can neither enable ECRC nor verify that it has
547  * already been enabled, then it must use a firmware image which works
548  * around unaligned completion packets (ethp_z8e.dat), and it should
549  * also ensure that it never gives the device a Read-DMA which is
550  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
551  * enabled, then the driver should use the aligned (eth_z8e.dat)
552  * firmware image, and set tx_boundary to 4KB.
553  */
554 
555 static int
556 mxge_firmware_probe(mxge_softc_t *sc)
557 {
558 	device_t dev = sc->dev;
559 	int reg, status;
560 	uint16_t pectl;
561 
562 	sc->tx_boundary = 4096;
563 	/*
564 	 * Verify the max read request size was set to 4KB
565 	 * before trying the test with 4KB.
566 	 */
567 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
568 		pectl = pci_read_config(dev, reg + 0x8, 2);
569 		if ((pectl & (5 << 12)) != (5 << 12)) {
570 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
571 				      pectl);
572 			sc->tx_boundary = 2048;
573 		}
574 	}
575 
576 	/*
577 	 * load the optimized firmware (which assumes aligned PCIe
578 	 * completions) in order to see if it works on this host.
579 	 */
580 	sc->fw_name = mxge_fw_aligned;
581 	status = mxge_load_firmware(sc, 1);
582 	if (status != 0) {
583 		return status;
584 	}
585 
586 	/*
587 	 * Enable ECRC if possible
588 	 */
589 	mxge_enable_nvidia_ecrc(sc);
590 
591 	/*
592 	 * Run a DMA test which watches for unaligned completions and
593 	 * aborts on the first one seen.  Not required on Z8ES or newer.
594 	 */
595 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
596 		return 0;
597 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
598 	if (status == 0)
599 		return 0; /* keep the aligned firmware */
600 
601 	if (status != E2BIG)
602 		device_printf(dev, "DMA test failed: %d\n", status);
603 	if (status == ENOSYS)
604 		device_printf(dev, "Falling back to ethp! "
605 			      "Please install up to date fw\n");
606 	return status;
607 }
608 
609 static int
610 mxge_select_firmware(mxge_softc_t *sc)
611 {
612 	int aligned = 0;
613 	int force_firmware = mxge_force_firmware;
614 
615 	if (sc->throttle)
616 		force_firmware = sc->throttle;
617 
618 	if (force_firmware != 0) {
619 		if (force_firmware == 1)
620 			aligned = 1;
621 		else
622 			aligned = 0;
623 		if (mxge_verbose)
624 			device_printf(sc->dev,
625 				      "Assuming %s completions (forced)\n",
626 				      aligned ? "aligned" : "unaligned");
627 		goto abort;
628 	}
629 
630 	/* if the PCIe link width is 4 or less, we can use the aligned
631 	   firmware and skip any checks */
632 	if (sc->link_width != 0 && sc->link_width <= 4) {
633 		device_printf(sc->dev,
634 			      "PCIe x%d Link, expect reduced performance\n",
635 			      sc->link_width);
636 		aligned = 1;
637 		goto abort;
638 	}
639 
640 	if (0 == mxge_firmware_probe(sc))
641 		return 0;
642 
643 abort:
644 	if (aligned) {
645 		sc->fw_name = mxge_fw_aligned;
646 		sc->tx_boundary = 4096;
647 	} else {
648 		sc->fw_name = mxge_fw_unaligned;
649 		sc->tx_boundary = 2048;
650 	}
651 	return (mxge_load_firmware(sc, 0));
652 }
653 
654 static int
655 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
656 {
657 
658 
659 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
660 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
661 			      be32toh(hdr->mcp_type));
662 		return EIO;
663 	}
664 
665 	/* save firmware version for sysctl */
666 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
667 	if (mxge_verbose)
668 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
669 
670 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
671 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
672 
673 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
674 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
675 		device_printf(sc->dev, "Found firmware version %s\n",
676 			      sc->fw_version);
677 		device_printf(sc->dev, "Driver needs %d.%d\n",
678 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
679 		return EINVAL;
680 	}
681 	return 0;
682 
683 }
684 
685 static void *
686 z_alloc(void *nil, u_int items, u_int size)
687 {
688 	void *ptr;
689 
690 	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
691 	return ptr;
692 }
693 
/*
 * zlib free hook: releases memory obtained from z_alloc().  The opaque
 * first argument is unused.
 */
static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}
699 
700 
701 static int
702 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
703 {
704 	z_stream zs;
705 	char *inflate_buffer;
706 	const struct firmware *fw;
707 	const mcp_gen_header_t *hdr;
708 	unsigned hdr_offset;
709 	int status;
710 	unsigned int i;
711 	char dummy;
712 	size_t fw_len;
713 
714 	fw = firmware_get(sc->fw_name);
715 	if (fw == NULL) {
716 		device_printf(sc->dev, "Could not find firmware image %s\n",
717 			      sc->fw_name);
718 		return ENOENT;
719 	}
720 
721 
722 
723 	/* setup zlib and decompress f/w */
724 	bzero(&zs, sizeof (zs));
725 	zs.zalloc = z_alloc;
726 	zs.zfree = z_free;
727 	status = inflateInit(&zs);
728 	if (status != Z_OK) {
729 		status = EIO;
730 		goto abort_with_fw;
731 	}
732 
733 	/* the uncompressed size is stored as the firmware version,
734 	   which would otherwise go unused */
735 	fw_len = (size_t) fw->version;
736 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
737 	if (inflate_buffer == NULL)
738 		goto abort_with_zs;
739 	zs.avail_in = fw->datasize;
740 	zs.next_in = __DECONST(char *, fw->data);
741 	zs.avail_out = fw_len;
742 	zs.next_out = inflate_buffer;
743 	status = inflate(&zs, Z_FINISH);
744 	if (status != Z_STREAM_END) {
745 		device_printf(sc->dev, "zlib %d\n", status);
746 		status = EIO;
747 		goto abort_with_buffer;
748 	}
749 
750 	/* check id */
751 	hdr_offset = htobe32(*(const uint32_t *)
752 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
753 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
754 		device_printf(sc->dev, "Bad firmware file");
755 		status = EIO;
756 		goto abort_with_buffer;
757 	}
758 	hdr = (const void*)(inflate_buffer + hdr_offset);
759 
760 	status = mxge_validate_firmware(sc, hdr);
761 	if (status != 0)
762 		goto abort_with_buffer;
763 
764 	/* Copy the inflated firmware to NIC SRAM. */
765 	for (i = 0; i < fw_len; i += 256) {
766 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
767 			      inflate_buffer + i,
768 			      min(256U, (unsigned)(fw_len - i)));
769 		wmb();
770 		dummy = *sc->sram;
771 		wmb();
772 	}
773 
774 	*limit = fw_len;
775 	status = 0;
776 abort_with_buffer:
777 	free(inflate_buffer, M_TEMP);
778 abort_with_zs:
779 	inflateEnd(&zs);
780 abort_with_fw:
781 	firmware_put(fw, FIRMWARE_UNLOAD);
782 	return status;
783 }
784 
785 /*
786  * Enable or disable periodic RDMAs from the host to make certain
787  * chipsets resend dropped PCIe messages
788  */
789 
790 static void
791 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
792 {
793 	char buf_bytes[72];
794 	volatile uint32_t *confirm;
795 	volatile char *submit;
796 	uint32_t *buf, dma_low, dma_high;
797 	int i;
798 
799 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
800 
801 	/* clear confirmation addr */
802 	confirm = (volatile uint32_t *)sc->cmd;
803 	*confirm = 0;
804 	wmb();
805 
806 	/* send an rdma command to the PCIe engine, and wait for the
807 	   response in the confirmation address.  The firmware should
808 	   write a -1 there to indicate it is alive and well
809 	*/
810 
811 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
812 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
813 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
814 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
815 	buf[2] = htobe32(0xffffffff);		/* confirm data */
816 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
817 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
818 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
819 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
820 	buf[5] = htobe32(enable);			/* enable? */
821 
822 
823 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
824 
825 	mxge_pio_copy(submit, buf, 64);
826 	wmb();
827 	DELAY(1000);
828 	wmb();
829 	i = 0;
830 	while (*confirm != 0xffffffff && i < 20) {
831 		DELAY(1000);
832 		i++;
833 	}
834 	if (*confirm != 0xffffffff) {
835 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
836 			      (enable ? "enable" : "disable"), confirm,
837 			      *confirm);
838 	}
839 	return;
840 }
841 
842 static int
843 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
844 {
845 	mcp_cmd_t *buf;
846 	char buf_bytes[sizeof(*buf) + 8];
847 	volatile mcp_cmd_response_t *response = sc->cmd;
848 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
849 	uint32_t dma_low, dma_high;
850 	int err, sleep_total = 0;
851 
852 	/* ensure buf is aligned to 8 bytes */
853 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
854 
855 	buf->data0 = htobe32(data->data0);
856 	buf->data1 = htobe32(data->data1);
857 	buf->data2 = htobe32(data->data2);
858 	buf->cmd = htobe32(cmd);
859 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
860 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
861 
862 	buf->response_addr.low = htobe32(dma_low);
863 	buf->response_addr.high = htobe32(dma_high);
864 	mtx_lock(&sc->cmd_mtx);
865 	response->result = 0xffffffff;
866 	wmb();
867 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
868 
869 	/* wait up to 20ms */
870 	err = EAGAIN;
871 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
872 		bus_dmamap_sync(sc->cmd_dma.dmat,
873 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
874 		wmb();
875 		switch (be32toh(response->result)) {
876 		case 0:
877 			data->data0 = be32toh(response->data);
878 			err = 0;
879 			break;
880 		case 0xffffffff:
881 			DELAY(1000);
882 			break;
883 		case MXGEFW_CMD_UNKNOWN:
884 			err = ENOSYS;
885 			break;
886 		case MXGEFW_CMD_ERROR_UNALIGNED:
887 			err = E2BIG;
888 			break;
889 		case MXGEFW_CMD_ERROR_BUSY:
890 			err = EBUSY;
891 			break;
892 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
893 			err = ENXIO;
894 			break;
895 		default:
896 			device_printf(sc->dev,
897 				      "mxge: command %d "
898 				      "failed, result = %d\n",
899 				      cmd, be32toh(response->result));
900 			err = ENXIO;
901 			break;
902 		}
903 		if (err != EAGAIN)
904 			break;
905 	}
906 	if (err == EAGAIN)
907 		device_printf(sc->dev, "mxge: command %d timed out"
908 			      "result = %d\n",
909 			      cmd, be32toh(response->result));
910 	mtx_unlock(&sc->cmd_mtx);
911 	return err;
912 }
913 
914 static int
915 mxge_adopt_running_firmware(mxge_softc_t *sc)
916 {
917 	struct mcp_gen_header *hdr;
918 	const size_t bytes = sizeof (struct mcp_gen_header);
919 	size_t hdr_offset;
920 	int status;
921 
922 	/* find running firmware header */
923 	hdr_offset = htobe32(*(volatile uint32_t *)
924 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
925 
926 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
927 		device_printf(sc->dev,
928 			      "Running firmware has bad header offset (%d)\n",
929 			      (int)hdr_offset);
930 		return EIO;
931 	}
932 
933 	/* copy header of running firmware from SRAM to host memory to
934 	 * validate firmware */
935 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
936 	if (hdr == NULL) {
937 		device_printf(sc->dev, "could not malloc firmware hdr\n");
938 		return ENOMEM;
939 	}
940 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
941 				rman_get_bushandle(sc->mem_res),
942 				hdr_offset, (char *)hdr, bytes);
943 	status = mxge_validate_firmware(sc, hdr);
944 	free(hdr, M_DEVBUF);
945 
946 	/*
947 	 * check to see if adopted firmware has bug where adopting
948 	 * it will cause broadcasts to be filtered unless the NIC
949 	 * is kept in ALLMULTI mode
950 	 */
951 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
952 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
953 		sc->adopted_rx_filter_bug = 1;
954 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
955 			      "working around rx filter bug\n",
956 			      sc->fw_ver_major, sc->fw_ver_minor,
957 			      sc->fw_ver_tiny);
958 	}
959 
960 	return status;
961 }
962 
963 
964 static int
965 mxge_load_firmware(mxge_softc_t *sc, int adopt)
966 {
967 	volatile uint32_t *confirm;
968 	volatile char *submit;
969 	char buf_bytes[72];
970 	uint32_t *buf, size, dma_low, dma_high;
971 	int status, i;
972 
973 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
974 
975 	size = sc->sram_size;
976 	status = mxge_load_firmware_helper(sc, &size);
977 	if (status) {
978 		if (!adopt)
979 			return status;
980 		/* Try to use the currently running firmware, if
981 		   it is new enough */
982 		status = mxge_adopt_running_firmware(sc);
983 		if (status) {
984 			device_printf(sc->dev,
985 				      "failed to adopt running firmware\n");
986 			return status;
987 		}
988 		device_printf(sc->dev,
989 			      "Successfully adopted running firmware\n");
990 		if (sc->tx_boundary == 4096) {
991 			device_printf(sc->dev,
992 				"Using firmware currently running on NIC"
993 				 ".  For optimal\n");
994 			device_printf(sc->dev,
995 				 "performance consider loading optimized "
996 				 "firmware\n");
997 		}
998 		sc->fw_name = mxge_fw_unaligned;
999 		sc->tx_boundary = 2048;
1000 		return 0;
1001 	}
1002 	/* clear confirmation addr */
1003 	confirm = (volatile uint32_t *)sc->cmd;
1004 	*confirm = 0;
1005 	wmb();
1006 	/* send a reload command to the bootstrap MCP, and wait for the
1007 	   response in the confirmation address.  The firmware should
1008 	   write a -1 there to indicate it is alive and well
1009 	*/
1010 
1011 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1012 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1013 
1014 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1015 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1016 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1017 
1018 	/* FIX: All newest firmware should un-protect the bottom of
1019 	   the sram before handoff. However, the very first interfaces
1020 	   do not. Therefore the handoff copy must skip the first 8 bytes
1021 	*/
1022 					/* where the code starts*/
1023 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1024 	buf[4] = htobe32(size - 8); 	/* length of code */
1025 	buf[5] = htobe32(8);		/* where to copy to */
1026 	buf[6] = htobe32(0);		/* where to jump to */
1027 
1028 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1029 	mxge_pio_copy(submit, buf, 64);
1030 	wmb();
1031 	DELAY(1000);
1032 	wmb();
1033 	i = 0;
1034 	while (*confirm != 0xffffffff && i < 20) {
1035 		DELAY(1000*10);
1036 		i++;
1037 		bus_dmamap_sync(sc->cmd_dma.dmat,
1038 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1039 	}
1040 	if (*confirm != 0xffffffff) {
1041 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1042 			confirm, *confirm);
1043 
1044 		return ENXIO;
1045 	}
1046 	return 0;
1047 }
1048 
1049 static int
1050 mxge_update_mac_address(mxge_softc_t *sc)
1051 {
1052 	mxge_cmd_t cmd;
1053 	uint8_t *addr = sc->mac_addr;
1054 	int status;
1055 
1056 
1057 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1058 		     | (addr[2] << 8) | addr[3]);
1059 
1060 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1061 
1062 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1063 	return status;
1064 }
1065 
1066 static int
1067 mxge_change_pause(mxge_softc_t *sc, int pause)
1068 {
1069 	mxge_cmd_t cmd;
1070 	int status;
1071 
1072 	if (pause)
1073 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1074 				       &cmd);
1075 	else
1076 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1077 				       &cmd);
1078 
1079 	if (status) {
1080 		device_printf(sc->dev, "Failed to set flow control mode\n");
1081 		return ENXIO;
1082 	}
1083 	sc->pause = pause;
1084 	return 0;
1085 }
1086 
1087 static void
1088 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1089 {
1090 	mxge_cmd_t cmd;
1091 	int status;
1092 
1093 	if (mxge_always_promisc)
1094 		promisc = 1;
1095 
1096 	if (promisc)
1097 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1098 				       &cmd);
1099 	else
1100 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1101 				       &cmd);
1102 
1103 	if (status) {
1104 		device_printf(sc->dev, "Failed to set promisc mode\n");
1105 	}
1106 }
1107 
1108 static void
1109 mxge_set_multicast_list(mxge_softc_t *sc)
1110 {
1111 	mxge_cmd_t cmd;
1112 	struct ifmultiaddr *ifma;
1113 	struct ifnet *ifp = sc->ifp;
1114 	int err;
1115 
1116 	/* This firmware is known to not support multicast */
1117 	if (!sc->fw_multicast_support)
1118 		return;
1119 
1120 	/* Disable multicast filtering while we play with the lists*/
1121 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1122 	if (err != 0) {
1123 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1124 		       " error status: %d\n", err);
1125 		return;
1126 	}
1127 
1128 	if (sc->adopted_rx_filter_bug)
1129 		return;
1130 
1131 	if (ifp->if_flags & IFF_ALLMULTI)
1132 		/* request to disable multicast filtering, so quit here */
1133 		return;
1134 
1135 	/* Flush all the filters */
1136 
1137 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1138 	if (err != 0) {
1139 		device_printf(sc->dev,
1140 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1141 			      ", error status: %d\n", err);
1142 		return;
1143 	}
1144 
1145 	/* Walk the multicast list, and add each address */
1146 
1147 	if_maddr_rlock(ifp);
1148 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1149 		if (ifma->ifma_addr->sa_family != AF_LINK)
1150 			continue;
1151 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1152 		      &cmd.data0, 4);
1153 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1154 		      &cmd.data1, 2);
1155 		cmd.data0 = htonl(cmd.data0);
1156 		cmd.data1 = htonl(cmd.data1);
1157 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1158 		if (err != 0) {
1159 			device_printf(sc->dev, "Failed "
1160 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1161 			       "%d\t", err);
1162 			/* abort, leaving multicast filtering off */
1163 			if_maddr_runlock(ifp);
1164 			return;
1165 		}
1166 	}
1167 	if_maddr_runlock(ifp);
1168 	/* Enable multicast filtering */
1169 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1170 	if (err != 0) {
1171 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1172 		       ", error status: %d\n", err);
1173 	}
1174 }
1175 
1176 static int
1177 mxge_max_mtu(mxge_softc_t *sc)
1178 {
1179 	mxge_cmd_t cmd;
1180 	int status;
1181 
1182 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1183 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1184 
1185 	/* try to set nbufs to see if it we can
1186 	   use virtually contiguous jumbos */
1187 	cmd.data0 = 0;
1188 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1189 			       &cmd);
1190 	if (status == 0)
1191 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1192 
1193 	/* otherwise, we're limited to MJUMPAGESIZE */
1194 	return MJUMPAGESIZE - MXGEFW_PAD;
1195 }
1196 
1197 static int
1198 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1199 {
1200 	struct mxge_slice_state *ss;
1201 	mxge_rx_done_t *rx_done;
1202 	volatile uint32_t *irq_claim;
1203 	mxge_cmd_t cmd;
1204 	int slice, status;
1205 
1206 	/* try to send a reset command to the card to see if it
1207 	   is alive */
1208 	memset(&cmd, 0, sizeof (cmd));
1209 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1210 	if (status != 0) {
1211 		device_printf(sc->dev, "failed reset\n");
1212 		return ENXIO;
1213 	}
1214 
1215 	mxge_dummy_rdma(sc, 1);
1216 
1217 
1218 	/* set the intrq size */
1219 	cmd.data0 = sc->rx_ring_size;
1220 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1221 
1222 	/*
1223 	 * Even though we already know how many slices are supported
1224 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1225 	 * has magic side effects, and must be called after a reset.
1226 	 * It must be called prior to calling any RSS related cmds,
1227 	 * including assigning an interrupt queue for anything but
1228 	 * slice 0.  It must also be called *after*
1229 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1230 	 * the firmware to compute offsets.
1231 	 */
1232 
1233 	if (sc->num_slices > 1) {
1234 		/* ask the maximum number of slices it supports */
1235 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1236 					   &cmd);
1237 		if (status != 0) {
1238 			device_printf(sc->dev,
1239 				      "failed to get number of slices\n");
1240 			return status;
1241 		}
1242 		/*
1243 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1244 		 * to setting up the interrupt queue DMA
1245 		 */
1246 		cmd.data0 = sc->num_slices;
1247 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1248 #ifdef IFNET_BUF_RING
1249 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1250 #endif
1251 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1252 					   &cmd);
1253 		if (status != 0) {
1254 			device_printf(sc->dev,
1255 				      "failed to set number of slices\n");
1256 			return status;
1257 		}
1258 	}
1259 
1260 
1261 	if (interrupts_setup) {
1262 		/* Now exchange information about interrupts  */
1263 		for (slice = 0; slice < sc->num_slices; slice++) {
1264 			rx_done = &sc->ss[slice].rx_done;
1265 			memset(rx_done->entry, 0, sc->rx_ring_size);
1266 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1267 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1268 			cmd.data2 = slice;
1269 			status |= mxge_send_cmd(sc,
1270 						MXGEFW_CMD_SET_INTRQ_DMA,
1271 						&cmd);
1272 		}
1273 	}
1274 
1275 	status |= mxge_send_cmd(sc,
1276 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1277 
1278 
1279 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1280 
1281 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1282 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1283 
1284 
1285 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1286 				&cmd);
1287 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1288 	if (status != 0) {
1289 		device_printf(sc->dev, "failed set interrupt parameters\n");
1290 		return status;
1291 	}
1292 
1293 
1294 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1295 
1296 
1297 	/* run a DMA benchmark */
1298 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1299 
1300 	for (slice = 0; slice < sc->num_slices; slice++) {
1301 		ss = &sc->ss[slice];
1302 
1303 		ss->irq_claim = irq_claim + (2 * slice);
1304 		/* reset mcp/driver shared state back to 0 */
1305 		ss->rx_done.idx = 0;
1306 		ss->rx_done.cnt = 0;
1307 		ss->tx.req = 0;
1308 		ss->tx.done = 0;
1309 		ss->tx.pkt_done = 0;
1310 		ss->tx.queue_active = 0;
1311 		ss->tx.activate = 0;
1312 		ss->tx.deactivate = 0;
1313 		ss->tx.wake = 0;
1314 		ss->tx.defrag = 0;
1315 		ss->tx.stall = 0;
1316 		ss->rx_big.cnt = 0;
1317 		ss->rx_small.cnt = 0;
1318 		ss->lc.lro_bad_csum = 0;
1319 		ss->lc.lro_queued = 0;
1320 		ss->lc.lro_flushed = 0;
1321 		if (ss->fw_stats != NULL) {
1322 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1323 		}
1324 	}
1325 	sc->rdma_tags_available = 15;
1326 	status = mxge_update_mac_address(sc);
1327 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1328 	mxge_change_pause(sc, sc->pause);
1329 	mxge_set_multicast_list(sc);
1330 	if (sc->throttle) {
1331 		cmd.data0 = sc->throttle;
1332 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1333 				  &cmd)) {
1334 			device_printf(sc->dev,
1335 				      "can't enable throttle\n");
1336 		}
1337 	}
1338 	return status;
1339 }
1340 
1341 static int
1342 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1343 {
1344 	mxge_cmd_t cmd;
1345 	mxge_softc_t *sc;
1346 	int err;
1347 	unsigned int throttle;
1348 
1349 	sc = arg1;
1350 	throttle = sc->throttle;
1351 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1352 	if (err != 0) {
1353 		return err;
1354 	}
1355 
1356 	if (throttle == sc->throttle)
1357 		return 0;
1358 
1359 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1360 		return EINVAL;
1361 
1362 	mtx_lock(&sc->driver_mtx);
1363 	cmd.data0 = throttle;
1364 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1365 	if (err == 0)
1366 		sc->throttle = throttle;
1367 	mtx_unlock(&sc->driver_mtx);
1368 	return err;
1369 }
1370 
1371 static int
1372 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1373 {
1374 	mxge_softc_t *sc;
1375 	unsigned int intr_coal_delay;
1376 	int err;
1377 
1378 	sc = arg1;
1379 	intr_coal_delay = sc->intr_coal_delay;
1380 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1381 	if (err != 0) {
1382 		return err;
1383 	}
1384 	if (intr_coal_delay == sc->intr_coal_delay)
1385 		return 0;
1386 
1387 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1388 		return EINVAL;
1389 
1390 	mtx_lock(&sc->driver_mtx);
1391 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1392 	sc->intr_coal_delay = intr_coal_delay;
1393 
1394 	mtx_unlock(&sc->driver_mtx);
1395 	return err;
1396 }
1397 
1398 static int
1399 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1400 {
1401 	mxge_softc_t *sc;
1402 	unsigned int enabled;
1403 	int err;
1404 
1405 	sc = arg1;
1406 	enabled = sc->pause;
1407 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1408 	if (err != 0) {
1409 		return err;
1410 	}
1411 	if (enabled == sc->pause)
1412 		return 0;
1413 
1414 	mtx_lock(&sc->driver_mtx);
1415 	err = mxge_change_pause(sc, enabled);
1416 	mtx_unlock(&sc->driver_mtx);
1417 	return err;
1418 }
1419 
1420 static int
1421 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1422 {
1423 	int err;
1424 
1425 	if (arg1 == NULL)
1426 		return EFAULT;
1427 	arg2 = be32toh(*(int *)arg1);
1428 	arg1 = NULL;
1429 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1430 
1431 	return err;
1432 }
1433 
1434 static void
1435 mxge_rem_sysctls(mxge_softc_t *sc)
1436 {
1437 	struct mxge_slice_state *ss;
1438 	int slice;
1439 
1440 	if (sc->slice_sysctl_tree == NULL)
1441 		return;
1442 
1443 	for (slice = 0; slice < sc->num_slices; slice++) {
1444 		ss = &sc->ss[slice];
1445 		if (ss == NULL || ss->sysctl_tree == NULL)
1446 			continue;
1447 		sysctl_ctx_free(&ss->sysctl_ctx);
1448 		ss->sysctl_tree = NULL;
1449 	}
1450 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1451 	sc->slice_sysctl_tree = NULL;
1452 }
1453 
1454 static void
1455 mxge_add_sysctls(mxge_softc_t *sc)
1456 {
1457 	struct sysctl_ctx_list *ctx;
1458 	struct sysctl_oid_list *children;
1459 	mcp_irq_data_t *fw;
1460 	struct mxge_slice_state *ss;
1461 	int slice;
1462 	char slice_num[8];
1463 
1464 	ctx = device_get_sysctl_ctx(sc->dev);
1465 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1466 	fw = sc->ss[0].fw_stats;
1467 
1468 	/* random information */
1469 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1470 		       "firmware_version",
1471 		       CTLFLAG_RD, sc->fw_version,
1472 		       0, "firmware version");
1473 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1474 		       "serial_number",
1475 		       CTLFLAG_RD, sc->serial_number_string,
1476 		       0, "serial number");
1477 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1478 		       "product_code",
1479 		       CTLFLAG_RD, sc->product_code_string,
1480 		       0, "product_code");
1481 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1482 		       "pcie_link_width",
1483 		       CTLFLAG_RD, &sc->link_width,
1484 		       0, "tx_boundary");
1485 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486 		       "tx_boundary",
1487 		       CTLFLAG_RD, &sc->tx_boundary,
1488 		       0, "tx_boundary");
1489 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490 		       "write_combine",
1491 		       CTLFLAG_RD, &sc->wc,
1492 		       0, "write combining PIO?");
1493 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1494 		       "read_dma_MBs",
1495 		       CTLFLAG_RD, &sc->read_dma,
1496 		       0, "DMA Read speed in MB/s");
1497 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1498 		       "write_dma_MBs",
1499 		       CTLFLAG_RD, &sc->write_dma,
1500 		       0, "DMA Write speed in MB/s");
1501 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1502 		       "read_write_dma_MBs",
1503 		       CTLFLAG_RD, &sc->read_write_dma,
1504 		       0, "DMA concurrent Read/Write speed in MB/s");
1505 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1506 		       "watchdog_resets",
1507 		       CTLFLAG_RD, &sc->watchdog_resets,
1508 		       0, "Number of times NIC was reset");
1509 
1510 
1511 	/* performance related tunables */
1512 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513 			"intr_coal_delay",
1514 			CTLTYPE_INT|CTLFLAG_RW, sc,
1515 			0, mxge_change_intr_coal,
1516 			"I", "interrupt coalescing delay in usecs");
1517 
1518 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1519 			"throttle",
1520 			CTLTYPE_INT|CTLFLAG_RW, sc,
1521 			0, mxge_change_throttle,
1522 			"I", "transmit throttling");
1523 
1524 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525 			"flow_control_enabled",
1526 			CTLTYPE_INT|CTLFLAG_RW, sc,
1527 			0, mxge_change_flow_control,
1528 			"I", "interrupt coalescing delay in usecs");
1529 
1530 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 		       "deassert_wait",
1532 		       CTLFLAG_RW, &mxge_deassert_wait,
1533 		       0, "Wait for IRQ line to go low in ihandler");
1534 
1535 	/* stats block from firmware is in network byte order.
1536 	   Need to swap it */
1537 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 			"link_up",
1539 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1540 			0, mxge_handle_be32,
1541 			"I", "link up");
1542 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 			"rdma_tags_available",
1544 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1545 			0, mxge_handle_be32,
1546 			"I", "rdma_tags_available");
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"dropped_bad_crc32",
1549 			CTLTYPE_INT|CTLFLAG_RD,
1550 			&fw->dropped_bad_crc32,
1551 			0, mxge_handle_be32,
1552 			"I", "dropped_bad_crc32");
1553 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 			"dropped_bad_phy",
1555 			CTLTYPE_INT|CTLFLAG_RD,
1556 			&fw->dropped_bad_phy,
1557 			0, mxge_handle_be32,
1558 			"I", "dropped_bad_phy");
1559 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 			"dropped_link_error_or_filtered",
1561 			CTLTYPE_INT|CTLFLAG_RD,
1562 			&fw->dropped_link_error_or_filtered,
1563 			0, mxge_handle_be32,
1564 			"I", "dropped_link_error_or_filtered");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 			"dropped_link_overflow",
1567 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1568 			0, mxge_handle_be32,
1569 			"I", "dropped_link_overflow");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 			"dropped_multicast_filtered",
1572 			CTLTYPE_INT|CTLFLAG_RD,
1573 			&fw->dropped_multicast_filtered,
1574 			0, mxge_handle_be32,
1575 			"I", "dropped_multicast_filtered");
1576 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 			"dropped_no_big_buffer",
1578 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1579 			0, mxge_handle_be32,
1580 			"I", "dropped_no_big_buffer");
1581 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582 			"dropped_no_small_buffer",
1583 			CTLTYPE_INT|CTLFLAG_RD,
1584 			&fw->dropped_no_small_buffer,
1585 			0, mxge_handle_be32,
1586 			"I", "dropped_no_small_buffer");
1587 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588 			"dropped_overrun",
1589 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1590 			0, mxge_handle_be32,
1591 			"I", "dropped_overrun");
1592 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593 			"dropped_pause",
1594 			CTLTYPE_INT|CTLFLAG_RD,
1595 			&fw->dropped_pause,
1596 			0, mxge_handle_be32,
1597 			"I", "dropped_pause");
1598 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599 			"dropped_runt",
1600 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1601 			0, mxge_handle_be32,
1602 			"I", "dropped_runt");
1603 
1604 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 			"dropped_unicast_filtered",
1606 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1607 			0, mxge_handle_be32,
1608 			"I", "dropped_unicast_filtered");
1609 
1610 	/* verbose printing? */
1611 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612 		       "verbose",
1613 		       CTLFLAG_RW, &mxge_verbose,
1614 		       0, "verbose printing");
1615 
1616 	/* add counters exported for debugging from all slices */
1617 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1618 	sc->slice_sysctl_tree =
1619 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1620 				"slice", CTLFLAG_RD, 0, "");
1621 
1622 	for (slice = 0; slice < sc->num_slices; slice++) {
1623 		ss = &sc->ss[slice];
1624 		sysctl_ctx_init(&ss->sysctl_ctx);
1625 		ctx = &ss->sysctl_ctx;
1626 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1627 		sprintf(slice_num, "%d", slice);
1628 		ss->sysctl_tree =
1629 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1630 					CTLFLAG_RD, 0, "");
1631 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1632 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 			       "rx_small_cnt",
1634 			       CTLFLAG_RD, &ss->rx_small.cnt,
1635 			       0, "rx_small_cnt");
1636 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 			       "rx_big_cnt",
1638 			       CTLFLAG_RD, &ss->rx_big.cnt,
1639 			       0, "rx_small_cnt");
1640 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1641 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1642 			       0, "number of lro merge queues flushed");
1643 
1644 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1645 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1646 			       0, "number of bad csums preventing LRO");
1647 
1648 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1649 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1650 			       0, "number of frames appended to lro merge"
1651 			       "queues");
1652 
1653 #ifndef IFNET_BUF_RING
1654 		/* only transmit from slice 0 for now */
1655 		if (slice > 0)
1656 			continue;
1657 #endif
1658 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 			       "tx_req",
1660 			       CTLFLAG_RD, &ss->tx.req,
1661 			       0, "tx_req");
1662 
1663 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 			       "tx_done",
1665 			       CTLFLAG_RD, &ss->tx.done,
1666 			       0, "tx_done");
1667 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 			       "tx_pkt_done",
1669 			       CTLFLAG_RD, &ss->tx.pkt_done,
1670 			       0, "tx_done");
1671 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 			       "tx_stall",
1673 			       CTLFLAG_RD, &ss->tx.stall,
1674 			       0, "tx_stall");
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "tx_wake",
1677 			       CTLFLAG_RD, &ss->tx.wake,
1678 			       0, "tx_wake");
1679 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680 			       "tx_defrag",
1681 			       CTLFLAG_RD, &ss->tx.defrag,
1682 			       0, "tx_defrag");
1683 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 			       "tx_queue_active",
1685 			       CTLFLAG_RD, &ss->tx.queue_active,
1686 			       0, "tx_queue_active");
1687 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688 			       "tx_activate",
1689 			       CTLFLAG_RD, &ss->tx.activate,
1690 			       0, "tx_activate");
1691 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692 			       "tx_deactivate",
1693 			       CTLFLAG_RD, &ss->tx.deactivate,
1694 			       0, "tx_deactivate");
1695 	}
1696 }
1697 
1698 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1699    backwards one at a time and handle ring wraps */
1700 
1701 static inline void
1702 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1703 			    mcp_kreq_ether_send_t *src, int cnt)
1704 {
1705 	int idx, starting_slot;
1706 	starting_slot = tx->req;
1707 	while (cnt > 1) {
1708 		cnt--;
1709 		idx = (starting_slot + cnt) & tx->mask;
1710 		mxge_pio_copy(&tx->lanai[idx],
1711 			      &src[cnt], sizeof(*src));
1712 		wmb();
1713 	}
1714 }
1715 
1716 /*
1717  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1718  * at most 32 bytes at a time, so as to avoid involving the software
1719  * pio handler in the nic.   We re-write the first segment's flags
1720  * to mark them valid only after writing the entire chain
1721  */
1722 
1723 static inline void
1724 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1725 		  int cnt)
1726 {
1727 	int idx, i;
1728 	uint32_t *src_ints;
1729 	volatile uint32_t *dst_ints;
1730 	mcp_kreq_ether_send_t *srcp;
1731 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1732 	uint8_t last_flags;
1733 
1734 	idx = tx->req & tx->mask;
1735 
1736 	last_flags = src->flags;
1737 	src->flags = 0;
1738 	wmb();
1739 	dst = dstp = &tx->lanai[idx];
1740 	srcp = src;
1741 
1742 	if ((idx + cnt) < tx->mask) {
1743 		for (i = 0; i < (cnt - 1); i += 2) {
1744 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1745 			wmb(); /* force write every 32 bytes */
1746 			srcp += 2;
1747 			dstp += 2;
1748 		}
1749 	} else {
1750 		/* submit all but the first request, and ensure
1751 		   that it is submitted below */
1752 		mxge_submit_req_backwards(tx, src, cnt);
1753 		i = 0;
1754 	}
1755 	if (i < cnt) {
1756 		/* submit the first request */
1757 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1758 		wmb(); /* barrier before setting valid flag */
1759 	}
1760 
1761 	/* re-write the last 32-bits with the valid flags */
1762 	src->flags = last_flags;
1763 	src_ints = (uint32_t *)src;
1764 	src_ints+=3;
1765 	dst_ints = (volatile uint32_t *)dst;
1766 	dst_ints+=3;
1767 	*dst_ints =  *src_ints;
1768 	tx->req += cnt;
1769 	wmb();
1770 }
1771 
1772 static int
1773 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1774     struct mxge_pkt_info *pi)
1775 {
1776 	struct ether_vlan_header *eh;
1777 	uint16_t etype;
1778 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1779 #if IFCAP_TSO6 && defined(INET6)
1780 	int nxt;
1781 #endif
1782 
1783 	eh = mtod(m, struct ether_vlan_header *);
1784 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1785 		etype = ntohs(eh->evl_proto);
1786 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1787 	} else {
1788 		etype = ntohs(eh->evl_encap_proto);
1789 		pi->ip_off = ETHER_HDR_LEN;
1790 	}
1791 
1792 	switch (etype) {
1793 	case ETHERTYPE_IP:
1794 		/*
1795 		 * ensure ip header is in first mbuf, copy it to a
1796 		 * scratch buffer if not
1797 		 */
1798 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1799 		pi->ip6 = NULL;
1800 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1801 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1802 			    ss->scratch);
1803 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1804 		}
1805 		pi->ip_hlen = pi->ip->ip_hl << 2;
1806 		if (!tso)
1807 			return 0;
1808 
1809 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1810 		    sizeof(struct tcphdr))) {
1811 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1812 			    sizeof(struct tcphdr), ss->scratch);
1813 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1814 		}
1815 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1816 		break;
1817 #if IFCAP_TSO6 && defined(INET6)
1818 	case ETHERTYPE_IPV6:
1819 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1820 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1821 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1822 			    ss->scratch);
1823 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1824 		}
1825 		nxt = 0;
1826 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1827 		pi->ip_hlen -= pi->ip_off;
1828 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1829 			return EINVAL;
1830 
1831 		if (!tso)
1832 			return 0;
1833 
1834 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1835 			return EINVAL;
1836 
1837 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1838 		    sizeof(struct tcphdr))) {
1839 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1840 			    sizeof(struct tcphdr), ss->scratch);
1841 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1842 		}
1843 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1844 		break;
1845 #endif
1846 	default:
1847 		return EINVAL;
1848 	}
1849 	return 0;
1850 }
1851 
1852 #if IFCAP_TSO4
1853 
1854 static void
1855 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1856 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1857 {
1858 	mxge_tx_ring_t *tx;
1859 	mcp_kreq_ether_send_t *req;
1860 	bus_dma_segment_t *seg;
1861 	uint32_t low, high_swapped;
1862 	int len, seglen, cum_len, cum_len_next;
1863 	int next_is_first, chop, cnt, rdma_count, small;
1864 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1865 	uint8_t flags, flags_next;
1866 	static int once;
1867 
1868 	mss = m->m_pkthdr.tso_segsz;
1869 
1870 	/* negative cum_len signifies to the
1871 	 * send loop that we are still in the
1872 	 * header portion of the TSO packet.
1873 	 */
1874 
1875 	cksum_offset = pi->ip_off + pi->ip_hlen;
1876 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1877 
1878 	/* TSO implies checksum offload on this hardware */
1879 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1880 		/*
1881 		 * If packet has full TCP csum, replace it with pseudo hdr
1882 		 * sum that the NIC expects, otherwise the NIC will emit
1883 		 * packets with bad TCP checksums.
1884 		 */
1885 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1886 		if (pi->ip6) {
1887 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1888 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1889 			sum = in6_cksum_pseudo(pi->ip6,
1890 			    m->m_pkthdr.len - cksum_offset,
1891 			    IPPROTO_TCP, 0);
1892 #endif
1893 		} else {
1894 #ifdef INET
1895 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1896 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1897 			    pi->ip->ip_dst.s_addr,
1898 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1899 				    cksum_offset)));
1900 #endif
1901 		}
1902 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1903 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1904 	}
1905 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1906 
1907 
1908 	/* for TSO, pseudo_hdr_offset holds mss.
1909 	 * The firmware figures out where to put
1910 	 * the checksum by parsing the header. */
1911 	pseudo_hdr_offset = htobe16(mss);
1912 
1913 	if (pi->ip6) {
1914 		/*
1915 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1916 		 * to store the TCP header len
1917 		 */
1918 		cksum_offset = (pi->tcp->th_off << 2);
1919 	}
1920 
1921 	tx = &ss->tx;
1922 	req = tx->req_list;
1923 	seg = tx->seg_list;
1924 	cnt = 0;
1925 	rdma_count = 0;
1926 	/* "rdma_count" is the number of RDMAs belonging to the
1927 	 * current packet BEFORE the current send request. For
1928 	 * non-TSO packets, this is equal to "count".
1929 	 * For TSO packets, rdma_count needs to be reset
1930 	 * to 0 after a segment cut.
1931 	 *
1932 	 * The rdma_count field of the send request is
1933 	 * the number of RDMAs of the packet starting at
1934 	 * that request. For TSO send requests with one ore more cuts
1935 	 * in the middle, this is the number of RDMAs starting
1936 	 * after the last cut in the request. All previous
1937 	 * segments before the last cut implicitly have 1 RDMA.
1938 	 *
1939 	 * Since the number of RDMAs is not known beforehand,
1940 	 * it must be filled-in retroactively - after each
1941 	 * segmentation cut or at the end of the entire packet.
1942 	 */
1943 
1944 	while (busdma_seg_cnt) {
1945 		/* Break the busdma segment up into pieces*/
1946 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1947 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1948 		len = seg->ds_len;
1949 
1950 		while (len) {
1951 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1952 			seglen = len;
1953 			cum_len_next = cum_len + seglen;
1954 			(req-rdma_count)->rdma_count = rdma_count + 1;
1955 			if (__predict_true(cum_len >= 0)) {
1956 				/* payload */
1957 				chop = (cum_len_next > mss);
1958 				cum_len_next = cum_len_next % mss;
1959 				next_is_first = (cum_len_next == 0);
1960 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1961 				flags_next |= next_is_first *
1962 					MXGEFW_FLAGS_FIRST;
1963 				rdma_count |= -(chop | next_is_first);
1964 				rdma_count += chop & !next_is_first;
1965 			} else if (cum_len_next >= 0) {
1966 				/* header ends */
1967 				rdma_count = -1;
1968 				cum_len_next = 0;
1969 				seglen = -cum_len;
1970 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1971 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1972 					MXGEFW_FLAGS_FIRST |
1973 					(small * MXGEFW_FLAGS_SMALL);
1974 			    }
1975 
1976 			req->addr_high = high_swapped;
1977 			req->addr_low = htobe32(low);
1978 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1979 			req->pad = 0;
1980 			req->rdma_count = 1;
1981 			req->length = htobe16(seglen);
1982 			req->cksum_offset = cksum_offset;
1983 			req->flags = flags | ((cum_len & 1) *
1984 					      MXGEFW_FLAGS_ALIGN_ODD);
1985 			low += seglen;
1986 			len -= seglen;
1987 			cum_len = cum_len_next;
1988 			flags = flags_next;
1989 			req++;
1990 			cnt++;
1991 			rdma_count++;
1992 			if (cksum_offset != 0 && !pi->ip6) {
1993 				if (__predict_false(cksum_offset > seglen))
1994 					cksum_offset -= seglen;
1995 				else
1996 					cksum_offset = 0;
1997 			}
1998 			if (__predict_false(cnt > tx->max_desc))
1999 				goto drop;
2000 		}
2001 		busdma_seg_cnt--;
2002 		seg++;
2003 	}
2004 	(req-rdma_count)->rdma_count = rdma_count;
2005 
2006 	do {
2007 		req--;
2008 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2009 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2010 
2011 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2012 	mxge_submit_req(tx, tx->req_list, cnt);
2013 #ifdef IFNET_BUF_RING
2014 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2015 		/* tell the NIC to start polling this slice */
2016 		*tx->send_go = 1;
2017 		tx->queue_active = 1;
2018 		tx->activate++;
2019 		wmb();
2020 	}
2021 #endif
2022 	return;
2023 
2024 drop:
2025 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2026 	m_freem(m);
2027 	ss->oerrors++;
2028 	if (!once) {
2029 		printf("tx->max_desc exceeded via TSO!\n");
2030 		printf("mss = %d, %ld, %d!\n", mss,
2031 		       (long)seg - (long)tx->seg_list, tx->max_desc);
2032 		once = 1;
2033 	}
2034 	return;
2035 
2036 }
2037 
2038 #endif /* IFCAP_TSO4 */
2039 
2040 #ifdef MXGE_NEW_VLAN_API
2041 /*
2042  * We reproduce the software vlan tag insertion from
2043  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2044  * vlan tag insertion. We need to advertise this in order to have the
2045  * vlan interface respect our csum offload flags.
2046  */
2047 static struct mbuf *
2048 mxge_vlan_tag_insert(struct mbuf *m)
2049 {
2050 	struct ether_vlan_header *evl;
2051 
2052 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2053 	if (__predict_false(m == NULL))
2054 		return NULL;
2055 	if (m->m_len < sizeof(*evl)) {
2056 		m = m_pullup(m, sizeof(*evl));
2057 		if (__predict_false(m == NULL))
2058 			return NULL;
2059 	}
2060 	/*
2061 	 * Transform the Ethernet header into an Ethernet header
2062 	 * with 802.1Q encapsulation.
2063 	 */
2064 	evl = mtod(m, struct ether_vlan_header *);
2065 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2066 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2067 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2068 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2069 	m->m_flags &= ~M_VLANTAG;
2070 	return m;
2071 }
2072 #endif /* MXGE_NEW_VLAN_API */
2073 
2074 static void
2075 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2076 {
2077 	struct mxge_pkt_info pi = {0,0,0,0};
2078 	mxge_softc_t *sc;
2079 	mcp_kreq_ether_send_t *req;
2080 	bus_dma_segment_t *seg;
2081 	struct mbuf *m_tmp;
2082 	struct ifnet *ifp;
2083 	mxge_tx_ring_t *tx;
2084 	int cnt, cum_len, err, i, idx, odd_flag;
2085 	uint16_t pseudo_hdr_offset;
2086 	uint8_t flags, cksum_offset;
2087 
2088 
2089 	sc = ss->sc;
2090 	ifp = sc->ifp;
2091 	tx = &ss->tx;
2092 
2093 #ifdef MXGE_NEW_VLAN_API
2094 	if (m->m_flags & M_VLANTAG) {
2095 		m = mxge_vlan_tag_insert(m);
2096 		if (__predict_false(m == NULL))
2097 			goto drop_without_m;
2098 	}
2099 #endif
2100 	if (m->m_pkthdr.csum_flags &
2101 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2102 		if (mxge_parse_tx(ss, m, &pi))
2103 			goto drop;
2104 	}
2105 
2106 	/* (try to) map the frame for DMA */
2107 	idx = tx->req & tx->mask;
2108 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2109 				      m, tx->seg_list, &cnt,
2110 				      BUS_DMA_NOWAIT);
2111 	if (__predict_false(err == EFBIG)) {
2112 		/* Too many segments in the chain.  Try
2113 		   to defrag */
2114 		m_tmp = m_defrag(m, M_NOWAIT);
2115 		if (m_tmp == NULL) {
2116 			goto drop;
2117 		}
2118 		ss->tx.defrag++;
2119 		m = m_tmp;
2120 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2121 					      tx->info[idx].map,
2122 					      m, tx->seg_list, &cnt,
2123 					      BUS_DMA_NOWAIT);
2124 	}
2125 	if (__predict_false(err != 0)) {
2126 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2127 			      " packet len = %d\n", err, m->m_pkthdr.len);
2128 		goto drop;
2129 	}
2130 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2131 			BUS_DMASYNC_PREWRITE);
2132 	tx->info[idx].m = m;
2133 
2134 #if IFCAP_TSO4
2135 	/* TSO is different enough, we handle it in another routine */
2136 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2137 		mxge_encap_tso(ss, m, cnt, &pi);
2138 		return;
2139 	}
2140 #endif
2141 
2142 	req = tx->req_list;
2143 	cksum_offset = 0;
2144 	pseudo_hdr_offset = 0;
2145 	flags = MXGEFW_FLAGS_NO_TSO;
2146 
2147 	/* checksum offloading? */
2148 	if (m->m_pkthdr.csum_flags &
2149 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2150 		/* ensure ip header is in first mbuf, copy
2151 		   it to a scratch buffer if not */
2152 		cksum_offset = pi.ip_off + pi.ip_hlen;
2153 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2154 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2155 		req->cksum_offset = cksum_offset;
2156 		flags |= MXGEFW_FLAGS_CKSUM;
2157 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2158 	} else {
2159 		odd_flag = 0;
2160 	}
2161 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2162 		flags |= MXGEFW_FLAGS_SMALL;
2163 
2164 	/* convert segments into a request list */
2165 	cum_len = 0;
2166 	seg = tx->seg_list;
2167 	req->flags = MXGEFW_FLAGS_FIRST;
2168 	for (i = 0; i < cnt; i++) {
2169 		req->addr_low =
2170 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2171 		req->addr_high =
2172 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2173 		req->length = htobe16(seg->ds_len);
2174 		req->cksum_offset = cksum_offset;
2175 		if (cksum_offset > seg->ds_len)
2176 			cksum_offset -= seg->ds_len;
2177 		else
2178 			cksum_offset = 0;
2179 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2180 		req->pad = 0; /* complete solid 16-byte block */
2181 		req->rdma_count = 1;
2182 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2183 		cum_len += seg->ds_len;
2184 		seg++;
2185 		req++;
2186 		req->flags = 0;
2187 	}
2188 	req--;
2189 	/* pad runts to 60 bytes */
2190 	if (cum_len < 60) {
2191 		req++;
2192 		req->addr_low =
2193 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2194 		req->addr_high =
2195 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2196 		req->length = htobe16(60 - cum_len);
2197 		req->cksum_offset = 0;
2198 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2199 		req->pad = 0; /* complete solid 16-byte block */
2200 		req->rdma_count = 1;
2201 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2202 		cnt++;
2203 	}
2204 
2205 	tx->req_list[0].rdma_count = cnt;
2206 #if 0
2207 	/* print what the firmware will see */
2208 	for (i = 0; i < cnt; i++) {
2209 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2210 		    "cso:%d, flags:0x%x, rdma:%d\n",
2211 		    i, (int)ntohl(tx->req_list[i].addr_high),
2212 		    (int)ntohl(tx->req_list[i].addr_low),
2213 		    (int)ntohs(tx->req_list[i].length),
2214 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2215 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2216 		    tx->req_list[i].rdma_count);
2217 	}
2218 	printf("--------------\n");
2219 #endif
2220 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2221 	mxge_submit_req(tx, tx->req_list, cnt);
2222 #ifdef IFNET_BUF_RING
2223 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2224 		/* tell the NIC to start polling this slice */
2225 		*tx->send_go = 1;
2226 		tx->queue_active = 1;
2227 		tx->activate++;
2228 		wmb();
2229 	}
2230 #endif
2231 	return;
2232 
2233 drop:
2234 	m_freem(m);
2235 drop_without_m:
2236 	ss->oerrors++;
2237 	return;
2238 }
2239 
2240 #ifdef IFNET_BUF_RING
2241 static void
2242 mxge_qflush(struct ifnet *ifp)
2243 {
2244 	mxge_softc_t *sc = ifp->if_softc;
2245 	mxge_tx_ring_t *tx;
2246 	struct mbuf *m;
2247 	int slice;
2248 
2249 	for (slice = 0; slice < sc->num_slices; slice++) {
2250 		tx = &sc->ss[slice].tx;
2251 		mtx_lock(&tx->mtx);
2252 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253 			m_freem(m);
2254 		mtx_unlock(&tx->mtx);
2255 	}
2256 	if_qflush(ifp);
2257 }
2258 
2259 static inline void
2260 mxge_start_locked(struct mxge_slice_state *ss)
2261 {
2262 	mxge_softc_t *sc;
2263 	struct mbuf *m;
2264 	struct ifnet *ifp;
2265 	mxge_tx_ring_t *tx;
2266 
2267 	sc = ss->sc;
2268 	ifp = sc->ifp;
2269 	tx = &ss->tx;
2270 
2271 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272 		m = drbr_dequeue(ifp, tx->br);
2273 		if (m == NULL) {
2274 			return;
2275 		}
2276 		/* let BPF see it */
2277 		BPF_MTAP(ifp, m);
2278 
2279 		/* give it to the nic */
2280 		mxge_encap(ss, m);
2281 	}
2282 	/* ran out of transmit slots */
2283 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284 	    && (!drbr_empty(ifp, tx->br))) {
2285 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286 		tx->stall++;
2287 	}
2288 }
2289 
2290 static int
2291 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292 {
2293 	mxge_softc_t *sc;
2294 	struct ifnet *ifp;
2295 	mxge_tx_ring_t *tx;
2296 	int err;
2297 
2298 	sc = ss->sc;
2299 	ifp = sc->ifp;
2300 	tx = &ss->tx;
2301 
2302 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303 	    IFF_DRV_RUNNING) {
2304 		err = drbr_enqueue(ifp, tx->br, m);
2305 		return (err);
2306 	}
2307 
2308 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2309 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310 		/* let BPF see it */
2311 		BPF_MTAP(ifp, m);
2312 		/* give it to the nic */
2313 		mxge_encap(ss, m);
2314 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315 		return (err);
2316 	}
2317 	if (!drbr_empty(ifp, tx->br))
2318 		mxge_start_locked(ss);
2319 	return (0);
2320 }
2321 
2322 static int
2323 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324 {
2325 	mxge_softc_t *sc = ifp->if_softc;
2326 	struct mxge_slice_state *ss;
2327 	mxge_tx_ring_t *tx;
2328 	int err = 0;
2329 	int slice;
2330 
2331 	slice = m->m_pkthdr.flowid;
2332 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333 
2334 	ss = &sc->ss[slice];
2335 	tx = &ss->tx;
2336 
2337 	if (mtx_trylock(&tx->mtx)) {
2338 		err = mxge_transmit_locked(ss, m);
2339 		mtx_unlock(&tx->mtx);
2340 	} else {
2341 		err = drbr_enqueue(ifp, tx->br, m);
2342 	}
2343 
2344 	return (err);
2345 }
2346 
2347 #else
2348 
2349 static inline void
2350 mxge_start_locked(struct mxge_slice_state *ss)
2351 {
2352 	mxge_softc_t *sc;
2353 	struct mbuf *m;
2354 	struct ifnet *ifp;
2355 	mxge_tx_ring_t *tx;
2356 
2357 	sc = ss->sc;
2358 	ifp = sc->ifp;
2359 	tx = &ss->tx;
2360 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362 		if (m == NULL) {
2363 			return;
2364 		}
2365 		/* let BPF see it */
2366 		BPF_MTAP(ifp, m);
2367 
2368 		/* give it to the nic */
2369 		mxge_encap(ss, m);
2370 	}
2371 	/* ran out of transmit slots */
2372 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374 		tx->stall++;
2375 	}
2376 }
2377 #endif
2378 static void
2379 mxge_start(struct ifnet *ifp)
2380 {
2381 	mxge_softc_t *sc = ifp->if_softc;
2382 	struct mxge_slice_state *ss;
2383 
2384 	/* only use the first slice for now */
2385 	ss = &sc->ss[0];
2386 	mtx_lock(&ss->tx.mtx);
2387 	mxge_start_locked(ss);
2388 	mtx_unlock(&ss->tx.mtx);
2389 }
2390 
/*
 * Copy an array of eight mcp_kreq_ether_recv_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 * in a burst.
 *
 * dst: destination of the PIO copy (first of eight entries)
 * src: host copy of the same eight entries; src[0].addr_low is
 *      temporarily overwritten and restored before returning
 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* stash the real address and publish a placeholder first */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* two copies of four entries each, with a barrier after each */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the host copy, then write the real address last */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2414 
2415 static int
2416 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2417 {
2418 	bus_dma_segment_t seg;
2419 	struct mbuf *m;
2420 	mxge_rx_ring_t *rx = &ss->rx_small;
2421 	int cnt, err;
2422 
2423 	m = m_gethdr(M_NOWAIT, MT_DATA);
2424 	if (m == NULL) {
2425 		rx->alloc_fail++;
2426 		err = ENOBUFS;
2427 		goto done;
2428 	}
2429 	m->m_len = MHLEN;
2430 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2431 				      &seg, &cnt, BUS_DMA_NOWAIT);
2432 	if (err != 0) {
2433 		m_free(m);
2434 		goto done;
2435 	}
2436 	rx->info[idx].m = m;
2437 	rx->shadow[idx].addr_low =
2438 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2439 	rx->shadow[idx].addr_high =
2440 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2441 
2442 done:
2443 	if ((idx & 7) == 7)
2444 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2445 	return err;
2446 }
2447 
2448 static int
2449 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2450 {
2451 	bus_dma_segment_t seg[3];
2452 	struct mbuf *m;
2453 	mxge_rx_ring_t *rx = &ss->rx_big;
2454 	int cnt, err, i;
2455 
2456 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2457 	if (m == NULL) {
2458 		rx->alloc_fail++;
2459 		err = ENOBUFS;
2460 		goto done;
2461 	}
2462 	m->m_len = rx->mlen;
2463 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2464 				      seg, &cnt, BUS_DMA_NOWAIT);
2465 	if (err != 0) {
2466 		m_free(m);
2467 		goto done;
2468 	}
2469 	rx->info[idx].m = m;
2470 	rx->shadow[idx].addr_low =
2471 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2472 	rx->shadow[idx].addr_high =
2473 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2474 
2475 #if MXGE_VIRT_JUMBOS
2476 	for (i = 1; i < cnt; i++) {
2477 		rx->shadow[idx + i].addr_low =
2478 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2479 		rx->shadow[idx + i].addr_high =
2480 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2481        }
2482 #endif
2483 
2484 done:
2485        for (i = 0; i < rx->nbufs; i++) {
2486 		if ((idx & 7) == 7) {
2487 			mxge_submit_8rx(&rx->lanai[idx - 7],
2488 					&rx->shadow[idx - 7]);
2489 		}
2490 		idx++;
2491 	}
2492 	return err;
2493 }
2494 
2495 #ifdef INET6
2496 
/*
 * Plain 16-bit ones-complement sum over a buffer.
 *
 * raw: first 16-bit word to add
 * len: length in bytes; words are consumed while len is positive, so
 *      an odd length still reads the whole final word
 *
 * Returns the sum folded down to 16 bits (not inverted).
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t acc = 0;

	for (; len > 0; len -= 2)
		acc += *raw++;

	/* fold the carries back in until only 16 bits remain */
	while ((acc >> 16) != 0)
		acc = (acc >> 16) + (acc & 0xffff);

	return (uint16_t)acc;
}
2513 
2514 static inline uint16_t
2515 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2516 {
2517 	uint32_t partial;
2518 	int nxt, cksum_offset;
2519 	struct ip6_hdr *ip6 = p;
2520 	uint16_t c;
2521 
2522 	nxt = ip6->ip6_nxt;
2523 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2524 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2525 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2526 					   IPPROTO_IPV6, &nxt);
2527 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2528 			return (1);
2529 	}
2530 
2531 	/*
2532 	 * IPv6 headers do not contain a checksum, and hence
2533 	 * do not checksum to zero, so they don't "fall out"
2534 	 * of the partial checksum calculation like IPv4
2535 	 * headers do.  We need to fix the partial checksum by
2536 	 * subtracting the checksum of the IPv6 header.
2537 	 */
2538 
2539 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2540 				    ETHER_HDR_LEN);
2541 	csum += ~partial;
2542 	csum +=	 (csum < ~partial);
2543 	csum = (csum >> 16) + (csum & 0xFFFF);
2544 	csum = (csum >> 16) + (csum & 0xFFFF);
2545 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2546 			     csum);
2547 	c ^= 0xffff;
2548 	return (c);
2549 }
2550 #endif /* INET6 */
2551 /*
2552  *  Myri10GE hardware checksums are not valid if the sender
2553  *  padded the frame with non-zero padding.  This is because
2554  *  the firmware just does a simple 16-bit 1s complement
2555  *  checksum across the entire frame, excluding the first 14
2556  *  bytes.  It is best to simply to check the checksum and
2557  *  tell the stack about it only if the checksum is good
2558  */
2559 
2560 static inline uint16_t
2561 mxge_rx_csum(struct mbuf *m, int csum)
2562 {
2563 	struct ether_header *eh;
2564 #ifdef INET
2565 	struct ip *ip;
2566 #endif
2567 #if defined(INET) || defined(INET6)
2568 	int cap = m->m_pkthdr.rcvif->if_capenable;
2569 #endif
2570 	uint16_t c, etype;
2571 
2572 
2573 	eh = mtod(m, struct ether_header *);
2574 	etype = ntohs(eh->ether_type);
2575 	switch (etype) {
2576 #ifdef INET
2577 	case ETHERTYPE_IP:
2578 		if ((cap & IFCAP_RXCSUM) == 0)
2579 			return (1);
2580 		ip = (struct ip *)(eh + 1);
2581 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2582 			return (1);
2583 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2584 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2585 				    (ip->ip_hl << 2) + ip->ip_p));
2586 		c ^= 0xffff;
2587 		break;
2588 #endif
2589 #ifdef INET6
2590 	case ETHERTYPE_IPV6:
2591 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2592 			return (1);
2593 		c = mxge_rx_csum6((eh + 1), m, csum);
2594 		break;
2595 #endif
2596 	default:
2597 		c = 1;
2598 	}
2599 	return (c);
2600 }
2601 
2602 static void
2603 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2604 {
2605 	struct ether_vlan_header *evl;
2606 	struct ether_header *eh;
2607 	uint32_t partial;
2608 
2609 	evl = mtod(m, struct ether_vlan_header *);
2610 	eh = mtod(m, struct ether_header *);
2611 
2612 	/*
2613 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2614 	 * after what the firmware thought was the end of the ethernet
2615 	 * header.
2616 	 */
2617 
2618 	/* put checksum into host byte order */
2619 	*csum = ntohs(*csum);
2620 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2621 	(*csum) += ~partial;
2622 	(*csum) +=  ((*csum) < ~partial);
2623 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625 
2626 	/* restore checksum to network byte order;
2627 	   later consumers expect this */
2628 	*csum = htons(*csum);
2629 
2630 	/* save the tag */
2631 #ifdef MXGE_NEW_VLAN_API
2632 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2633 #else
2634 	{
2635 		struct m_tag *mtag;
2636 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2637 				   M_NOWAIT);
2638 		if (mtag == NULL)
2639 			return;
2640 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2641 		m_tag_prepend(m, mtag);
2642 	}
2643 
2644 #endif
2645 	m->m_flags |= M_VLANTAG;
2646 
2647 	/*
2648 	 * Remove the 802.1q header by copying the Ethernet
2649 	 * addresses over it and adjusting the beginning of
2650 	 * the data in the mbuf.  The encapsulated Ethernet
2651 	 * type field is already in place.
2652 	 */
2653 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2654 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2655 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2656 }
2657 
2658 
2659 static inline void
2660 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2661 		 uint32_t csum, int lro)
2662 {
2663 	mxge_softc_t *sc;
2664 	struct ifnet *ifp;
2665 	struct mbuf *m;
2666 	struct ether_header *eh;
2667 	mxge_rx_ring_t *rx;
2668 	bus_dmamap_t old_map;
2669 	int idx;
2670 
2671 	sc = ss->sc;
2672 	ifp = sc->ifp;
2673 	rx = &ss->rx_big;
2674 	idx = rx->cnt & rx->mask;
2675 	rx->cnt += rx->nbufs;
2676 	/* save a pointer to the received mbuf */
2677 	m = rx->info[idx].m;
2678 	/* try to replace the received mbuf */
2679 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2680 		/* drop the frame -- the old mbuf is re-cycled */
2681 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2682 		return;
2683 	}
2684 
2685 	/* unmap the received buffer */
2686 	old_map = rx->info[idx].map;
2687 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2688 	bus_dmamap_unload(rx->dmat, old_map);
2689 
2690 	/* swap the bus_dmamap_t's */
2691 	rx->info[idx].map = rx->extra_map;
2692 	rx->extra_map = old_map;
2693 
2694 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2695 	 * aligned */
2696 	m->m_data += MXGEFW_PAD;
2697 
2698 	m->m_pkthdr.rcvif = ifp;
2699 	m->m_len = m->m_pkthdr.len = len;
2700 	ss->ipackets++;
2701 	eh = mtod(m, struct ether_header *);
2702 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2703 		mxge_vlan_tag_remove(m, &csum);
2704 	}
2705 	/* flowid only valid if RSS hashing is enabled */
2706 	if (sc->num_slices > 1) {
2707 		m->m_pkthdr.flowid = (ss - sc->ss);
2708 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2709 	}
2710 	/* if the checksum is valid, mark it in the mbuf header */
2711 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2712 	    (0 == mxge_rx_csum(m, csum))) {
2713 		/* Tell the stack that the  checksum is good */
2714 		m->m_pkthdr.csum_data = 0xffff;
2715 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2716 			CSUM_DATA_VALID;
2717 
2718 #if defined(INET) || defined (INET6)
2719 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2720 			return;
2721 #endif
2722 	}
2723 	/* pass the frame up the stack */
2724 	(*ifp->if_input)(ifp, m);
2725 }
2726 
2727 static inline void
2728 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2729 		   uint32_t csum, int lro)
2730 {
2731 	mxge_softc_t *sc;
2732 	struct ifnet *ifp;
2733 	struct ether_header *eh;
2734 	struct mbuf *m;
2735 	mxge_rx_ring_t *rx;
2736 	bus_dmamap_t old_map;
2737 	int idx;
2738 
2739 	sc = ss->sc;
2740 	ifp = sc->ifp;
2741 	rx = &ss->rx_small;
2742 	idx = rx->cnt & rx->mask;
2743 	rx->cnt++;
2744 	/* save a pointer to the received mbuf */
2745 	m = rx->info[idx].m;
2746 	/* try to replace the received mbuf */
2747 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2748 		/* drop the frame -- the old mbuf is re-cycled */
2749 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2750 		return;
2751 	}
2752 
2753 	/* unmap the received buffer */
2754 	old_map = rx->info[idx].map;
2755 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2756 	bus_dmamap_unload(rx->dmat, old_map);
2757 
2758 	/* swap the bus_dmamap_t's */
2759 	rx->info[idx].map = rx->extra_map;
2760 	rx->extra_map = old_map;
2761 
2762 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2763 	 * aligned */
2764 	m->m_data += MXGEFW_PAD;
2765 
2766 	m->m_pkthdr.rcvif = ifp;
2767 	m->m_len = m->m_pkthdr.len = len;
2768 	ss->ipackets++;
2769 	eh = mtod(m, struct ether_header *);
2770 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2771 		mxge_vlan_tag_remove(m, &csum);
2772 	}
2773 	/* flowid only valid if RSS hashing is enabled */
2774 	if (sc->num_slices > 1) {
2775 		m->m_pkthdr.flowid = (ss - sc->ss);
2776 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2777 	}
2778 	/* if the checksum is valid, mark it in the mbuf header */
2779 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2780 	    (0 == mxge_rx_csum(m, csum))) {
2781 		/* Tell the stack that the  checksum is good */
2782 		m->m_pkthdr.csum_data = 0xffff;
2783 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2784 			CSUM_DATA_VALID;
2785 
2786 #if defined(INET) || defined (INET6)
2787 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2788 			return;
2789 #endif
2790 	}
2791 	/* pass the frame up the stack */
2792 	(*ifp->if_input)(ifp, m);
2793 }
2794 
2795 static inline void
2796 mxge_clean_rx_done(struct mxge_slice_state *ss)
2797 {
2798 	mxge_rx_done_t *rx_done = &ss->rx_done;
2799 	int limit = 0;
2800 	uint16_t length;
2801 	uint16_t checksum;
2802 	int lro;
2803 
2804 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2805 	while (rx_done->entry[rx_done->idx].length != 0) {
2806 		length = ntohs(rx_done->entry[rx_done->idx].length);
2807 		rx_done->entry[rx_done->idx].length = 0;
2808 		checksum = rx_done->entry[rx_done->idx].checksum;
2809 		if (length <= (MHLEN - MXGEFW_PAD))
2810 			mxge_rx_done_small(ss, length, checksum, lro);
2811 		else
2812 			mxge_rx_done_big(ss, length, checksum, lro);
2813 		rx_done->cnt++;
2814 		rx_done->idx = rx_done->cnt & rx_done->mask;
2815 
2816 		/* limit potential for livelock */
2817 		if (__predict_false(++limit > rx_done->mask / 2))
2818 			break;
2819 	}
2820 #if defined(INET)  || defined (INET6)
2821 	tcp_lro_flush_all(&ss->lc);
2822 #endif
2823 }
2824 
2825 
2826 static inline void
2827 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2828 {
2829 	struct ifnet *ifp;
2830 	mxge_tx_ring_t *tx;
2831 	struct mbuf *m;
2832 	bus_dmamap_t map;
2833 	int idx;
2834 	int *flags;
2835 
2836 	tx = &ss->tx;
2837 	ifp = ss->sc->ifp;
2838 	while (tx->pkt_done != mcp_idx) {
2839 		idx = tx->done & tx->mask;
2840 		tx->done++;
2841 		m = tx->info[idx].m;
2842 		/* mbuf and DMA map only attached to the first
2843 		   segment per-mbuf */
2844 		if (m != NULL) {
2845 			ss->obytes += m->m_pkthdr.len;
2846 			if (m->m_flags & M_MCAST)
2847 				ss->omcasts++;
2848 			ss->opackets++;
2849 			tx->info[idx].m = NULL;
2850 			map = tx->info[idx].map;
2851 			bus_dmamap_unload(tx->dmat, map);
2852 			m_freem(m);
2853 		}
2854 		if (tx->info[idx].flag) {
2855 			tx->info[idx].flag = 0;
2856 			tx->pkt_done++;
2857 		}
2858 	}
2859 
2860 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2861 	   its OK to send packets */
2862 #ifdef IFNET_BUF_RING
2863 	flags = &ss->if_drv_flags;
2864 #else
2865 	flags = &ifp->if_drv_flags;
2866 #endif
2867 	mtx_lock(&ss->tx.mtx);
2868 	if ((*flags) & IFF_DRV_OACTIVE &&
2869 	    tx->req - tx->done < (tx->mask + 1)/4) {
2870 		*(flags) &= ~IFF_DRV_OACTIVE;
2871 		ss->tx.wake++;
2872 		mxge_start_locked(ss);
2873 	}
2874 #ifdef IFNET_BUF_RING
2875 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2876 		/* let the NIC stop polling this queue, since there
2877 		 * are no more transmits pending */
2878 		if (tx->req == tx->done) {
2879 			*tx->send_stop = 1;
2880 			tx->queue_active = 0;
2881 			tx->deactivate++;
2882 			wmb();
2883 		}
2884 	}
2885 #endif
2886 	mtx_unlock(&ss->tx.mtx);
2887 
2888 }
2889 
2890 static struct mxge_media_type mxge_xfp_media_types[] =
2891 {
2892 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2893 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2894 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2895 	{0,		(1 << 5),	"10GBASE-ER"},
2896 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2897 	{0,		(1 << 3),	"10GBASE-SW"},
2898 	{0,		(1 << 2),	"10GBASE-LW"},
2899 	{0,		(1 << 1),	"10GBASE-EW"},
2900 	{0,		(1 << 0),	"Reserved"}
2901 };
2902 static struct mxge_media_type mxge_sfp_media_types[] =
2903 {
2904 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2905 	{0,		(1 << 7),	"Reserved"},
2906 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2907 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2908 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2909 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2910 };
2911 
2912 static void
2913 mxge_media_set(mxge_softc_t *sc, int media_type)
2914 {
2915 
2916 
2917 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2918 		    0, NULL);
2919 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2920 	sc->current_media = media_type;
2921 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2922 }
2923 
2924 static void
2925 mxge_media_init(mxge_softc_t *sc)
2926 {
2927 	char *ptr;
2928 	int i;
2929 
2930 	ifmedia_removeall(&sc->media);
2931 	mxge_media_set(sc, IFM_AUTO);
2932 
2933 	/*
2934 	 * parse the product code to deterimine the interface type
2935 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2936 	 * after the 3rd dash in the driver's cached copy of the
2937 	 * EEPROM's product code string.
2938 	 */
2939 	ptr = sc->product_code_string;
2940 	if (ptr == NULL) {
2941 		device_printf(sc->dev, "Missing product code\n");
2942 		return;
2943 	}
2944 
2945 	for (i = 0; i < 3; i++, ptr++) {
2946 		ptr = strchr(ptr, '-');
2947 		if (ptr == NULL) {
2948 			device_printf(sc->dev,
2949 				      "only %d dashes in PC?!?\n", i);
2950 			return;
2951 		}
2952 	}
2953 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2954 		/* -C is CX4 */
2955 		sc->connector = MXGE_CX4;
2956 		mxge_media_set(sc, IFM_10G_CX4);
2957 	} else if (*ptr == 'Q') {
2958 		/* -Q is Quad Ribbon Fiber */
2959 		sc->connector = MXGE_QRF;
2960 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2961 		/* FreeBSD has no media type for Quad ribbon fiber */
2962 	} else if (*ptr == 'R') {
2963 		/* -R is XFP */
2964 		sc->connector = MXGE_XFP;
2965 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2966 		/* -S or -2S is SFP+ */
2967 		sc->connector = MXGE_SFP;
2968 	} else {
2969 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2970 	}
2971 }
2972 
2973 /*
2974  * Determine the media type for a NIC.  Some XFPs will identify
2975  * themselves only when their link is up, so this is initiated via a
2976  * link up interrupt.  However, this can potentially take up to
2977  * several milliseconds, so it is run via the watchdog routine, rather
2978  * than in the interrupt handler itself.
2979  */
2980 static void
2981 mxge_media_probe(mxge_softc_t *sc)
2982 {
2983 	mxge_cmd_t cmd;
2984 	char *cage_type;
2985 
2986 	struct mxge_media_type *mxge_media_types = NULL;
2987 	int i, err, ms, mxge_media_type_entries;
2988 	uint32_t byte;
2989 
2990 	sc->need_media_probe = 0;
2991 
2992 	if (sc->connector == MXGE_XFP) {
2993 		/* -R is XFP */
2994 		mxge_media_types = mxge_xfp_media_types;
2995 		mxge_media_type_entries =
2996 			nitems(mxge_xfp_media_types);
2997 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2998 		cage_type = "XFP";
2999 	} else 	if (sc->connector == MXGE_SFP) {
3000 		/* -S or -2S is SFP+ */
3001 		mxge_media_types = mxge_sfp_media_types;
3002 		mxge_media_type_entries =
3003 			nitems(mxge_sfp_media_types);
3004 		cage_type = "SFP+";
3005 		byte = 3;
3006 	} else {
3007 		/* nothing to do; media type cannot change */
3008 		return;
3009 	}
3010 
3011 	/*
3012 	 * At this point we know the NIC has an XFP cage, so now we
3013 	 * try to determine what is in the cage by using the
3014 	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
3015 	 * register.  We read just one byte, which may take over
3016 	 * a millisecond
3017 	 */
3018 
3019 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3020 	cmd.data1 = byte;
3021 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3022 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3023 		device_printf(sc->dev, "failed to read XFP\n");
3024 	}
3025 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3026 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3027 	}
3028 	if (err != MXGEFW_CMD_OK) {
3029 		return;
3030 	}
3031 
3032 	/* now we wait for the data to be cached */
3033 	cmd.data0 = byte;
3034 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3035 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3036 		DELAY(1000);
3037 		cmd.data0 = byte;
3038 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3039 	}
3040 	if (err != MXGEFW_CMD_OK) {
3041 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3042 			      cage_type, err, ms);
3043 		return;
3044 	}
3045 
3046 	if (cmd.data0 == mxge_media_types[0].bitmask) {
3047 		if (mxge_verbose)
3048 			device_printf(sc->dev, "%s:%s\n", cage_type,
3049 				      mxge_media_types[0].name);
3050 		if (sc->current_media != mxge_media_types[0].flag) {
3051 			mxge_media_init(sc);
3052 			mxge_media_set(sc, mxge_media_types[0].flag);
3053 		}
3054 		return;
3055 	}
3056 	for (i = 1; i < mxge_media_type_entries; i++) {
3057 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3058 			if (mxge_verbose)
3059 				device_printf(sc->dev, "%s:%s\n",
3060 					      cage_type,
3061 					      mxge_media_types[i].name);
3062 
3063 			if (sc->current_media != mxge_media_types[i].flag) {
3064 				mxge_media_init(sc);
3065 				mxge_media_set(sc, mxge_media_types[i].flag);
3066 			}
3067 			return;
3068 		}
3069 	}
3070 	if (mxge_verbose)
3071 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3072 			      cage_type, cmd.data0);
3073 
3074 	return;
3075 }
3076 
/*
 * Interrupt handler for one slice.
 *
 * arg is the slice's struct mxge_slice_state.  The handler processes
 * transmit completions and received frames until the stats block's
 * valid byte reads as zero, handles link / error statistics on the
 * first slice, and finally writes to the slice's irq_claim words.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	/* keep a copy; stats->valid itself is cleared below */
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			/* re-read: more completions may have arrived */
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* ask for the media type to be probed again */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	/* note: the second claim word is written unconditionally */
	*(ss->irq_claim + 1) = be32toh(3);
}
3165 
3166 static void
3167 mxge_init(void *arg)
3168 {
3169 	mxge_softc_t *sc = arg;
3170 	struct ifnet *ifp = sc->ifp;
3171 
3172 
3173 	mtx_lock(&sc->driver_mtx);
3174 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3175 		(void) mxge_open(sc);
3176 	mtx_unlock(&sc->driver_mtx);
3177 }
3178 
3179 
3180 
3181 static void
3182 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3183 {
3184 	int i;
3185 
3186 #if defined(INET) || defined(INET6)
3187 	tcp_lro_free(&ss->lc);
3188 #endif
3189 	for (i = 0; i <= ss->rx_big.mask; i++) {
3190 		if (ss->rx_big.info[i].m == NULL)
3191 			continue;
3192 		bus_dmamap_unload(ss->rx_big.dmat,
3193 				  ss->rx_big.info[i].map);
3194 		m_freem(ss->rx_big.info[i].m);
3195 		ss->rx_big.info[i].m = NULL;
3196 	}
3197 
3198 	for (i = 0; i <= ss->rx_small.mask; i++) {
3199 		if (ss->rx_small.info[i].m == NULL)
3200 			continue;
3201 		bus_dmamap_unload(ss->rx_small.dmat,
3202 				  ss->rx_small.info[i].map);
3203 		m_freem(ss->rx_small.info[i].m);
3204 		ss->rx_small.info[i].m = NULL;
3205 	}
3206 
3207 	/* transmit ring used only on the first slice */
3208 	if (ss->tx.info == NULL)
3209 		return;
3210 
3211 	for (i = 0; i <= ss->tx.mask; i++) {
3212 		ss->tx.info[i].flag = 0;
3213 		if (ss->tx.info[i].m == NULL)
3214 			continue;
3215 		bus_dmamap_unload(ss->tx.dmat,
3216 				  ss->tx.info[i].map);
3217 		m_freem(ss->tx.info[i].m);
3218 		ss->tx.info[i].m = NULL;
3219 	}
3220 }
3221 
3222 static void
3223 mxge_free_mbufs(mxge_softc_t *sc)
3224 {
3225 	int slice;
3226 
3227 	for (slice = 0; slice < sc->num_slices; slice++)
3228 		mxge_free_slice_mbufs(&sc->ss[slice]);
3229 }
3230 
3231 static void
3232 mxge_free_slice_rings(struct mxge_slice_state *ss)
3233 {
3234 	int i;
3235 
3236 
3237 	if (ss->rx_done.entry != NULL)
3238 		mxge_dma_free(&ss->rx_done.dma);
3239 	ss->rx_done.entry = NULL;
3240 
3241 	if (ss->tx.req_bytes != NULL)
3242 		free(ss->tx.req_bytes, M_DEVBUF);
3243 	ss->tx.req_bytes = NULL;
3244 
3245 	if (ss->tx.seg_list != NULL)
3246 		free(ss->tx.seg_list, M_DEVBUF);
3247 	ss->tx.seg_list = NULL;
3248 
3249 	if (ss->rx_small.shadow != NULL)
3250 		free(ss->rx_small.shadow, M_DEVBUF);
3251 	ss->rx_small.shadow = NULL;
3252 
3253 	if (ss->rx_big.shadow != NULL)
3254 		free(ss->rx_big.shadow, M_DEVBUF);
3255 	ss->rx_big.shadow = NULL;
3256 
3257 	if (ss->tx.info != NULL) {
3258 		if (ss->tx.dmat != NULL) {
3259 			for (i = 0; i <= ss->tx.mask; i++) {
3260 				bus_dmamap_destroy(ss->tx.dmat,
3261 						   ss->tx.info[i].map);
3262 			}
3263 			bus_dma_tag_destroy(ss->tx.dmat);
3264 		}
3265 		free(ss->tx.info, M_DEVBUF);
3266 	}
3267 	ss->tx.info = NULL;
3268 
3269 	if (ss->rx_small.info != NULL) {
3270 		if (ss->rx_small.dmat != NULL) {
3271 			for (i = 0; i <= ss->rx_small.mask; i++) {
3272 				bus_dmamap_destroy(ss->rx_small.dmat,
3273 						   ss->rx_small.info[i].map);
3274 			}
3275 			bus_dmamap_destroy(ss->rx_small.dmat,
3276 					   ss->rx_small.extra_map);
3277 			bus_dma_tag_destroy(ss->rx_small.dmat);
3278 		}
3279 		free(ss->rx_small.info, M_DEVBUF);
3280 	}
3281 	ss->rx_small.info = NULL;
3282 
3283 	if (ss->rx_big.info != NULL) {
3284 		if (ss->rx_big.dmat != NULL) {
3285 			for (i = 0; i <= ss->rx_big.mask; i++) {
3286 				bus_dmamap_destroy(ss->rx_big.dmat,
3287 						   ss->rx_big.info[i].map);
3288 			}
3289 			bus_dmamap_destroy(ss->rx_big.dmat,
3290 					   ss->rx_big.extra_map);
3291 			bus_dma_tag_destroy(ss->rx_big.dmat);
3292 		}
3293 		free(ss->rx_big.info, M_DEVBUF);
3294 	}
3295 	ss->rx_big.info = NULL;
3296 }
3297 
3298 static void
3299 mxge_free_rings(mxge_softc_t *sc)
3300 {
3301 	int slice;
3302 
3303 	for (slice = 0; slice < sc->num_slices; slice++)
3304 		mxge_free_slice_rings(&sc->ss[slice]);
3305 }
3306 
3307 static int
3308 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3309 		       int tx_ring_entries)
3310 {
3311 	mxge_softc_t *sc = ss->sc;
3312 	size_t bytes;
3313 	int err, i;
3314 
3315 	/* allocate per-slice receive resources */
3316 
3317 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3318 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3319 
3320 	/* allocate the rx shadow rings */
3321 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3322 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3323 
3324 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3325 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3326 
3327 	/* allocate the rx host info rings */
3328 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3329 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3330 
3331 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3332 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3333 
3334 	/* allocate the rx busdma resources */
3335 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3336 				 1,			/* alignment */
3337 				 4096,			/* boundary */
3338 				 BUS_SPACE_MAXADDR,	/* low */
3339 				 BUS_SPACE_MAXADDR,	/* high */
3340 				 NULL, NULL,		/* filter */
3341 				 MHLEN,			/* maxsize */
3342 				 1,			/* num segs */
3343 				 MHLEN,			/* maxsegsize */
3344 				 BUS_DMA_ALLOCNOW,	/* flags */
3345 				 NULL, NULL,		/* lock */
3346 				 &ss->rx_small.dmat);	/* tag */
3347 	if (err != 0) {
3348 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3349 			      err);
3350 		return err;
3351 	}
3352 
3353 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3354 				 1,			/* alignment */
3355 #if MXGE_VIRT_JUMBOS
3356 				 4096,			/* boundary */
3357 #else
3358 				 0,			/* boundary */
3359 #endif
3360 				 BUS_SPACE_MAXADDR,	/* low */
3361 				 BUS_SPACE_MAXADDR,	/* high */
3362 				 NULL, NULL,		/* filter */
3363 				 3*4096,		/* maxsize */
3364 #if MXGE_VIRT_JUMBOS
3365 				 3,			/* num segs */
3366 				 4096,			/* maxsegsize*/
3367 #else
3368 				 1,			/* num segs */
3369 				 MJUM9BYTES,		/* maxsegsize*/
3370 #endif
3371 				 BUS_DMA_ALLOCNOW,	/* flags */
3372 				 NULL, NULL,		/* lock */
3373 				 &ss->rx_big.dmat);	/* tag */
3374 	if (err != 0) {
3375 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3376 			      err);
3377 		return err;
3378 	}
3379 	for (i = 0; i <= ss->rx_small.mask; i++) {
3380 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3381 					&ss->rx_small.info[i].map);
3382 		if (err != 0) {
3383 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3384 				      err);
3385 			return err;
3386 		}
3387 	}
3388 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3389 				&ss->rx_small.extra_map);
3390 	if (err != 0) {
3391 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3392 			      err);
3393 		return err;
3394 	}
3395 
3396 	for (i = 0; i <= ss->rx_big.mask; i++) {
3397 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3398 					&ss->rx_big.info[i].map);
3399 		if (err != 0) {
3400 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3401 				      err);
3402 			return err;
3403 		}
3404 	}
3405 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3406 				&ss->rx_big.extra_map);
3407 	if (err != 0) {
3408 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3409 			      err);
3410 		return err;
3411 	}
3412 
3413 	/* now allocate TX resources */
3414 
3415 #ifndef IFNET_BUF_RING
3416 	/* only use a single TX ring for now */
3417 	if (ss != ss->sc->ss)
3418 		return 0;
3419 #endif
3420 
3421 	ss->tx.mask = tx_ring_entries - 1;
3422 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3423 
3424 
3425 	/* allocate the tx request copy block */
3426 	bytes = 8 +
3427 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3428 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3429 	/* ensure req_list entries are aligned to 8 bytes */
3430 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3431 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3432 
3433 	/* allocate the tx busdma segment list */
3434 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3435 	ss->tx.seg_list = (bus_dma_segment_t *)
3436 		malloc(bytes, M_DEVBUF, M_WAITOK);
3437 
3438 	/* allocate the tx host info ring */
3439 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3440 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3441 
3442 	/* allocate the tx busdma resources */
3443 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3444 				 1,			/* alignment */
3445 				 sc->tx_boundary,	/* boundary */
3446 				 BUS_SPACE_MAXADDR,	/* low */
3447 				 BUS_SPACE_MAXADDR,	/* high */
3448 				 NULL, NULL,		/* filter */
3449 				 65536 + 256,		/* maxsize */
3450 				 ss->tx.max_desc - 2,	/* num segs */
3451 				 sc->tx_boundary,	/* maxsegsz */
3452 				 BUS_DMA_ALLOCNOW,	/* flags */
3453 				 NULL, NULL,		/* lock */
3454 				 &ss->tx.dmat);		/* tag */
3455 
3456 	if (err != 0) {
3457 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3458 			      err);
3459 		return err;
3460 	}
3461 
3462 	/* now use these tags to setup dmamaps for each slot
3463 	   in the ring */
3464 	for (i = 0; i <= ss->tx.mask; i++) {
3465 		err = bus_dmamap_create(ss->tx.dmat, 0,
3466 					&ss->tx.info[i].map);
3467 		if (err != 0) {
3468 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3469 				      err);
3470 			return err;
3471 		}
3472 	}
3473 	return 0;
3474 
3475 }
3476 
3477 static int
3478 mxge_alloc_rings(mxge_softc_t *sc)
3479 {
3480 	mxge_cmd_t cmd;
3481 	int tx_ring_size;
3482 	int tx_ring_entries, rx_ring_entries;
3483 	int err, slice;
3484 
3485 	/* get ring sizes */
3486 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3487 	tx_ring_size = cmd.data0;
3488 	if (err != 0) {
3489 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3490 		goto abort;
3491 	}
3492 
3493 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3494 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3495 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3496 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3497 	IFQ_SET_READY(&sc->ifp->if_snd);
3498 
3499 	for (slice = 0; slice < sc->num_slices; slice++) {
3500 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3501 					     rx_ring_entries,
3502 					     tx_ring_entries);
3503 		if (err != 0)
3504 			goto abort;
3505 	}
3506 	return 0;
3507 
3508 abort:
3509 	mxge_free_rings(sc);
3510 	return err;
3511 
3512 }
3513 
3514 
3515 static void
3516 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3517 {
3518 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3519 
3520 	if (bufsize < MCLBYTES) {
3521 		/* easy, everything fits in a single buffer */
3522 		*big_buf_size = MCLBYTES;
3523 		*cl_size = MCLBYTES;
3524 		*nbufs = 1;
3525 		return;
3526 	}
3527 
3528 	if (bufsize < MJUMPAGESIZE) {
3529 		/* still easy, everything still fits in a single buffer */
3530 		*big_buf_size = MJUMPAGESIZE;
3531 		*cl_size = MJUMPAGESIZE;
3532 		*nbufs = 1;
3533 		return;
3534 	}
3535 #if MXGE_VIRT_JUMBOS
3536 	/* now we need to use virtually contiguous buffers */
3537 	*cl_size = MJUM9BYTES;
3538 	*big_buf_size = 4096;
3539 	*nbufs = mtu / 4096 + 1;
3540 	/* needs to be a power of two, so round up */
3541 	if (*nbufs == 3)
3542 		*nbufs = 4;
3543 #else
3544 	*cl_size = MJUM9BYTES;
3545 	*big_buf_size = MJUM9BYTES;
3546 	*nbufs = 1;
3547 #endif
3548 }
3549 
3550 static int
3551 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3552 {
3553 	mxge_softc_t *sc;
3554 	mxge_cmd_t cmd;
3555 	bus_dmamap_t map;
3556 	int err, i, slice;
3557 
3558 
3559 	sc = ss->sc;
3560 	slice = ss - sc->ss;
3561 
3562 #if defined(INET) || defined(INET6)
3563 	(void)tcp_lro_init(&ss->lc);
3564 #endif
3565 	ss->lc.ifp = sc->ifp;
3566 
3567 	/* get the lanai pointers to the send and receive rings */
3568 
3569 	err = 0;
3570 #ifndef IFNET_BUF_RING
3571 	/* We currently only send from the first slice */
3572 	if (slice == 0) {
3573 #endif
3574 		cmd.data0 = slice;
3575 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3576 		ss->tx.lanai =
3577 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3578 		ss->tx.send_go = (volatile uint32_t *)
3579 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3580 		ss->tx.send_stop = (volatile uint32_t *)
3581 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3582 #ifndef IFNET_BUF_RING
3583 	}
3584 #endif
3585 	cmd.data0 = slice;
3586 	err |= mxge_send_cmd(sc,
3587 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3588 	ss->rx_small.lanai =
3589 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3590 	cmd.data0 = slice;
3591 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3592 	ss->rx_big.lanai =
3593 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3594 
3595 	if (err != 0) {
3596 		device_printf(sc->dev,
3597 			      "failed to get ring sizes or locations\n");
3598 		return EIO;
3599 	}
3600 
3601 	/* stock receive rings */
3602 	for (i = 0; i <= ss->rx_small.mask; i++) {
3603 		map = ss->rx_small.info[i].map;
3604 		err = mxge_get_buf_small(ss, map, i);
3605 		if (err) {
3606 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3607 				      i, ss->rx_small.mask + 1);
3608 			return ENOMEM;
3609 		}
3610 	}
3611 	for (i = 0; i <= ss->rx_big.mask; i++) {
3612 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3613 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3614 	}
3615 	ss->rx_big.nbufs = nbufs;
3616 	ss->rx_big.cl_size = cl_size;
3617 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3618 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3619 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3620 		map = ss->rx_big.info[i].map;
3621 		err = mxge_get_buf_big(ss, map, i);
3622 		if (err) {
3623 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3624 				      i, ss->rx_big.mask + 1);
3625 			return ENOMEM;
3626 		}
3627 	}
3628 	return 0;
3629 }
3630 
3631 static int
3632 mxge_open(mxge_softc_t *sc)
3633 {
3634 	mxge_cmd_t cmd;
3635 	int err, big_bytes, nbufs, slice, cl_size, i;
3636 	bus_addr_t bus;
3637 	volatile uint8_t *itable;
3638 	struct mxge_slice_state *ss;
3639 
3640 	/* Copy the MAC address in case it was overridden */
3641 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3642 
3643 	err = mxge_reset(sc, 1);
3644 	if (err != 0) {
3645 		device_printf(sc->dev, "failed to reset\n");
3646 		return EIO;
3647 	}
3648 
3649 	if (sc->num_slices > 1) {
3650 		/* setup the indirection table */
3651 		cmd.data0 = sc->num_slices;
3652 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3653 				    &cmd);
3654 
3655 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3656 				     &cmd);
3657 		if (err != 0) {
3658 			device_printf(sc->dev,
3659 				      "failed to setup rss tables\n");
3660 			return err;
3661 		}
3662 
3663 		/* just enable an identity mapping */
3664 		itable = sc->sram + cmd.data0;
3665 		for (i = 0; i < sc->num_slices; i++)
3666 			itable[i] = (uint8_t)i;
3667 
3668 		cmd.data0 = 1;
3669 		cmd.data1 = mxge_rss_hash_type;
3670 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3671 		if (err != 0) {
3672 			device_printf(sc->dev, "failed to enable slices\n");
3673 			return err;
3674 		}
3675 	}
3676 
3677 
3678 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3679 
3680 	cmd.data0 = nbufs;
3681 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3682 			    &cmd);
3683 	/* error is only meaningful if we're trying to set
3684 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3685 	if (err && nbufs > 1) {
3686 		device_printf(sc->dev,
3687 			      "Failed to set alway-use-n to %d\n",
3688 			      nbufs);
3689 		return EIO;
3690 	}
3691 	/* Give the firmware the mtu and the big and small buffer
3692 	   sizes.  The firmware wants the big buf size to be a power
3693 	   of two. Luckily, FreeBSD's clusters are powers of two */
3694 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3695 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3696 	cmd.data0 = MHLEN - MXGEFW_PAD;
3697 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3698 			     &cmd);
3699 	cmd.data0 = big_bytes;
3700 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3701 
3702 	if (err != 0) {
3703 		device_printf(sc->dev, "failed to setup params\n");
3704 		goto abort;
3705 	}
3706 
3707 	/* Now give him the pointer to the stats block */
3708 	for (slice = 0;
3709 #ifdef IFNET_BUF_RING
3710 	     slice < sc->num_slices;
3711 #else
3712 	     slice < 1;
3713 #endif
3714 	     slice++) {
3715 		ss = &sc->ss[slice];
3716 		cmd.data0 =
3717 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3718 		cmd.data1 =
3719 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3720 		cmd.data2 = sizeof(struct mcp_irq_data);
3721 		cmd.data2 |= (slice << 16);
3722 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3723 	}
3724 
3725 	if (err != 0) {
3726 		bus = sc->ss->fw_stats_dma.bus_addr;
3727 		bus += offsetof(struct mcp_irq_data, send_done_count);
3728 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3729 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3730 		err = mxge_send_cmd(sc,
3731 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3732 				    &cmd);
3733 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3734 		sc->fw_multicast_support = 0;
3735 	} else {
3736 		sc->fw_multicast_support = 1;
3737 	}
3738 
3739 	if (err != 0) {
3740 		device_printf(sc->dev, "failed to setup params\n");
3741 		goto abort;
3742 	}
3743 
3744 	for (slice = 0; slice < sc->num_slices; slice++) {
3745 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3746 		if (err != 0) {
3747 			device_printf(sc->dev, "couldn't open slice %d\n",
3748 				      slice);
3749 			goto abort;
3750 		}
3751 	}
3752 
3753 	/* Finally, start the firmware running */
3754 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3755 	if (err) {
3756 		device_printf(sc->dev, "Couldn't bring up link\n");
3757 		goto abort;
3758 	}
3759 #ifdef IFNET_BUF_RING
3760 	for (slice = 0; slice < sc->num_slices; slice++) {
3761 		ss = &sc->ss[slice];
3762 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3763 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3764 	}
3765 #endif
3766 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3767 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3768 
3769 	return 0;
3770 
3771 
3772 abort:
3773 	mxge_free_mbufs(sc);
3774 
3775 	return err;
3776 }
3777 
3778 static int
3779 mxge_close(mxge_softc_t *sc, int down)
3780 {
3781 	mxge_cmd_t cmd;
3782 	int err, old_down_cnt;
3783 #ifdef IFNET_BUF_RING
3784 	struct mxge_slice_state *ss;
3785 	int slice;
3786 #endif
3787 
3788 #ifdef IFNET_BUF_RING
3789 	for (slice = 0; slice < sc->num_slices; slice++) {
3790 		ss = &sc->ss[slice];
3791 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3792 	}
3793 #endif
3794 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3795 	if (!down) {
3796 		old_down_cnt = sc->down_cnt;
3797 		wmb();
3798 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3799 		if (err) {
3800 			device_printf(sc->dev,
3801 				      "Couldn't bring down link\n");
3802 		}
3803 		if (old_down_cnt == sc->down_cnt) {
3804 			/* wait for down irq */
3805 			DELAY(10 * sc->intr_coal_delay);
3806 		}
3807 		wmb();
3808 		if (old_down_cnt == sc->down_cnt) {
3809 			device_printf(sc->dev, "never got down irq\n");
3810 		}
3811 	}
3812 	mxge_free_mbufs(sc);
3813 
3814 	return 0;
3815 }
3816 
3817 static void
3818 mxge_setup_cfg_space(mxge_softc_t *sc)
3819 {
3820 	device_t dev = sc->dev;
3821 	int reg;
3822 	uint16_t lnk, pectl;
3823 
3824 	/* find the PCIe link width and set max read request to 4KB*/
3825 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3826 		lnk = pci_read_config(dev, reg + 0x12, 2);
3827 		sc->link_width = (lnk >> 4) & 0x3f;
3828 
3829 		if (sc->pectl == 0) {
3830 			pectl = pci_read_config(dev, reg + 0x8, 2);
3831 			pectl = (pectl & ~0x7000) | (5 << 12);
3832 			pci_write_config(dev, reg + 0x8, pectl, 2);
3833 			sc->pectl = pectl;
3834 		} else {
3835 			/* restore saved pectl after watchdog reset */
3836 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3837 		}
3838 	}
3839 
3840 	/* Enable DMA and Memory space access */
3841 	pci_enable_busmaster(dev);
3842 }
3843 
3844 static uint32_t
3845 mxge_read_reboot(mxge_softc_t *sc)
3846 {
3847 	device_t dev = sc->dev;
3848 	uint32_t vs;
3849 
3850 	/* find the vendor specific offset */
3851 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3852 		device_printf(sc->dev,
3853 			      "could not find vendor specific offset\n");
3854 		return (uint32_t)-1;
3855 	}
3856 	/* enable read32 mode */
3857 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3858 	/* tell NIC which register to read */
3859 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3860 	return (pci_read_config(dev, vs + 0x14, 4));
3861 }
3862 
3863 static void
3864 mxge_watchdog_reset(mxge_softc_t *sc)
3865 {
3866 	struct pci_devinfo *dinfo;
3867 	struct mxge_slice_state *ss;
3868 	int err, running, s, num_tx_slices = 1;
3869 	uint32_t reboot;
3870 	uint16_t cmd;
3871 
3872 	err = ENXIO;
3873 
3874 	device_printf(sc->dev, "Watchdog reset!\n");
3875 
3876 	/*
3877 	 * check to see if the NIC rebooted.  If it did, then all of
3878 	 * PCI config space has been reset, and things like the
3879 	 * busmaster bit will be zero.  If this is the case, then we
3880 	 * must restore PCI config space before the NIC can be used
3881 	 * again
3882 	 */
3883 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3884 	if (cmd == 0xffff) {
3885 		/*
3886 		 * maybe the watchdog caught the NIC rebooting; wait
3887 		 * up to 100ms for it to finish.  If it does not come
3888 		 * back, then give up
3889 		 */
3890 		DELAY(1000*100);
3891 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3892 		if (cmd == 0xffff) {
3893 			device_printf(sc->dev, "NIC disappeared!\n");
3894 		}
3895 	}
3896 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3897 		/* print the reboot status */
3898 		reboot = mxge_read_reboot(sc);
3899 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3900 			      reboot);
3901 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3902 		if (running) {
3903 
3904 			/*
3905 			 * quiesce NIC so that TX routines will not try to
3906 			 * xmit after restoration of BAR
3907 			 */
3908 
3909 			/* Mark the link as down */
3910 			if (sc->link_state) {
3911 				sc->link_state = 0;
3912 				if_link_state_change(sc->ifp,
3913 						     LINK_STATE_DOWN);
3914 			}
3915 #ifdef IFNET_BUF_RING
3916 			num_tx_slices = sc->num_slices;
3917 #endif
3918 			/* grab all TX locks to ensure no tx  */
3919 			for (s = 0; s < num_tx_slices; s++) {
3920 				ss = &sc->ss[s];
3921 				mtx_lock(&ss->tx.mtx);
3922 			}
3923 			mxge_close(sc, 1);
3924 		}
3925 		/* restore PCI configuration space */
3926 		dinfo = device_get_ivars(sc->dev);
3927 		pci_cfg_restore(sc->dev, dinfo);
3928 
3929 		/* and redo any changes we made to our config space */
3930 		mxge_setup_cfg_space(sc);
3931 
3932 		/* reload f/w */
3933 		err = mxge_load_firmware(sc, 0);
3934 		if (err) {
3935 			device_printf(sc->dev,
3936 				      "Unable to re-load f/w\n");
3937 		}
3938 		if (running) {
3939 			if (!err)
3940 				err = mxge_open(sc);
3941 			/* release all TX locks */
3942 			for (s = 0; s < num_tx_slices; s++) {
3943 				ss = &sc->ss[s];
3944 #ifdef IFNET_BUF_RING
3945 				mxge_start_locked(ss);
3946 #endif
3947 				mtx_unlock(&ss->tx.mtx);
3948 			}
3949 		}
3950 		sc->watchdog_resets++;
3951 	} else {
3952 		device_printf(sc->dev,
3953 			      "NIC did not reboot, not resetting\n");
3954 		err = 0;
3955 	}
3956 	if (err) {
3957 		device_printf(sc->dev, "watchdog reset failed\n");
3958 	} else {
3959 		if (sc->dying == 2)
3960 			sc->dying = 0;
3961 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3962 	}
3963 }
3964 
3965 static void
3966 mxge_watchdog_task(void *arg, int pending)
3967 {
3968 	mxge_softc_t *sc = arg;
3969 
3970 
3971 	mtx_lock(&sc->driver_mtx);
3972 	mxge_watchdog_reset(sc);
3973 	mtx_unlock(&sc->driver_mtx);
3974 }
3975 
3976 static void
3977 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3978 {
3979 	tx = &sc->ss[slice].tx;
3980 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3981 	device_printf(sc->dev,
3982 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3983 		      tx->req, tx->done, tx->queue_active);
3984 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3985 			      tx->activate, tx->deactivate);
3986 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3987 		      tx->pkt_done,
3988 		      be32toh(sc->ss->fw_stats->send_done_count));
3989 }
3990 
3991 static int
3992 mxge_watchdog(mxge_softc_t *sc)
3993 {
3994 	mxge_tx_ring_t *tx;
3995 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3996 	int i, err = 0;
3997 
3998 	/* see if we have outstanding transmits, which
3999 	   have been pending for more than mxge_ticks */
4000 	for (i = 0;
4001 #ifdef IFNET_BUF_RING
4002 	     (i < sc->num_slices) && (err == 0);
4003 #else
4004 	     (i < 1) && (err == 0);
4005 #endif
4006 	     i++) {
4007 		tx = &sc->ss[i].tx;
4008 		if (tx->req != tx->done &&
4009 		    tx->watchdog_req != tx->watchdog_done &&
4010 		    tx->done == tx->watchdog_done) {
4011 			/* check for pause blocking before resetting */
4012 			if (tx->watchdog_rx_pause == rx_pause) {
4013 				mxge_warn_stuck(sc, tx, i);
4014 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4015 				return (ENXIO);
4016 			}
4017 			else
4018 				device_printf(sc->dev, "Flow control blocking "
4019 					      "xmits, check link partner\n");
4020 		}
4021 
4022 		tx->watchdog_req = tx->req;
4023 		tx->watchdog_done = tx->done;
4024 		tx->watchdog_rx_pause = rx_pause;
4025 	}
4026 
4027 	if (sc->need_media_probe)
4028 		mxge_media_probe(sc);
4029 	return (err);
4030 }
4031 
4032 static uint64_t
4033 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4034 {
4035 	struct mxge_softc *sc;
4036 	uint64_t rv;
4037 
4038 	sc = if_getsoftc(ifp);
4039 	rv = 0;
4040 
4041 	switch (cnt) {
4042 	case IFCOUNTER_IPACKETS:
4043 		for (int s = 0; s < sc->num_slices; s++)
4044 			rv += sc->ss[s].ipackets;
4045 		return (rv);
4046 	case IFCOUNTER_OPACKETS:
4047 		for (int s = 0; s < sc->num_slices; s++)
4048 			rv += sc->ss[s].opackets;
4049 		return (rv);
4050 	case IFCOUNTER_OERRORS:
4051 		for (int s = 0; s < sc->num_slices; s++)
4052 			rv += sc->ss[s].oerrors;
4053 		return (rv);
4054 #ifdef IFNET_BUF_RING
4055 	case IFCOUNTER_OBYTES:
4056 		for (int s = 0; s < sc->num_slices; s++)
4057 			rv += sc->ss[s].obytes;
4058 		return (rv);
4059 	case IFCOUNTER_OMCASTS:
4060 		for (int s = 0; s < sc->num_slices; s++)
4061 			rv += sc->ss[s].omcasts;
4062 		return (rv);
4063 	case IFCOUNTER_OQDROPS:
4064 		for (int s = 0; s < sc->num_slices; s++)
4065 			rv += sc->ss[s].tx.br->br_drops;
4066 		return (rv);
4067 #endif
4068 	default:
4069 		return (if_get_counter_default(ifp, cnt));
4070 	}
4071 }
4072 
4073 static void
4074 mxge_tick(void *arg)
4075 {
4076 	mxge_softc_t *sc = arg;
4077 	u_long pkts = 0;
4078 	int err = 0;
4079 	int running, ticks;
4080 	uint16_t cmd;
4081 
4082 	ticks = mxge_ticks;
4083 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4084 	if (running) {
4085 		if (!sc->watchdog_countdown) {
4086 			err = mxge_watchdog(sc);
4087 			sc->watchdog_countdown = 4;
4088 		}
4089 		sc->watchdog_countdown--;
4090 	}
4091 	if (pkts == 0) {
4092 		/* ensure NIC did not suffer h/w fault while idle */
4093 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4094 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4095 			sc->dying = 2;
4096 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4097 			err = ENXIO;
4098 		}
4099 		/* look less often if NIC is idle */
4100 		ticks *= 4;
4101 	}
4102 
4103 	if (err == 0)
4104 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4105 
4106 }
4107 
4108 static int
4109 mxge_media_change(struct ifnet *ifp)
4110 {
4111 	return EINVAL;
4112 }
4113 
4114 static int
4115 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4116 {
4117 	struct ifnet *ifp = sc->ifp;
4118 	int real_mtu, old_mtu;
4119 	int err = 0;
4120 
4121 
4122 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4123 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4124 		return EINVAL;
4125 	mtx_lock(&sc->driver_mtx);
4126 	old_mtu = ifp->if_mtu;
4127 	ifp->if_mtu = mtu;
4128 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4129 		mxge_close(sc, 0);
4130 		err = mxge_open(sc);
4131 		if (err != 0) {
4132 			ifp->if_mtu = old_mtu;
4133 			mxge_close(sc, 0);
4134 			(void) mxge_open(sc);
4135 		}
4136 	}
4137 	mtx_unlock(&sc->driver_mtx);
4138 	return err;
4139 }
4140 
4141 static void
4142 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4143 {
4144 	mxge_softc_t *sc = ifp->if_softc;
4145 
4146 
4147 	if (sc == NULL)
4148 		return;
4149 	ifmr->ifm_status = IFM_AVALID;
4150 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4151 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4152 	ifmr->ifm_active |= sc->current_media;
4153 }
4154 
4155 static int
4156 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4157 {
4158 	mxge_softc_t *sc = ifp->if_softc;
4159 	struct ifreq *ifr = (struct ifreq *)data;
4160 	int err, mask;
4161 
4162 	err = 0;
4163 	switch (command) {
4164 	case SIOCSIFADDR:
4165 	case SIOCGIFADDR:
4166 		err = ether_ioctl(ifp, command, data);
4167 		break;
4168 
4169 	case SIOCSIFMTU:
4170 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4171 		break;
4172 
4173 	case SIOCSIFFLAGS:
4174 		mtx_lock(&sc->driver_mtx);
4175 		if (sc->dying) {
4176 			mtx_unlock(&sc->driver_mtx);
4177 			return EINVAL;
4178 		}
4179 		if (ifp->if_flags & IFF_UP) {
4180 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4181 				err = mxge_open(sc);
4182 			} else {
4183 				/* take care of promis can allmulti
4184 				   flag chages */
4185 				mxge_change_promisc(sc,
4186 						    ifp->if_flags & IFF_PROMISC);
4187 				mxge_set_multicast_list(sc);
4188 			}
4189 		} else {
4190 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4191 				mxge_close(sc, 0);
4192 			}
4193 		}
4194 		mtx_unlock(&sc->driver_mtx);
4195 		break;
4196 
4197 	case SIOCADDMULTI:
4198 	case SIOCDELMULTI:
4199 		mtx_lock(&sc->driver_mtx);
4200 		mxge_set_multicast_list(sc);
4201 		mtx_unlock(&sc->driver_mtx);
4202 		break;
4203 
4204 	case SIOCSIFCAP:
4205 		mtx_lock(&sc->driver_mtx);
4206 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4207 		if (mask & IFCAP_TXCSUM) {
4208 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4209 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4210 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4211 			} else {
4212 				ifp->if_capenable |= IFCAP_TXCSUM;
4213 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4214 			}
4215 		} else if (mask & IFCAP_RXCSUM) {
4216 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4217 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4218 			} else {
4219 				ifp->if_capenable |= IFCAP_RXCSUM;
4220 			}
4221 		}
4222 		if (mask & IFCAP_TSO4) {
4223 			if (IFCAP_TSO4 & ifp->if_capenable) {
4224 				ifp->if_capenable &= ~IFCAP_TSO4;
4225 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4226 				ifp->if_capenable |= IFCAP_TSO4;
4227 				ifp->if_hwassist |= CSUM_TSO;
4228 			} else {
4229 				printf("mxge requires tx checksum offload"
4230 				       " be enabled to use TSO\n");
4231 				err = EINVAL;
4232 			}
4233 		}
4234 #if IFCAP_TSO6
4235 		if (mask & IFCAP_TXCSUM_IPV6) {
4236 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4237 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4238 						       | IFCAP_TSO6);
4239 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4240 						      | CSUM_UDP);
4241 			} else {
4242 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4243 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4244 						     | CSUM_UDP_IPV6);
4245 			}
4246 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4247 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4248 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4249 			} else {
4250 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4251 			}
4252 		}
4253 		if (mask & IFCAP_TSO6) {
4254 			if (IFCAP_TSO6 & ifp->if_capenable) {
4255 				ifp->if_capenable &= ~IFCAP_TSO6;
4256 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4257 				ifp->if_capenable |= IFCAP_TSO6;
4258 				ifp->if_hwassist |= CSUM_TSO;
4259 			} else {
4260 				printf("mxge requires tx checksum offload"
4261 				       " be enabled to use TSO\n");
4262 				err = EINVAL;
4263 			}
4264 		}
4265 #endif /*IFCAP_TSO6 */
4266 
4267 		if (mask & IFCAP_LRO)
4268 			ifp->if_capenable ^= IFCAP_LRO;
4269 		if (mask & IFCAP_VLAN_HWTAGGING)
4270 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4271 		if (mask & IFCAP_VLAN_HWTSO)
4272 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4273 
4274 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4275 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4276 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4277 
4278 		mtx_unlock(&sc->driver_mtx);
4279 		VLAN_CAPABILITIES(ifp);
4280 
4281 		break;
4282 
4283 	case SIOCGIFMEDIA:
4284 		mtx_lock(&sc->driver_mtx);
4285 		mxge_media_probe(sc);
4286 		mtx_unlock(&sc->driver_mtx);
4287 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4288 				    &sc->media, command);
4289 		break;
4290 
4291 	default:
4292 		err = ENOTTY;
4293 	}
4294 	return err;
4295 }
4296 
4297 static void
4298 mxge_fetch_tunables(mxge_softc_t *sc)
4299 {
4300 
4301 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4302 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4303 			  &mxge_flow_control);
4304 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4305 			  &mxge_intr_coal_delay);
4306 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4307 			  &mxge_nvidia_ecrc_enable);
4308 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4309 			  &mxge_force_firmware);
4310 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4311 			  &mxge_deassert_wait);
4312 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4313 			  &mxge_verbose);
4314 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4315 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4316 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4317 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4318 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4319 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4320 
4321 	if (bootverbose)
4322 		mxge_verbose = 1;
4323 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4324 		mxge_intr_coal_delay = 30;
4325 	if (mxge_ticks == 0)
4326 		mxge_ticks = hz / 2;
4327 	sc->pause = mxge_flow_control;
4328 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4329 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4330 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4331 	}
4332 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4333 	    mxge_initial_mtu < ETHER_MIN_LEN)
4334 		mxge_initial_mtu = ETHERMTU_JUMBO;
4335 
4336 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4337 		mxge_throttle = MXGE_MAX_THROTTLE;
4338 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4339 		mxge_throttle = MXGE_MIN_THROTTLE;
4340 	sc->throttle = mxge_throttle;
4341 }
4342 
4343 
4344 static void
4345 mxge_free_slices(mxge_softc_t *sc)
4346 {
4347 	struct mxge_slice_state *ss;
4348 	int i;
4349 
4350 
4351 	if (sc->ss == NULL)
4352 		return;
4353 
4354 	for (i = 0; i < sc->num_slices; i++) {
4355 		ss = &sc->ss[i];
4356 		if (ss->fw_stats != NULL) {
4357 			mxge_dma_free(&ss->fw_stats_dma);
4358 			ss->fw_stats = NULL;
4359 #ifdef IFNET_BUF_RING
4360 			if (ss->tx.br != NULL) {
4361 				drbr_free(ss->tx.br, M_DEVBUF);
4362 				ss->tx.br = NULL;
4363 			}
4364 #endif
4365 			mtx_destroy(&ss->tx.mtx);
4366 		}
4367 		if (ss->rx_done.entry != NULL) {
4368 			mxge_dma_free(&ss->rx_done.dma);
4369 			ss->rx_done.entry = NULL;
4370 		}
4371 	}
4372 	free(sc->ss, M_DEVBUF);
4373 	sc->ss = NULL;
4374 }
4375 
4376 static int
4377 mxge_alloc_slices(mxge_softc_t *sc)
4378 {
4379 	mxge_cmd_t cmd;
4380 	struct mxge_slice_state *ss;
4381 	size_t bytes;
4382 	int err, i, max_intr_slots;
4383 
4384 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4385 	if (err != 0) {
4386 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4387 		return err;
4388 	}
4389 	sc->rx_ring_size = cmd.data0;
4390 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4391 
4392 	bytes = sizeof (*sc->ss) * sc->num_slices;
4393 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4394 	if (sc->ss == NULL)
4395 		return (ENOMEM);
4396 	for (i = 0; i < sc->num_slices; i++) {
4397 		ss = &sc->ss[i];
4398 
4399 		ss->sc = sc;
4400 
4401 		/* allocate per-slice rx interrupt queues */
4402 
4403 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4404 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4405 		if (err != 0)
4406 			goto abort;
4407 		ss->rx_done.entry = ss->rx_done.dma.addr;
4408 		bzero(ss->rx_done.entry, bytes);
4409 
4410 		/*
4411 		 * allocate the per-slice firmware stats; stats
4412 		 * (including tx) are used used only on the first
4413 		 * slice for now
4414 		 */
4415 #ifndef IFNET_BUF_RING
4416 		if (i > 0)
4417 			continue;
4418 #endif
4419 
4420 		bytes = sizeof (*ss->fw_stats);
4421 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4422 				     sizeof (*ss->fw_stats), 64);
4423 		if (err != 0)
4424 			goto abort;
4425 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4426 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4427 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4428 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4429 #ifdef IFNET_BUF_RING
4430 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4431 					   &ss->tx.mtx);
4432 #endif
4433 	}
4434 
4435 	return (0);
4436 
4437 abort:
4438 	mxge_free_slices(sc);
4439 	return (ENOMEM);
4440 }
4441 
4442 static void
4443 mxge_slice_probe(mxge_softc_t *sc)
4444 {
4445 	mxge_cmd_t cmd;
4446 	char *old_fw;
4447 	int msix_cnt, status, max_intr_slots;
4448 
4449 	sc->num_slices = 1;
4450 	/*
4451 	 *  don't enable multiple slices if they are not enabled,
4452 	 *  or if this is not an SMP system
4453 	 */
4454 
4455 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4456 		return;
4457 
4458 	/* see how many MSI-X interrupts are available */
4459 	msix_cnt = pci_msix_count(sc->dev);
4460 	if (msix_cnt < 2)
4461 		return;
4462 
4463 	/* now load the slice aware firmware see what it supports */
4464 	old_fw = sc->fw_name;
4465 	if (old_fw == mxge_fw_aligned)
4466 		sc->fw_name = mxge_fw_rss_aligned;
4467 	else
4468 		sc->fw_name = mxge_fw_rss_unaligned;
4469 	status = mxge_load_firmware(sc, 0);
4470 	if (status != 0) {
4471 		device_printf(sc->dev, "Falling back to a single slice\n");
4472 		return;
4473 	}
4474 
4475 	/* try to send a reset command to the card to see if it
4476 	   is alive */
4477 	memset(&cmd, 0, sizeof (cmd));
4478 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4479 	if (status != 0) {
4480 		device_printf(sc->dev, "failed reset\n");
4481 		goto abort_with_fw;
4482 	}
4483 
4484 	/* get rx ring size */
4485 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4486 	if (status != 0) {
4487 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4488 		goto abort_with_fw;
4489 	}
4490 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4491 
4492 	/* tell it the size of the interrupt queues */
4493 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4494 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4495 	if (status != 0) {
4496 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4497 		goto abort_with_fw;
4498 	}
4499 
4500 	/* ask the maximum number of slices it supports */
4501 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4502 	if (status != 0) {
4503 		device_printf(sc->dev,
4504 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4505 		goto abort_with_fw;
4506 	}
4507 	sc->num_slices = cmd.data0;
4508 	if (sc->num_slices > msix_cnt)
4509 		sc->num_slices = msix_cnt;
4510 
4511 	if (mxge_max_slices == -1) {
4512 		/* cap to number of CPUs in system */
4513 		if (sc->num_slices > mp_ncpus)
4514 			sc->num_slices = mp_ncpus;
4515 	} else {
4516 		if (sc->num_slices > mxge_max_slices)
4517 			sc->num_slices = mxge_max_slices;
4518 	}
4519 	/* make sure it is a power of two */
4520 	while (sc->num_slices & (sc->num_slices - 1))
4521 		sc->num_slices--;
4522 
4523 	if (mxge_verbose)
4524 		device_printf(sc->dev, "using %d slices\n",
4525 			      sc->num_slices);
4526 
4527 	return;
4528 
4529 abort_with_fw:
4530 	sc->fw_name = old_fw;
4531 	(void) mxge_load_firmware(sc, 0);
4532 }
4533 
4534 static int
4535 mxge_add_msix_irqs(mxge_softc_t *sc)
4536 {
4537 	size_t bytes;
4538 	int count, err, i, rid;
4539 
4540 	rid = PCIR_BAR(2);
4541 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4542 						    &rid, RF_ACTIVE);
4543 
4544 	if (sc->msix_table_res == NULL) {
4545 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4546 		return ENXIO;
4547 	}
4548 
4549 	count = sc->num_slices;
4550 	err = pci_alloc_msix(sc->dev, &count);
4551 	if (err != 0) {
4552 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4553 			      "err = %d \n", sc->num_slices, err);
4554 		goto abort_with_msix_table;
4555 	}
4556 	if (count < sc->num_slices) {
4557 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4558 			      count, sc->num_slices);
4559 		device_printf(sc->dev,
4560 			      "Try setting hw.mxge.max_slices to %d\n",
4561 			      count);
4562 		err = ENOSPC;
4563 		goto abort_with_msix;
4564 	}
4565 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4566 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4567 	if (sc->msix_irq_res == NULL) {
4568 		err = ENOMEM;
4569 		goto abort_with_msix;
4570 	}
4571 
4572 	for (i = 0; i < sc->num_slices; i++) {
4573 		rid = i + 1;
4574 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4575 							  SYS_RES_IRQ,
4576 							  &rid, RF_ACTIVE);
4577 		if (sc->msix_irq_res[i] == NULL) {
4578 			device_printf(sc->dev, "couldn't allocate IRQ res"
4579 				      " for message %d\n", i);
4580 			err = ENXIO;
4581 			goto abort_with_res;
4582 		}
4583 	}
4584 
4585 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4586 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4587 
4588 	for (i = 0; i < sc->num_slices; i++) {
4589 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4590 				     INTR_TYPE_NET | INTR_MPSAFE,
4591 #if __FreeBSD_version > 700030
4592 				     NULL,
4593 #endif
4594 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4595 		if (err != 0) {
4596 			device_printf(sc->dev, "couldn't setup intr for "
4597 				      "message %d\n", i);
4598 			goto abort_with_intr;
4599 		}
4600 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4601 				  sc->msix_ih[i], "s%d", i);
4602 	}
4603 
4604 	if (mxge_verbose) {
4605 		device_printf(sc->dev, "using %d msix IRQs:",
4606 			      sc->num_slices);
4607 		for (i = 0; i < sc->num_slices; i++)
4608 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4609 		printf("\n");
4610 	}
4611 	return (0);
4612 
4613 abort_with_intr:
4614 	for (i = 0; i < sc->num_slices; i++) {
4615 		if (sc->msix_ih[i] != NULL) {
4616 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4617 					  sc->msix_ih[i]);
4618 			sc->msix_ih[i] = NULL;
4619 		}
4620 	}
4621 	free(sc->msix_ih, M_DEVBUF);
4622 
4623 
4624 abort_with_res:
4625 	for (i = 0; i < sc->num_slices; i++) {
4626 		rid = i + 1;
4627 		if (sc->msix_irq_res[i] != NULL)
4628 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4629 					     sc->msix_irq_res[i]);
4630 		sc->msix_irq_res[i] = NULL;
4631 	}
4632 	free(sc->msix_irq_res, M_DEVBUF);
4633 
4634 
4635 abort_with_msix:
4636 	pci_release_msi(sc->dev);
4637 
4638 abort_with_msix_table:
4639 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4640 			     sc->msix_table_res);
4641 
4642 	return err;
4643 }
4644 
4645 static int
4646 mxge_add_single_irq(mxge_softc_t *sc)
4647 {
4648 	int count, err, rid;
4649 
4650 	count = pci_msi_count(sc->dev);
4651 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4652 		rid = 1;
4653 	} else {
4654 		rid = 0;
4655 		sc->legacy_irq = 1;
4656 	}
4657 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4658 					     RF_SHAREABLE | RF_ACTIVE);
4659 	if (sc->irq_res == NULL) {
4660 		device_printf(sc->dev, "could not alloc interrupt\n");
4661 		return ENXIO;
4662 	}
4663 	if (mxge_verbose)
4664 		device_printf(sc->dev, "using %s irq %jd\n",
4665 			      sc->legacy_irq ? "INTx" : "MSI",
4666 			      rman_get_start(sc->irq_res));
4667 	err = bus_setup_intr(sc->dev, sc->irq_res,
4668 			     INTR_TYPE_NET | INTR_MPSAFE,
4669 #if __FreeBSD_version > 700030
4670 			     NULL,
4671 #endif
4672 			     mxge_intr, &sc->ss[0], &sc->ih);
4673 	if (err != 0) {
4674 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4675 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4676 		if (!sc->legacy_irq)
4677 			pci_release_msi(sc->dev);
4678 	}
4679 	return err;
4680 }
4681 
4682 static void
4683 mxge_rem_msix_irqs(mxge_softc_t *sc)
4684 {
4685 	int i, rid;
4686 
4687 	for (i = 0; i < sc->num_slices; i++) {
4688 		if (sc->msix_ih[i] != NULL) {
4689 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4690 					  sc->msix_ih[i]);
4691 			sc->msix_ih[i] = NULL;
4692 		}
4693 	}
4694 	free(sc->msix_ih, M_DEVBUF);
4695 
4696 	for (i = 0; i < sc->num_slices; i++) {
4697 		rid = i + 1;
4698 		if (sc->msix_irq_res[i] != NULL)
4699 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4700 					     sc->msix_irq_res[i]);
4701 		sc->msix_irq_res[i] = NULL;
4702 	}
4703 	free(sc->msix_irq_res, M_DEVBUF);
4704 
4705 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4706 			     sc->msix_table_res);
4707 
4708 	pci_release_msi(sc->dev);
4709 	return;
4710 }
4711 
4712 static void
4713 mxge_rem_single_irq(mxge_softc_t *sc)
4714 {
4715 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4716 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4717 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4718 	if (!sc->legacy_irq)
4719 		pci_release_msi(sc->dev);
4720 }
4721 
4722 static void
4723 mxge_rem_irq(mxge_softc_t *sc)
4724 {
4725 	if (sc->num_slices > 1)
4726 		mxge_rem_msix_irqs(sc);
4727 	else
4728 		mxge_rem_single_irq(sc);
4729 }
4730 
4731 static int
4732 mxge_add_irq(mxge_softc_t *sc)
4733 {
4734 	int err;
4735 
4736 	if (sc->num_slices > 1)
4737 		err = mxge_add_msix_irqs(sc);
4738 	else
4739 		err = mxge_add_single_irq(sc);
4740 
4741 	if (0 && err == 0 && sc->num_slices > 1) {
4742 		mxge_rem_msix_irqs(sc);
4743 		err = mxge_add_msix_irqs(sc);
4744 	}
4745 	return err;
4746 }
4747 
4748 
4749 static int
4750 mxge_attach(device_t dev)
4751 {
4752 	mxge_cmd_t cmd;
4753 	mxge_softc_t *sc = device_get_softc(dev);
4754 	struct ifnet *ifp;
4755 	int err, rid;
4756 
4757 	sc->dev = dev;
4758 	mxge_fetch_tunables(sc);
4759 
4760 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4761 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4762 				  taskqueue_thread_enqueue, &sc->tq);
4763 	if (sc->tq == NULL) {
4764 		err = ENOMEM;
4765 		goto abort_with_nothing;
4766 	}
4767 
4768 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4769 				 1,			/* alignment */
4770 				 0,			/* boundary */
4771 				 BUS_SPACE_MAXADDR,	/* low */
4772 				 BUS_SPACE_MAXADDR,	/* high */
4773 				 NULL, NULL,		/* filter */
4774 				 65536 + 256,		/* maxsize */
4775 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4776 				 65536,			/* maxsegsize */
4777 				 0,			/* flags */
4778 				 NULL, NULL,		/* lock */
4779 				 &sc->parent_dmat);	/* tag */
4780 
4781 	if (err != 0) {
4782 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4783 			      err);
4784 		goto abort_with_tq;
4785 	}
4786 
4787 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4788 	if (ifp == NULL) {
4789 		device_printf(dev, "can not if_alloc()\n");
4790 		err = ENOSPC;
4791 		goto abort_with_parent_dmat;
4792 	}
4793 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4794 
4795 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4796 		 device_get_nameunit(dev));
4797 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4798 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4799 		 "%s:drv", device_get_nameunit(dev));
4800 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4801 		 MTX_NETWORK_LOCK, MTX_DEF);
4802 
4803 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4804 
4805 	mxge_setup_cfg_space(sc);
4806 
4807 	/* Map the board into the kernel */
4808 	rid = PCIR_BARS;
4809 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4810 					     RF_ACTIVE);
4811 	if (sc->mem_res == NULL) {
4812 		device_printf(dev, "could not map memory\n");
4813 		err = ENXIO;
4814 		goto abort_with_lock;
4815 	}
4816 	sc->sram = rman_get_virtual(sc->mem_res);
4817 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4818 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4819 		device_printf(dev, "impossible memory region size %jd\n",
4820 			      rman_get_size(sc->mem_res));
4821 		err = ENXIO;
4822 		goto abort_with_mem_res;
4823 	}
4824 
4825 	/* make NULL terminated copy of the EEPROM strings section of
4826 	   lanai SRAM */
4827 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4828 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4829 				rman_get_bushandle(sc->mem_res),
4830 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4831 				sc->eeprom_strings,
4832 				MXGE_EEPROM_STRINGS_SIZE - 2);
4833 	err = mxge_parse_strings(sc);
4834 	if (err != 0)
4835 		goto abort_with_mem_res;
4836 
4837 	/* Enable write combining for efficient use of PCIe bus */
4838 	mxge_enable_wc(sc);
4839 
4840 	/* Allocate the out of band dma memory */
4841 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4842 			     sizeof (mxge_cmd_t), 64);
4843 	if (err != 0)
4844 		goto abort_with_mem_res;
4845 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4846 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4847 	if (err != 0)
4848 		goto abort_with_cmd_dma;
4849 
4850 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4851 	if (err != 0)
4852 		goto abort_with_zeropad_dma;
4853 
4854 	/* select & load the firmware */
4855 	err = mxge_select_firmware(sc);
4856 	if (err != 0)
4857 		goto abort_with_dmabench;
4858 	sc->intr_coal_delay = mxge_intr_coal_delay;
4859 
4860 	mxge_slice_probe(sc);
4861 	err = mxge_alloc_slices(sc);
4862 	if (err != 0)
4863 		goto abort_with_dmabench;
4864 
4865 	err = mxge_reset(sc, 0);
4866 	if (err != 0)
4867 		goto abort_with_slices;
4868 
4869 	err = mxge_alloc_rings(sc);
4870 	if (err != 0) {
4871 		device_printf(sc->dev, "failed to allocate rings\n");
4872 		goto abort_with_slices;
4873 	}
4874 
4875 	err = mxge_add_irq(sc);
4876 	if (err != 0) {
4877 		device_printf(sc->dev, "failed to add irq\n");
4878 		goto abort_with_rings;
4879 	}
4880 
4881 	ifp->if_baudrate = IF_Gbps(10);
4882 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4883 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4884 		IFCAP_RXCSUM_IPV6;
4885 #if defined(INET) || defined(INET6)
4886 	ifp->if_capabilities |= IFCAP_LRO;
4887 #endif
4888 
4889 #ifdef MXGE_NEW_VLAN_API
4890 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4891 
4892 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4893 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4894 	    sc->fw_ver_tiny >= 32)
4895 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4896 #endif
4897 	sc->max_mtu = mxge_max_mtu(sc);
4898 	if (sc->max_mtu >= 9000)
4899 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4900 	else
4901 		device_printf(dev, "MTU limited to %d.  Install "
4902 			      "latest firmware for 9000 byte jumbo support\n",
4903 			      sc->max_mtu - ETHER_HDR_LEN);
4904 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4905 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4906 	/* check to see if f/w supports TSO for IPv6 */
4907 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4908 		if (CSUM_TCP_IPV6)
4909 			ifp->if_capabilities |= IFCAP_TSO6;
4910 		sc->max_tso6_hlen = min(cmd.data0,
4911 					sizeof (sc->ss[0].scratch));
4912 	}
4913 	ifp->if_capenable = ifp->if_capabilities;
4914 	if (sc->lro_cnt == 0)
4915 		ifp->if_capenable &= ~IFCAP_LRO;
4916 	ifp->if_init = mxge_init;
4917 	ifp->if_softc = sc;
4918 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4919 	ifp->if_ioctl = mxge_ioctl;
4920 	ifp->if_start = mxge_start;
4921 	ifp->if_get_counter = mxge_get_counter;
4922 	/* Initialise the ifmedia structure */
4923 	ifmedia_init(&sc->media, 0, mxge_media_change,
4924 		     mxge_media_status);
4925 	mxge_media_init(sc);
4926 	mxge_media_probe(sc);
4927 	sc->dying = 0;
4928 	ether_ifattach(ifp, sc->mac_addr);
4929 	/* ether_ifattach sets mtu to ETHERMTU */
4930 	if (mxge_initial_mtu != ETHERMTU)
4931 		mxge_change_mtu(sc, mxge_initial_mtu);
4932 
4933 	mxge_add_sysctls(sc);
4934 #ifdef IFNET_BUF_RING
4935 	ifp->if_transmit = mxge_transmit;
4936 	ifp->if_qflush = mxge_qflush;
4937 #endif
4938 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4939 				device_get_nameunit(sc->dev));
4940 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4941 	return 0;
4942 
4943 abort_with_rings:
4944 	mxge_free_rings(sc);
4945 abort_with_slices:
4946 	mxge_free_slices(sc);
4947 abort_with_dmabench:
4948 	mxge_dma_free(&sc->dmabench_dma);
4949 abort_with_zeropad_dma:
4950 	mxge_dma_free(&sc->zeropad_dma);
4951 abort_with_cmd_dma:
4952 	mxge_dma_free(&sc->cmd_dma);
4953 abort_with_mem_res:
4954 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4955 abort_with_lock:
4956 	pci_disable_busmaster(dev);
4957 	mtx_destroy(&sc->cmd_mtx);
4958 	mtx_destroy(&sc->driver_mtx);
4959 	if_free(ifp);
4960 abort_with_parent_dmat:
4961 	bus_dma_tag_destroy(sc->parent_dmat);
4962 abort_with_tq:
4963 	if (sc->tq != NULL) {
4964 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4965 		taskqueue_free(sc->tq);
4966 		sc->tq = NULL;
4967 	}
4968 abort_with_nothing:
4969 	return err;
4970 }
4971 
4972 static int
4973 mxge_detach(device_t dev)
4974 {
4975 	mxge_softc_t *sc = device_get_softc(dev);
4976 
4977 	if (mxge_vlans_active(sc)) {
4978 		device_printf(sc->dev,
4979 			      "Detach vlans before removing module\n");
4980 		return EBUSY;
4981 	}
4982 	mtx_lock(&sc->driver_mtx);
4983 	sc->dying = 1;
4984 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4985 		mxge_close(sc, 0);
4986 	mtx_unlock(&sc->driver_mtx);
4987 	ether_ifdetach(sc->ifp);
4988 	if (sc->tq != NULL) {
4989 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4990 		taskqueue_free(sc->tq);
4991 		sc->tq = NULL;
4992 	}
4993 	callout_drain(&sc->co_hdl);
4994 	ifmedia_removeall(&sc->media);
4995 	mxge_dummy_rdma(sc, 0);
4996 	mxge_rem_sysctls(sc);
4997 	mxge_rem_irq(sc);
4998 	mxge_free_rings(sc);
4999 	mxge_free_slices(sc);
5000 	mxge_dma_free(&sc->dmabench_dma);
5001 	mxge_dma_free(&sc->zeropad_dma);
5002 	mxge_dma_free(&sc->cmd_dma);
5003 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5004 	pci_disable_busmaster(dev);
5005 	mtx_destroy(&sc->cmd_mtx);
5006 	mtx_destroy(&sc->driver_mtx);
5007 	if_free(sc->ifp);
5008 	bus_dma_tag_destroy(sc->parent_dmat);
5009 	return 0;
5010 }
5011 
5012 static int
5013 mxge_shutdown(device_t dev)
5014 {
5015 	return 0;
5016 }
5017 
5018 /*
5019   This file uses Myri10GE driver indentation.
5020 
5021   Local Variables:
5022   c-file-style:"linux"
5023   tab-width:8
5024   End:
5025 */
5026