/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/zlib.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;


	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
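/*
 * For example (values here are hypothetical), the string region might
 * contain:
 *   SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=example-product-code\0\0
 * and mxge_parse_strings() below walks these NUL-terminated tokens
 * in order.
 */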

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * Opteron/Nvidia class machines the 0xe0000000 mapping is
	 * handled by the Nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), the amd-8131 bridge, and
	 * things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
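	/*
	 * Since each tick is 0.5us, bandwidth in MB/s works out to
	 * (transfers * len * 2) / ticks; that is what the computations
	 * below do.  The read/write test doubles the byte count once
	 * more, since each transfer moves len bytes in each direction.
	 */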

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
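/*
 * Summarizing the policy implemented below:
 *   aligned completions (ECRC enabled, or the unaligned DMA test passes)
 *	-> eth_z8e firmware, tx_boundary = 4096
 *   unaligned completions
 *	-> ethp_z8e firmware, tx_boundary = 2048
 */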

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up-to-date firmware\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
	void *ptr;

	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
	return ptr;
}

static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

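	/* align buf on an 8-byte boundary within buf_bytes */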
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable/disable flow control");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
			       0, "number of bad csums preventing LRO");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */
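/* (copying in reverse order means that by the time the firmware can
   see any given request, every request after it is already in place) */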

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}

static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->ip_hlen = pi->ip->ip_hl << 2;
		if (!tso)
			return 0;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		nxt = 0;
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */
1875 	cksum_offset = pi->ip_off + pi->ip_hlen;
1876 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1877 
1878 	/* TSO implies checksum offload on this hardware */
1879 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1880 		/*
1881 		 * If packet has full TCP csum, replace it with pseudo hdr
1882 		 * sum that the NIC expects, otherwise the NIC will emit
1883 		 * packets with bad TCP checksums.
1884 		 */
1885 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1886 		if (pi->ip6) {
1887 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1888 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1889 			sum = in6_cksum_pseudo(pi->ip6,
1890 			    m->m_pkthdr.len - cksum_offset,
1891 			    IPPROTO_TCP, 0);
1892 #endif
1893 		} else {
1894 #ifdef INET
1895 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1896 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1897 			    pi->ip->ip_dst.s_addr,
1898 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1899 				    cksum_offset)));
1900 #endif
1901 		}
1902 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1903 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1904 	}
1905 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1906 
1907 
1908 	/* for TSO, pseudo_hdr_offset holds mss.
1909 	 * The firmware figures out where to put
1910 	 * the checksum by parsing the header. */
1911 	pseudo_hdr_offset = htobe16(mss);
1912 
1913 	if (pi->ip6) {
1914 		/*
1915 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1916 		 * to store the TCP header len
1917 		 */
1918 		cksum_offset = (pi->tcp->th_off << 2);
1919 	}
1920 
1921 	tx = &ss->tx;
1922 	req = tx->req_list;
1923 	seg = tx->seg_list;
1924 	cnt = 0;
1925 	rdma_count = 0;
1926 	/* "rdma_count" is the number of RDMAs belonging to the
1927 	 * current packet BEFORE the current send request. For
1928 	 * non-TSO packets, this is equal to "count".
1929 	 * For TSO packets, rdma_count needs to be reset
1930 	 * to 0 after a segment cut.
1931 	 *
1932 	 * The rdma_count field of the send request is
1933 	 * the number of RDMAs of the packet starting at
1934 	 * that request. For TSO send requests with one or more cuts
1935 	 * in the middle, this is the number of RDMAs starting
1936 	 * after the last cut in the request. All previous
1937 	 * segments before the last cut implicitly have 1 RDMA.
1938 	 *
1939 	 * Since the number of RDMAs is not known beforehand,
1940 	 * it must be filled-in retroactively - after each
1941 	 * segmentation cut or at the end of the entire packet.
1942 	 */
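	/*
	 * Note on the branch-free bookkeeping below: with chop and
	 * next_is_first each 0 or 1,
	 *	rdma_count |= -(chop | next_is_first);
	 * forces rdma_count to -1 whenever a cut occurs, and
	 *	rdma_count += chop & !next_is_first;
	 * bumps it back to 0 when the cut landed mid-descriptor, so
	 * the unconditional rdma_count++ restarts the per-segment
	 * count correctly after every cut.
	 */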
1943 
1944 	while (busdma_seg_cnt) {
1945 		/* Break the busdma segment up into pieces*/
1946 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1947 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1948 		len = seg->ds_len;
1949 
1950 		while (len) {
1951 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1952 			seglen = len;
1953 			cum_len_next = cum_len + seglen;
1954 			(req-rdma_count)->rdma_count = rdma_count + 1;
1955 			if (__predict_true(cum_len >= 0)) {
1956 				/* payload */
1957 				chop = (cum_len_next > mss);
1958 				cum_len_next = cum_len_next % mss;
1959 				next_is_first = (cum_len_next == 0);
1960 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1961 				flags_next |= next_is_first *
1962 					MXGEFW_FLAGS_FIRST;
1963 				rdma_count |= -(chop | next_is_first);
1964 				rdma_count += chop & !next_is_first;
1965 			} else if (cum_len_next >= 0) {
1966 				/* header ends */
1967 				rdma_count = -1;
1968 				cum_len_next = 0;
1969 				seglen = -cum_len;
1970 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1971 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1972 					MXGEFW_FLAGS_FIRST |
1973 					(small * MXGEFW_FLAGS_SMALL);
1974 			}
1975 
1976 			req->addr_high = high_swapped;
1977 			req->addr_low = htobe32(low);
1978 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1979 			req->pad = 0;
1980 			req->rdma_count = 1;
1981 			req->length = htobe16(seglen);
1982 			req->cksum_offset = cksum_offset;
1983 			req->flags = flags | ((cum_len & 1) *
1984 					      MXGEFW_FLAGS_ALIGN_ODD);
1985 			low += seglen;
1986 			len -= seglen;
1987 			cum_len = cum_len_next;
1988 			flags = flags_next;
1989 			req++;
1990 			cnt++;
1991 			rdma_count++;
1992 			if (cksum_offset != 0 && !pi->ip6) {
1993 				if (__predict_false(cksum_offset > seglen))
1994 					cksum_offset -= seglen;
1995 				else
1996 					cksum_offset = 0;
1997 			}
1998 			if (__predict_false(cnt > tx->max_desc))
1999 				goto drop;
2000 		}
2001 		busdma_seg_cnt--;
2002 		seg++;
2003 	}
2004 	(req-rdma_count)->rdma_count = rdma_count;
2005 
2006 	do {
2007 		req--;
2008 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2009 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2010 
2011 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2012 	mxge_submit_req(tx, tx->req_list, cnt);
2013 #ifdef IFNET_BUF_RING
2014 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2015 		/* tell the NIC to start polling this slice */
2016 		*tx->send_go = 1;
2017 		tx->queue_active = 1;
2018 		tx->activate++;
2019 		wmb();
2020 	}
2021 #endif
2022 	return;
2023 
2024 drop:
2025 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2026 	m_freem(m);
2027 	ss->oerrors++;
2028 	if (!once) {
2029 		printf("tx->max_desc exceeded via TSO!\n");
2030 		printf("mss = %d, %ld, %d!\n", mss,
2031 		       (long)seg - (long)tx->seg_list, tx->max_desc);
2032 		once = 1;
2033 	}
2034 	return;
2035 
2036 }
2037 
2038 #endif /* IFCAP_TSO4 */
2039 
2040 #ifdef MXGE_NEW_VLAN_API
2041 /*
2042  * We reproduce the software vlan tag insertion from
2043  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2044  * vlan tag insertion. We need to advertise this in order to have the
2045  * vlan interface respect our csum offload flags.
2046  */
2047 static struct mbuf *
2048 mxge_vlan_tag_insert(struct mbuf *m)
2049 {
2050 	struct ether_vlan_header *evl;
2051 
2052 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2053 	if (__predict_false(m == NULL))
2054 		return NULL;
2055 	if (m->m_len < sizeof(*evl)) {
2056 		m = m_pullup(m, sizeof(*evl));
2057 		if (__predict_false(m == NULL))
2058 			return NULL;
2059 	}
2060 	/*
2061 	 * Transform the Ethernet header into an Ethernet header
2062 	 * with 802.1Q encapsulation.
2063 	 */
2064 	evl = mtod(m, struct ether_vlan_header *);
2065 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2066 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2067 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2068 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2069 	m->m_flags &= ~M_VLANTAG;
2070 	return m;
2071 }
2072 #endif /* MXGE_NEW_VLAN_API */
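/*
 * Layout sketch for the insertion above: M_PREPEND() opens 4 bytes at
 * the front and the bcopy() slides the 12 address bytes down over
 * them, leaving the gap exactly where the tag belongs:
 *
 *	before:	| dst[6] | src[6] | type[2] | payload ...
 *	after:	| dst[6] | src[6] | 0x8100 | tag[2] | type[2] | payload ...
 */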
2073 
2074 static void
2075 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2076 {
2077 	struct mxge_pkt_info pi = {0,0,0,0};
2078 	mxge_softc_t *sc;
2079 	mcp_kreq_ether_send_t *req;
2080 	bus_dma_segment_t *seg;
2081 	struct mbuf *m_tmp;
2082 	struct ifnet *ifp;
2083 	mxge_tx_ring_t *tx;
2084 	int cnt, cum_len, err, i, idx, odd_flag;
2085 	uint16_t pseudo_hdr_offset;
2086 	uint8_t flags, cksum_offset;
2087 
2088 
2089 	sc = ss->sc;
2090 	ifp = sc->ifp;
2091 	tx = &ss->tx;
2092 
2093 #ifdef MXGE_NEW_VLAN_API
2094 	if (m->m_flags & M_VLANTAG) {
2095 		m = mxge_vlan_tag_insert(m);
2096 		if (__predict_false(m == NULL))
2097 			goto drop_without_m;
2098 	}
2099 #endif
2100 	if (m->m_pkthdr.csum_flags &
2101 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2102 		if (mxge_parse_tx(ss, m, &pi))
2103 			goto drop;
2104 	}
2105 
2106 	/* (try to) map the frame for DMA */
2107 	idx = tx->req & tx->mask;
2108 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2109 				      m, tx->seg_list, &cnt,
2110 				      BUS_DMA_NOWAIT);
2111 	if (__predict_false(err == EFBIG)) {
2112 		/* Too many segments in the chain.  Try
2113 		   to defrag */
2114 		m_tmp = m_defrag(m, M_NOWAIT);
2115 		if (m_tmp == NULL) {
2116 			goto drop;
2117 		}
2118 		ss->tx.defrag++;
2119 		m = m_tmp;
2120 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2121 					      tx->info[idx].map,
2122 					      m, tx->seg_list, &cnt,
2123 					      BUS_DMA_NOWAIT);
2124 	}
2125 	if (__predict_false(err != 0)) {
2126 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2127 			      " packet len = %d\n", err, m->m_pkthdr.len);
2128 		goto drop;
2129 	}
2130 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2131 			BUS_DMASYNC_PREWRITE);
2132 	tx->info[idx].m = m;
2133 
2134 #if IFCAP_TSO4
2135 	/* TSO is different enough, we handle it in another routine */
2136 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2137 		mxge_encap_tso(ss, m, cnt, &pi);
2138 		return;
2139 	}
2140 #endif
2141 
2142 	req = tx->req_list;
2143 	cksum_offset = 0;
2144 	pseudo_hdr_offset = 0;
2145 	flags = MXGEFW_FLAGS_NO_TSO;
2146 
2147 	/* checksum offloading? */
2148 	if (m->m_pkthdr.csum_flags &
2149 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2150 		/* ensure ip header is in first mbuf, copy
2151 		   it to a scratch buffer if not */
2152 		cksum_offset = pi.ip_off + pi.ip_hlen;
2153 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2154 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2155 		req->cksum_offset = cksum_offset;
2156 		flags |= MXGEFW_FLAGS_CKSUM;
2157 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2158 	} else {
2159 		odd_flag = 0;
2160 	}
2161 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2162 		flags |= MXGEFW_FLAGS_SMALL;
2163 
2164 	/* convert segments into a request list */
2165 	cum_len = 0;
2166 	seg = tx->seg_list;
2167 	req->flags = MXGEFW_FLAGS_FIRST;
2168 	for (i = 0; i < cnt; i++) {
2169 		req->addr_low =
2170 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2171 		req->addr_high =
2172 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2173 		req->length = htobe16(seg->ds_len);
2174 		req->cksum_offset = cksum_offset;
2175 		if (cksum_offset > seg->ds_len)
2176 			cksum_offset -= seg->ds_len;
2177 		else
2178 			cksum_offset = 0;
2179 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2180 		req->pad = 0; /* complete solid 16-byte block */
2181 		req->rdma_count = 1;
2182 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2183 		cum_len += seg->ds_len;
2184 		seg++;
2185 		req++;
2186 		req->flags = 0;
2187 	}
2188 	req--;
2189 	/* pad runts to 60 bytes */
2190 	if (cum_len < 60) {
2191 		req++;
2192 		req->addr_low =
2193 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2194 		req->addr_high =
2195 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2196 		req->length = htobe16(60 - cum_len);
2197 		req->cksum_offset = 0;
2198 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2199 		req->pad = 0; /* complete solid 16-byte block */
2200 		req->rdma_count = 1;
2201 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2202 		cnt++;
2203 	}
2204 
2205 	tx->req_list[0].rdma_count = cnt;
2206 #if 0
2207 	/* print what the firmware will see */
2208 	for (i = 0; i < cnt; i++) {
2209 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2210 		    "cso:%d, flags:0x%x, rdma:%d\n",
2211 		    i, (int)ntohl(tx->req_list[i].addr_high),
2212 		    (int)ntohl(tx->req_list[i].addr_low),
2213 		    (int)ntohs(tx->req_list[i].length),
2214 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2215 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2216 		    tx->req_list[i].rdma_count);
2217 	}
2218 	printf("--------------\n");
2219 #endif
2220 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2221 	mxge_submit_req(tx, tx->req_list, cnt);
2222 #ifdef IFNET_BUF_RING
2223 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2224 		/* tell the NIC to start polling this slice */
2225 		*tx->send_go = 1;
2226 		tx->queue_active = 1;
2227 		tx->activate++;
2228 		wmb();
2229 	}
2230 #endif
2231 	return;
2232 
2233 drop:
2234 	m_freem(m);
2235 drop_without_m:
2236 	ss->oerrors++;
2237 	return;
2238 }
2239 
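/*
 * Note on the runt padding in mxge_encap() above: 60 is ETHER_MIN_LEN
 * (64) less ETHER_CRC_LEN (4), since the NIC appends the FCS itself.
 * Rather than touching the mbuf chain, the driver points one extra
 * send descriptor at a preallocated block of zeros (zeropad_dma) to
 * bring short frames up to the minimum wire length.
 */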
2240 #ifdef IFNET_BUF_RING
2241 static void
2242 mxge_qflush(struct ifnet *ifp)
2243 {
2244 	mxge_softc_t *sc = ifp->if_softc;
2245 	mxge_tx_ring_t *tx;
2246 	struct mbuf *m;
2247 	int slice;
2248 
2249 	for (slice = 0; slice < sc->num_slices; slice++) {
2250 		tx = &sc->ss[slice].tx;
2251 		mtx_lock(&tx->mtx);
2252 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253 			m_freem(m);
2254 		mtx_unlock(&tx->mtx);
2255 	}
2256 	if_qflush(ifp);
2257 }
2258 
2259 static inline void
2260 mxge_start_locked(struct mxge_slice_state *ss)
2261 {
2262 	mxge_softc_t *sc;
2263 	struct mbuf *m;
2264 	struct ifnet *ifp;
2265 	mxge_tx_ring_t *tx;
2266 
2267 	sc = ss->sc;
2268 	ifp = sc->ifp;
2269 	tx = &ss->tx;
2270 
2271 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272 		m = drbr_dequeue(ifp, tx->br);
2273 		if (m == NULL) {
2274 			return;
2275 		}
2276 		/* let BPF see it */
2277 		BPF_MTAP(ifp, m);
2278 
2279 		/* give it to the nic */
2280 		mxge_encap(ss, m);
2281 	}
2282 	/* ran out of transmit slots */
2283 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284 	    && (!drbr_empty(ifp, tx->br))) {
2285 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286 		tx->stall++;
2287 	}
2288 }
2289 
2290 static int
2291 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292 {
2293 	mxge_softc_t *sc;
2294 	struct ifnet *ifp;
2295 	mxge_tx_ring_t *tx;
2296 	int err;
2297 
2298 	sc = ss->sc;
2299 	ifp = sc->ifp;
2300 	tx = &ss->tx;
2301 
2302 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303 	    IFF_DRV_RUNNING) {
2304 		err = drbr_enqueue(ifp, tx->br, m);
2305 		return (err);
2306 	}
2307 
2308 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2309 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310 		/* let BPF see it */
2311 		BPF_MTAP(ifp, m);
2312 		/* give it to the nic */
2313 		mxge_encap(ss, m);
2314 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315 		return (err);
2316 	}
2317 	if (!drbr_empty(ifp, tx->br))
2318 		mxge_start_locked(ss);
2319 	return (0);
2320 }
2321 
2322 static int
2323 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324 {
2325 	mxge_softc_t *sc = ifp->if_softc;
2326 	struct mxge_slice_state *ss;
2327 	mxge_tx_ring_t *tx;
2328 	int err = 0;
2329 	int slice;
2330 
2331 	slice = m->m_pkthdr.flowid;
2332 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333 
2334 	ss = &sc->ss[slice];
2335 	tx = &ss->tx;
2336 
2337 	if (mtx_trylock(&tx->mtx)) {
2338 		err = mxge_transmit_locked(ss, m);
2339 		mtx_unlock(&tx->mtx);
2340 	} else {
2341 		err = drbr_enqueue(ifp, tx->br, m);
2342 	}
2343 
2344 	return (err);
2345 }
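/*
 * Note: the mtx_trylock() above keeps mxge_transmit() non-blocking.
 * The flowid is masked down to a slice index (num_slices is a power
 * of two), and when another thread already holds that ring's lock the
 * mbuf is simply parked on the buf_ring; the lock holder, or a later
 * transmit or tx-completion, drains it via mxge_start_locked().
 */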
2346 
2347 #else
2348 
2349 static inline void
2350 mxge_start_locked(struct mxge_slice_state *ss)
2351 {
2352 	mxge_softc_t *sc;
2353 	struct mbuf *m;
2354 	struct ifnet *ifp;
2355 	mxge_tx_ring_t *tx;
2356 
2357 	sc = ss->sc;
2358 	ifp = sc->ifp;
2359 	tx = &ss->tx;
2360 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362 		if (m == NULL) {
2363 			return;
2364 		}
2365 		/* let BPF see it */
2366 		BPF_MTAP(ifp, m);
2367 
2368 		/* give it to the nic */
2369 		mxge_encap(ss, m);
2370 	}
2371 	/* ran out of transmit slots */
2372 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374 		tx->stall++;
2375 	}
2376 }
2377 #endif
2378 static void
2379 mxge_start(struct ifnet *ifp)
2380 {
2381 	mxge_softc_t *sc = ifp->if_softc;
2382 	struct mxge_slice_state *ss;
2383 
2384 	/* only use the first slice for now */
2385 	ss = &sc->ss[0];
2386 	mtx_lock(&ss->tx.mtx);
2387 	mxge_start_locked(ss);
2388 	mtx_unlock(&ss->tx.mtx);
2389 }
2390 
2391 /*
2392  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2393  * at most 32 bytes at a time, so as to avoid involving the software
2394  * pio handler in the NIC.  We re-write the first segment's low
2395  * DMA address to mark it valid only after we write the entire chunk
2396  * in a burst
2397  */
2398 static inline void
2399 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2400 		mcp_kreq_ether_recv_t *src)
2401 {
2402 	uint32_t low;
2403 
2404 	low = src->addr_low;
2405 	src->addr_low = 0xffffffff;
2406 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2407 	wmb();
2408 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2409 	wmb();
2410 	src->addr_low = low;
2411 	dst->addr_low = low;
2412 	wmb();
2413 }
2414 
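/*
 * Note: mxge_submit_8rx() above temporarily poisons the first
 * descriptor's addr_low with 0xffffffff so the NIC cannot consume it
 * mid-burst; restoring the real address last validates all 8 receive
 * descriptors at once with a single 32-bit store.
 */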
2415 static int
2416 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2417 {
2418 	bus_dma_segment_t seg;
2419 	struct mbuf *m;
2420 	mxge_rx_ring_t *rx = &ss->rx_small;
2421 	int cnt, err;
2422 
2423 	m = m_gethdr(M_NOWAIT, MT_DATA);
2424 	if (m == NULL) {
2425 		rx->alloc_fail++;
2426 		err = ENOBUFS;
2427 		goto done;
2428 	}
2429 	m->m_len = MHLEN;
2430 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2431 				      &seg, &cnt, BUS_DMA_NOWAIT);
2432 	if (err != 0) {
2433 		m_free(m);
2434 		goto done;
2435 	}
2436 	rx->info[idx].m = m;
2437 	rx->shadow[idx].addr_low =
2438 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2439 	rx->shadow[idx].addr_high =
2440 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2441 
2442 done:
2443 	if ((idx & 7) == 7)
2444 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2445 	return err;
2446 }
2447 
2448 static int
2449 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2450 {
2451 	bus_dma_segment_t seg[3];
2452 	struct mbuf *m;
2453 	mxge_rx_ring_t *rx = &ss->rx_big;
2454 	int cnt, err, i;
2455 
2456 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2457 	if (m == NULL) {
2458 		rx->alloc_fail++;
2459 		err = ENOBUFS;
2460 		goto done;
2461 	}
2462 	m->m_len = rx->mlen;
2463 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2464 				      seg, &cnt, BUS_DMA_NOWAIT);
2465 	if (err != 0) {
2466 		m_free(m);
2467 		goto done;
2468 	}
2469 	rx->info[idx].m = m;
2470 	rx->shadow[idx].addr_low =
2471 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2472 	rx->shadow[idx].addr_high =
2473 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2474 
2475 #if MXGE_VIRT_JUMBOS
2476 	for (i = 1; i < cnt; i++) {
2477 		rx->shadow[idx + i].addr_low =
2478 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2479 		rx->shadow[idx + i].addr_high =
2480 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2481 	}
2482 #endif
2483 
2484 done:
2485 	for (i = 0; i < rx->nbufs; i++) {
2486 		if ((idx & 7) == 7) {
2487 			mxge_submit_8rx(&rx->lanai[idx - 7],
2488 					&rx->shadow[idx - 7]);
2489 		}
2490 		idx++;
2491 	}
2492 	return err;
2493 }
2494 
2495 #ifdef INET6
2496 
2497 static uint16_t
2498 mxge_csum_generic(uint16_t *raw, int len)
2499 {
2500 	uint32_t csum;
2501 
2502 
2503 	csum = 0;
2504 	while (len > 0) {
2505 		csum += *raw;
2506 		raw++;
2507 		len -= 2;
2508 	}
2509 	csum = (csum >> 16) + (csum & 0xffff);
2510 	csum = (csum >> 16) + (csum & 0xffff);
2511 	return (uint16_t)csum;
2512 }
2513 
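/*
 * A worked example of the double fold above: summing 16-bit words
 * into a 32-bit accumulator can leave carries in the high half, e.g.
 * csum = 0x0003fffe:
 *	fold 1:	0x0003 + 0xfffe = 0x00010001
 *	fold 2:	0x0001 + 0x0001 = 0x00000002
 * Two folds always suffice to reduce the sum to 16 bits.
 */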
2514 static inline uint16_t
2515 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2516 {
2517 	uint32_t partial;
2518 	int nxt, cksum_offset;
2519 	struct ip6_hdr *ip6 = p;
2520 	uint16_t c;
2521 
2522 	nxt = ip6->ip6_nxt;
2523 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2524 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2525 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2526 					   IPPROTO_IPV6, &nxt);
2527 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2528 			return (1);
2529 	}
2530 
2531 	/*
2532 	 * IPv6 headers do not contain a checksum, and hence
2533 	 * do not checksum to zero, so they don't "fall out"
2534 	 * of the partial checksum calculation like IPv4
2535 	 * headers do.  We need to fix the partial checksum by
2536 	 * subtracting the checksum of the IPv6 header.
2537 	 */
2538 
2539 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2540 				    ETHER_HDR_LEN);
2541 	csum += ~partial;
2542 	csum += (csum < ~partial);
2543 	csum = (csum >> 16) + (csum & 0xFFFF);
2544 	csum = (csum >> 16) + (csum & 0xFFFF);
2545 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2546 			     csum);
2547 	c ^= 0xffff;
2548 	return (c);
2549 }
2550 #endif /* INET6 */
2551 /*
2552  *  Myri10GE hardware checksums are not valid if the sender
2553  *  padded the frame with non-zero padding.  This is because
2554  *  the firmware just does a simple 16-bit 1s complement
2555  *  checksum across the entire frame, excluding the first 14
2556  *  bytes.  It is best to simply check the checksum and
2557  *  tell the stack about it only if the checksum is good
2558  */
2559 
2560 static inline uint16_t
2561 mxge_rx_csum(struct mbuf *m, int csum)
2562 {
2563 	struct ether_header *eh;
2564 #ifdef INET
2565 	struct ip *ip;
2566 #endif
2567 #if defined(INET) || defined(INET6)
2568 	int cap = m->m_pkthdr.rcvif->if_capenable;
2569 #endif
2570 	uint16_t c, etype;
2571 
2572 
2573 	eh = mtod(m, struct ether_header *);
2574 	etype = ntohs(eh->ether_type);
2575 	switch (etype) {
2576 #ifdef INET
2577 	case ETHERTYPE_IP:
2578 		if ((cap & IFCAP_RXCSUM) == 0)
2579 			return (1);
2580 		ip = (struct ip *)(eh + 1);
2581 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2582 			return (1);
2583 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2584 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2585 				    (ip->ip_hl << 2) + ip->ip_p));
2586 		c ^= 0xffff;
2587 		break;
2588 #endif
2589 #ifdef INET6
2590 	case ETHERTYPE_IPV6:
2591 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2592 			return (1);
2593 		c = mxge_rx_csum6((eh + 1), m, csum);
2594 		break;
2595 #endif
2596 	default:
2597 		c = 1;
2598 	}
2599 	return (c);
2600 }
2601 
2602 static void
2603 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2604 {
2605 	struct ether_vlan_header *evl;
2606 	struct ether_header *eh;
2607 	uint32_t partial;
2608 
2609 	evl = mtod(m, struct ether_vlan_header *);
2610 	eh = mtod(m, struct ether_header *);
2611 
2612 	/*
2613 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2614 	 * after what the firmware thought was the end of the ethernet
2615 	 * header.
2616 	 */
2617 
2618 	/* put checksum into host byte order */
2619 	*csum = ntohs(*csum);
2620 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2621 	(*csum) += ~partial;
2622 	(*csum) +=  ((*csum) < ~partial);
2623 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625 
2626 	/* restore checksum to network byte order;
2627 	   later consumers expect this */
2628 	*csum = htons(*csum);
2629 
2630 	/* save the tag */
2631 #ifdef MXGE_NEW_VLAN_API
2632 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2633 #else
2634 	{
2635 		struct m_tag *mtag;
2636 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2637 				   M_NOWAIT);
2638 		if (mtag == NULL)
2639 			return;
2640 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2641 		m_tag_prepend(m, mtag);
2642 	}
2643 
2644 #endif
2645 	m->m_flags |= M_VLANTAG;
2646 
2647 	/*
2648 	 * Remove the 802.1q header by copying the Ethernet
2649 	 * addresses over it and adjusting the beginning of
2650 	 * the data in the mbuf.  The encapsulated Ethernet
2651 	 * type field is already in place.
2652 	 */
2653 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2654 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2655 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2656 }
2657 
2658 
2659 static inline void
2660 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2661 		 uint32_t csum, int lro)
2662 {
2663 	mxge_softc_t *sc;
2664 	struct ifnet *ifp;
2665 	struct mbuf *m;
2666 	struct ether_header *eh;
2667 	mxge_rx_ring_t *rx;
2668 	bus_dmamap_t old_map;
2669 	int idx;
2670 
2671 	sc = ss->sc;
2672 	ifp = sc->ifp;
2673 	rx = &ss->rx_big;
2674 	idx = rx->cnt & rx->mask;
2675 	rx->cnt += rx->nbufs;
2676 	/* save a pointer to the received mbuf */
2677 	m = rx->info[idx].m;
2678 	/* try to replace the received mbuf */
2679 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2680 		/* drop the frame -- the old mbuf is re-cycled */
2681 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2682 		return;
2683 	}
2684 
2685 	/* unmap the received buffer */
2686 	old_map = rx->info[idx].map;
2687 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2688 	bus_dmamap_unload(rx->dmat, old_map);
2689 
2690 	/* swap the bus_dmamap_t's */
2691 	rx->info[idx].map = rx->extra_map;
2692 	rx->extra_map = old_map;
2693 
2694 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2695 	 * aligned */
2696 	m->m_data += MXGEFW_PAD;
2697 
2698 	m->m_pkthdr.rcvif = ifp;
2699 	m->m_len = m->m_pkthdr.len = len;
2700 	ss->ipackets++;
2701 	eh = mtod(m, struct ether_header *);
2702 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2703 		mxge_vlan_tag_remove(m, &csum);
2704 	}
2705 	/* if the checksum is valid, mark it in the mbuf header */
2706 
2707 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2708 	    (0 == mxge_rx_csum(m, csum))) {
2709 		/* Tell the stack that the checksum is good */
2710 		m->m_pkthdr.csum_data = 0xffff;
2711 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2712 			CSUM_DATA_VALID;
2713 
2714 #if defined(INET) || defined (INET6)
2715 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2716 			return;
2717 #endif
2718 	}
2719 	/* flowid only valid if RSS hashing is enabled */
2720 	if (sc->num_slices > 1) {
2721 		m->m_pkthdr.flowid = (ss - sc->ss);
2722 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2723 	}
2724 	/* pass the frame up the stack */
2725 	(*ifp->if_input)(ifp, m);
2726 }
2727 
2728 static inline void
2729 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2730 		   uint32_t csum, int lro)
2731 {
2732 	mxge_softc_t *sc;
2733 	struct ifnet *ifp;
2734 	struct ether_header *eh;
2735 	struct mbuf *m;
2736 	mxge_rx_ring_t *rx;
2737 	bus_dmamap_t old_map;
2738 	int idx;
2739 
2740 	sc = ss->sc;
2741 	ifp = sc->ifp;
2742 	rx = &ss->rx_small;
2743 	idx = rx->cnt & rx->mask;
2744 	rx->cnt++;
2745 	/* save a pointer to the received mbuf */
2746 	m = rx->info[idx].m;
2747 	/* try to replace the received mbuf */
2748 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2749 		/* drop the frame -- the old mbuf is re-cycled */
2750 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2751 		return;
2752 	}
2753 
2754 	/* unmap the received buffer */
2755 	old_map = rx->info[idx].map;
2756 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2757 	bus_dmamap_unload(rx->dmat, old_map);
2758 
2759 	/* swap the bus_dmamap_t's */
2760 	rx->info[idx].map = rx->extra_map;
2761 	rx->extra_map = old_map;
2762 
2763 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2764 	 * aligned */
2765 	m->m_data += MXGEFW_PAD;
2766 
2767 	m->m_pkthdr.rcvif = ifp;
2768 	m->m_len = m->m_pkthdr.len = len;
2769 	ss->ipackets++;
2770 	eh = mtod(m, struct ether_header *);
2771 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2772 		mxge_vlan_tag_remove(m, &csum);
2773 	}
2774 	/* if the checksum is valid, mark it in the mbuf header */
2775 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2776 	    (0 == mxge_rx_csum(m, csum))) {
2777 		/* Tell the stack that the checksum is good */
2778 		m->m_pkthdr.csum_data = 0xffff;
2779 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2780 			CSUM_DATA_VALID;
2781 
2782 #if defined(INET) || defined (INET6)
2783 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2784 			return;
2785 #endif
2786 	}
2787 	/* flowid only valid if RSS hashing is enabled */
2788 	if (sc->num_slices > 1) {
2789 		m->m_pkthdr.flowid = (ss - sc->ss);
2790 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2791 	}
2792 	/* pass the frame up the stack */
2793 	(*ifp->if_input)(ifp, m);
2794 }
2795 
2796 static inline void
2797 mxge_clean_rx_done(struct mxge_slice_state *ss)
2798 {
2799 	mxge_rx_done_t *rx_done = &ss->rx_done;
2800 	int limit = 0;
2801 	uint16_t length;
2802 	uint16_t checksum;
2803 	int lro;
2804 
2805 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806 	while (rx_done->entry[rx_done->idx].length != 0) {
2807 		length = ntohs(rx_done->entry[rx_done->idx].length);
2808 		rx_done->entry[rx_done->idx].length = 0;
2809 		checksum = rx_done->entry[rx_done->idx].checksum;
2810 		if (length <= (MHLEN - MXGEFW_PAD))
2811 			mxge_rx_done_small(ss, length, checksum, lro);
2812 		else
2813 			mxge_rx_done_big(ss, length, checksum, lro);
2814 		rx_done->cnt++;
2815 		rx_done->idx = rx_done->cnt & rx_done->mask;
2816 
2817 		/* limit potential for livelock */
2818 		if (__predict_false(++limit > rx_done->mask / 2))
2819 			break;
2820 	}
2821 #if defined(INET)  || defined (INET6)
2822 	tcp_lro_flush_all(&ss->lc);
2823 #endif
2824 }
2825 
2826 
2827 static inline void
2828 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2829 {
2830 	struct ifnet *ifp;
2831 	mxge_tx_ring_t *tx;
2832 	struct mbuf *m;
2833 	bus_dmamap_t map;
2834 	int idx;
2835 	int *flags;
2836 
2837 	tx = &ss->tx;
2838 	ifp = ss->sc->ifp;
2839 	while (tx->pkt_done != mcp_idx) {
2840 		idx = tx->done & tx->mask;
2841 		tx->done++;
2842 		m = tx->info[idx].m;
2843 		/* mbuf and DMA map only attached to the first
2844 		   segment per-mbuf */
2845 		if (m != NULL) {
2846 			ss->obytes += m->m_pkthdr.len;
2847 			if (m->m_flags & M_MCAST)
2848 				ss->omcasts++;
2849 			ss->opackets++;
2850 			tx->info[idx].m = NULL;
2851 			map = tx->info[idx].map;
2852 			bus_dmamap_unload(tx->dmat, map);
2853 			m_freem(m);
2854 		}
2855 		if (tx->info[idx].flag) {
2856 			tx->info[idx].flag = 0;
2857 			tx->pkt_done++;
2858 		}
2859 	}
2860 
2861 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2862 	   it's OK to send packets */
2863 #ifdef IFNET_BUF_RING
2864 	flags = &ss->if_drv_flags;
2865 #else
2866 	flags = &ifp->if_drv_flags;
2867 #endif
2868 	mtx_lock(&ss->tx.mtx);
2869 	if ((*flags) & IFF_DRV_OACTIVE &&
2870 	    tx->req - tx->done < (tx->mask + 1)/4) {
2871 		*(flags) &= ~IFF_DRV_OACTIVE;
2872 		ss->tx.wake++;
2873 		mxge_start_locked(ss);
2874 	}
2875 #ifdef IFNET_BUF_RING
2876 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2877 		/* let the NIC stop polling this queue, since there
2878 		 * are no more transmits pending */
2879 		if (tx->req == tx->done) {
2880 			*tx->send_stop = 1;
2881 			tx->queue_active = 0;
2882 			tx->deactivate++;
2883 			wmb();
2884 		}
2885 	}
2886 #endif
2887 	mtx_unlock(&ss->tx.mtx);
2888 
2889 }
2890 
2891 static struct mxge_media_type mxge_xfp_media_types[] =
2892 {
2893 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2894 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2895 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2896 	{0,		(1 << 5),	"10GBASE-ER"},
2897 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2898 	{0,		(1 << 3),	"10GBASE-SW"},
2899 	{0,		(1 << 2),	"10GBASE-LW"},
2900 	{0,		(1 << 1),	"10GBASE-EW"},
2901 	{0,		(1 << 0),	"Reserved"}
2902 };
2903 static struct mxge_media_type mxge_sfp_media_types[] =
2904 {
2905 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2906 	{0,		(1 << 7),	"Reserved"},
2907 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2908 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2909 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2910 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2911 };
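/*
 * Note: the bitmask column holds module compliance bits as read over
 * I2C in mxge_media_probe() below; entry 0 of each table is special-
 * cased by an equality compare, while the rest are matched with a
 * bitwise AND against the byte returned by the firmware.
 */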
2912 
2913 static void
2914 mxge_media_set(mxge_softc_t *sc, int media_type)
2915 {
2916 
2917 
2918 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2919 		    0, NULL);
2920 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2921 	sc->current_media = media_type;
2922 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2923 }
2924 
2925 static void
2926 mxge_media_init(mxge_softc_t *sc)
2927 {
2928 	char *ptr;
2929 	int i;
2930 
2931 	ifmedia_removeall(&sc->media);
2932 	mxge_media_set(sc, IFM_AUTO);
2933 
2934 	/*
2935 	 * parse the product code to determine the interface type
2936 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2937 	 * after the 3rd dash in the driver's cached copy of the
2938 	 * EEPROM's product code string.
2939 	 */
2940 	ptr = sc->product_code_string;
2941 	if (ptr == NULL) {
2942 		device_printf(sc->dev, "Missing product code\n");
2943 		return;
2944 	}
2945 
2946 	for (i = 0; i < 3; i++, ptr++) {
2947 		ptr = strchr(ptr, '-');
2948 		if (ptr == NULL) {
2949 			device_printf(sc->dev,
2950 				      "only %d dashes in PC?!?\n", i);
2951 			return;
2952 		}
2953 	}
2954 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2955 		/* -C is CX4 */
2956 		sc->connector = MXGE_CX4;
2957 		mxge_media_set(sc, IFM_10G_CX4);
2958 	} else if (*ptr == 'Q') {
2959 		/* -Q is Quad Ribbon Fiber */
2960 		sc->connector = MXGE_QRF;
2961 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2962 		/* FreeBSD has no media type for Quad ribbon fiber */
2963 	} else if (*ptr == 'R') {
2964 		/* -R is XFP */
2965 		sc->connector = MXGE_XFP;
2966 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2967 		/* -S or -2S is SFP+ */
2968 		sc->connector = MXGE_SFP;
2969 	} else {
2970 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2971 	}
2972 }
2973 
2974 /*
2975  * Determine the media type for a NIC.  Some XFPs will identify
2976  * themselves only when their link is up, so this is initiated via a
2977  * link up interrupt.  However, this can potentially take up to
2978  * several milliseconds, so it is run via the watchdog routine, rather
2979  * than in the interrupt handler itself.
2980  */
2981 static void
2982 mxge_media_probe(mxge_softc_t *sc)
2983 {
2984 	mxge_cmd_t cmd;
2985 	char *cage_type;
2986 
2987 	struct mxge_media_type *mxge_media_types = NULL;
2988 	int i, err, ms, mxge_media_type_entries;
2989 	uint32_t byte;
2990 
2991 	sc->need_media_probe = 0;
2992 
2993 	if (sc->connector == MXGE_XFP) {
2994 		/* -R is XFP */
2995 		mxge_media_types = mxge_xfp_media_types;
2996 		mxge_media_type_entries =
2997 			nitems(mxge_xfp_media_types);
2998 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2999 		cage_type = "XFP";
3000 	} else if (sc->connector == MXGE_SFP) {
3001 		/* -S or -2S is SFP+ */
3002 		mxge_media_types = mxge_sfp_media_types;
3003 		mxge_media_type_entries =
3004 			nitems(mxge_sfp_media_types);
3005 		cage_type = "SFP+";
3006 		byte = 3;
3007 	} else {
3008 		/* nothing to do; media type cannot change */
3009 		return;
3010 	}
3011 
3012 	/*
3013 	 * At this point we know the NIC has a module cage (XFP or
3014 	 * SFP+), so now we try to determine what is in the cage by
3015 	 * using the firmware's I2C commands to read the module's
3016 	 * 10GbE compliance register.  We read just one byte, which
3017 	 * may take over a millisecond.
3018 	 */
3019 
3020 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3021 	cmd.data1 = byte;
3022 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3023 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3024 		device_printf(sc->dev, "failed to read XFP\n");
3025 	}
3026 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3027 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3028 	}
3029 	if (err != MXGEFW_CMD_OK) {
3030 		return;
3031 	}
3032 
3033 	/* now we wait for the data to be cached */
3034 	cmd.data0 = byte;
3035 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3036 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3037 		DELAY(1000);
3038 		cmd.data0 = byte;
3039 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3040 	}
3041 	if (err != MXGEFW_CMD_OK) {
3042 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3043 			      cage_type, err, ms);
3044 		return;
3045 	}
3046 
3047 	if (cmd.data0 == mxge_media_types[0].bitmask) {
3048 		if (mxge_verbose)
3049 			device_printf(sc->dev, "%s:%s\n", cage_type,
3050 				      mxge_media_types[0].name);
3051 		if (sc->current_media != mxge_media_types[0].flag) {
3052 			mxge_media_init(sc);
3053 			mxge_media_set(sc, mxge_media_types[0].flag);
3054 		}
3055 		return;
3056 	}
3057 	for (i = 1; i < mxge_media_type_entries; i++) {
3058 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3059 			if (mxge_verbose)
3060 				device_printf(sc->dev, "%s:%s\n",
3061 					      cage_type,
3062 					      mxge_media_types[i].name);
3063 
3064 			if (sc->current_media != mxge_media_types[i].flag) {
3065 				mxge_media_init(sc);
3066 				mxge_media_set(sc, mxge_media_types[i].flag);
3067 			}
3068 			return;
3069 		}
3070 	}
3071 	if (mxge_verbose)
3072 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3073 			      cage_type, cmd.data0);
3074 
3075 	return;
3076 }
3077 
3078 static void
3079 mxge_intr(void *arg)
3080 {
3081 	struct mxge_slice_state *ss = arg;
3082 	mxge_softc_t *sc = ss->sc;
3083 	mcp_irq_data_t *stats = ss->fw_stats;
3084 	mxge_tx_ring_t *tx = &ss->tx;
3085 	mxge_rx_done_t *rx_done = &ss->rx_done;
3086 	uint32_t send_done_count;
3087 	uint8_t valid;
3088 
3089 
3090 #ifndef IFNET_BUF_RING
3091 	/* an interrupt on a non-zero slice is implicitly valid
3092 	   since MSI-X irqs are not shared */
3093 	if (ss != sc->ss) {
3094 		mxge_clean_rx_done(ss);
3095 		*ss->irq_claim = be32toh(3);
3096 		return;
3097 	}
3098 #endif
3099 
3100 	/* make sure the DMA has finished */
3101 	if (!stats->valid) {
3102 		return;
3103 	}
3104 	valid = stats->valid;
3105 
3106 	if (sc->legacy_irq) {
3107 		/* lower legacy IRQ  */
3108 		*sc->irq_deassert = 0;
3109 		if (!mxge_deassert_wait)
3110 			/* don't wait for confirmation that irq is low */
3111 			stats->valid = 0;
3112 	} else {
3113 		stats->valid = 0;
3114 	}
3115 
3116 	/* loop while waiting for legacy irq deassertion */
3117 	do {
3118 		/* check for transmit completes and receives */
3119 		send_done_count = be32toh(stats->send_done_count);
3120 		while ((send_done_count != tx->pkt_done) ||
3121 		       (rx_done->entry[rx_done->idx].length != 0)) {
3122 			if (send_done_count != tx->pkt_done)
3123 				mxge_tx_done(ss, (int)send_done_count);
3124 			mxge_clean_rx_done(ss);
3125 			send_done_count = be32toh(stats->send_done_count);
3126 		}
3127 		if (sc->legacy_irq && mxge_deassert_wait)
3128 			wmb();
3129 	} while (*((volatile uint8_t *) &stats->valid));
3130 
3131 	/* fw link & error stats meaningful only on the first slice */
3132 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3133 		if (sc->link_state != stats->link_up) {
3134 			sc->link_state = stats->link_up;
3135 			if (sc->link_state) {
3136 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3137 				if (mxge_verbose)
3138 					device_printf(sc->dev, "link up\n");
3139 			} else {
3140 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3141 				if (mxge_verbose)
3142 					device_printf(sc->dev, "link down\n");
3143 			}
3144 			sc->need_media_probe = 1;
3145 		}
3146 		if (sc->rdma_tags_available !=
3147 		    be32toh(stats->rdma_tags_available)) {
3148 			sc->rdma_tags_available =
3149 				be32toh(stats->rdma_tags_available);
3150 			device_printf(sc->dev, "RDMA timed out! %d tags "
3151 				      "left\n", sc->rdma_tags_available);
3152 		}
3153 
3154 		if (stats->link_down) {
3155 			sc->down_cnt += stats->link_down;
3156 			sc->link_state = 0;
3157 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3158 		}
3159 	}
3160 
3161 	/* check to see if we have rx token to pass back */
3162 	if (valid & 0x1)
3163 		*ss->irq_claim = be32toh(3);
3164 	*(ss->irq_claim + 1) = be32toh(3);
3165 }
3166 
3167 static void
3168 mxge_init(void *arg)
3169 {
3170 	mxge_softc_t *sc = arg;
3171 	struct ifnet *ifp = sc->ifp;
3172 
3173 
3174 	mtx_lock(&sc->driver_mtx);
3175 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3176 		(void) mxge_open(sc);
3177 	mtx_unlock(&sc->driver_mtx);
3178 }
3179 
3180 
3181 
3182 static void
3183 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3184 {
3185 	int i;
3186 
3187 #if defined(INET) || defined(INET6)
3188 	tcp_lro_free(&ss->lc);
3189 #endif
3190 	for (i = 0; i <= ss->rx_big.mask; i++) {
3191 		if (ss->rx_big.info[i].m == NULL)
3192 			continue;
3193 		bus_dmamap_unload(ss->rx_big.dmat,
3194 				  ss->rx_big.info[i].map);
3195 		m_freem(ss->rx_big.info[i].m);
3196 		ss->rx_big.info[i].m = NULL;
3197 	}
3198 
3199 	for (i = 0; i <= ss->rx_small.mask; i++) {
3200 		if (ss->rx_small.info[i].m == NULL)
3201 			continue;
3202 		bus_dmamap_unload(ss->rx_small.dmat,
3203 				  ss->rx_small.info[i].map);
3204 		m_freem(ss->rx_small.info[i].m);
3205 		ss->rx_small.info[i].m = NULL;
3206 	}
3207 
3208 	/* transmit ring used only on the first slice */
3209 	if (ss->tx.info == NULL)
3210 		return;
3211 
3212 	for (i = 0; i <= ss->tx.mask; i++) {
3213 		ss->tx.info[i].flag = 0;
3214 		if (ss->tx.info[i].m == NULL)
3215 			continue;
3216 		bus_dmamap_unload(ss->tx.dmat,
3217 				  ss->tx.info[i].map);
3218 		m_freem(ss->tx.info[i].m);
3219 		ss->tx.info[i].m = NULL;
3220 	}
3221 }
3222 
3223 static void
3224 mxge_free_mbufs(mxge_softc_t *sc)
3225 {
3226 	int slice;
3227 
3228 	for (slice = 0; slice < sc->num_slices; slice++)
3229 		mxge_free_slice_mbufs(&sc->ss[slice]);
3230 }
3231 
3232 static void
3233 mxge_free_slice_rings(struct mxge_slice_state *ss)
3234 {
3235 	int i;
3236 
3237 
3238 	if (ss->rx_done.entry != NULL)
3239 		mxge_dma_free(&ss->rx_done.dma);
3240 	ss->rx_done.entry = NULL;
3241 
3242 	if (ss->tx.req_bytes != NULL)
3243 		free(ss->tx.req_bytes, M_DEVBUF);
3244 	ss->tx.req_bytes = NULL;
3245 
3246 	if (ss->tx.seg_list != NULL)
3247 		free(ss->tx.seg_list, M_DEVBUF);
3248 	ss->tx.seg_list = NULL;
3249 
3250 	if (ss->rx_small.shadow != NULL)
3251 		free(ss->rx_small.shadow, M_DEVBUF);
3252 	ss->rx_small.shadow = NULL;
3253 
3254 	if (ss->rx_big.shadow != NULL)
3255 		free(ss->rx_big.shadow, M_DEVBUF);
3256 	ss->rx_big.shadow = NULL;
3257 
3258 	if (ss->tx.info != NULL) {
3259 		if (ss->tx.dmat != NULL) {
3260 			for (i = 0; i <= ss->tx.mask; i++) {
3261 				bus_dmamap_destroy(ss->tx.dmat,
3262 						   ss->tx.info[i].map);
3263 			}
3264 			bus_dma_tag_destroy(ss->tx.dmat);
3265 		}
3266 		free(ss->tx.info, M_DEVBUF);
3267 	}
3268 	ss->tx.info = NULL;
3269 
3270 	if (ss->rx_small.info != NULL) {
3271 		if (ss->rx_small.dmat != NULL) {
3272 			for (i = 0; i <= ss->rx_small.mask; i++) {
3273 				bus_dmamap_destroy(ss->rx_small.dmat,
3274 						   ss->rx_small.info[i].map);
3275 			}
3276 			bus_dmamap_destroy(ss->rx_small.dmat,
3277 					   ss->rx_small.extra_map);
3278 			bus_dma_tag_destroy(ss->rx_small.dmat);
3279 		}
3280 		free(ss->rx_small.info, M_DEVBUF);
3281 	}
3282 	ss->rx_small.info = NULL;
3283 
3284 	if (ss->rx_big.info != NULL) {
3285 		if (ss->rx_big.dmat != NULL) {
3286 			for (i = 0; i <= ss->rx_big.mask; i++) {
3287 				bus_dmamap_destroy(ss->rx_big.dmat,
3288 						   ss->rx_big.info[i].map);
3289 			}
3290 			bus_dmamap_destroy(ss->rx_big.dmat,
3291 					   ss->rx_big.extra_map);
3292 			bus_dma_tag_destroy(ss->rx_big.dmat);
3293 		}
3294 		free(ss->rx_big.info, M_DEVBUF);
3295 	}
3296 	ss->rx_big.info = NULL;
3297 }
3298 
3299 static void
3300 mxge_free_rings(mxge_softc_t *sc)
3301 {
3302 	int slice;
3303 
3304 	for (slice = 0; slice < sc->num_slices; slice++)
3305 		mxge_free_slice_rings(&sc->ss[slice]);
3306 }
3307 
3308 static int
3309 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3310 		       int tx_ring_entries)
3311 {
3312 	mxge_softc_t *sc = ss->sc;
3313 	size_t bytes;
3314 	int err, i;
3315 
3316 	/* allocate per-slice receive resources */
3317 
3318 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3319 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3320 
3321 	/* allocate the rx shadow rings */
3322 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3323 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3324 
3325 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3326 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3327 
3328 	/* allocate the rx host info rings */
3329 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3330 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3331 
3332 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3333 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3334 
3335 	/* allocate the rx busdma resources */
3336 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3337 				 1,			/* alignment */
3338 				 4096,			/* boundary */
3339 				 BUS_SPACE_MAXADDR,	/* low */
3340 				 BUS_SPACE_MAXADDR,	/* high */
3341 				 NULL, NULL,		/* filter */
3342 				 MHLEN,			/* maxsize */
3343 				 1,			/* num segs */
3344 				 MHLEN,			/* maxsegsize */
3345 				 BUS_DMA_ALLOCNOW,	/* flags */
3346 				 NULL, NULL,		/* lock */
3347 				 &ss->rx_small.dmat);	/* tag */
3348 	if (err != 0) {
3349 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3350 			      err);
3351 		return err;
3352 	}
3353 
3354 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3355 				 1,			/* alignment */
3356 #if MXGE_VIRT_JUMBOS
3357 				 4096,			/* boundary */
3358 #else
3359 				 0,			/* boundary */
3360 #endif
3361 				 BUS_SPACE_MAXADDR,	/* low */
3362 				 BUS_SPACE_MAXADDR,	/* high */
3363 				 NULL, NULL,		/* filter */
3364 				 3*4096,		/* maxsize */
3365 #if MXGE_VIRT_JUMBOS
3366 				 3,			/* num segs */
3367 				 4096,			/* maxsegsize*/
3368 #else
3369 				 1,			/* num segs */
3370 				 MJUM9BYTES,		/* maxsegsize*/
3371 #endif
3372 				 BUS_DMA_ALLOCNOW,	/* flags */
3373 				 NULL, NULL,		/* lock */
3374 				 &ss->rx_big.dmat);	/* tag */
3375 	if (err != 0) {
3376 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3377 			      err);
3378 		return err;
3379 	}
3380 	for (i = 0; i <= ss->rx_small.mask; i++) {
3381 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3382 					&ss->rx_small.info[i].map);
3383 		if (err != 0) {
3384 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3385 				      err);
3386 			return err;
3387 		}
3388 	}
3389 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3390 				&ss->rx_small.extra_map);
3391 	if (err != 0) {
3392 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3393 			      err);
3394 		return err;
3395 	}
3396 
3397 	for (i = 0; i <= ss->rx_big.mask; i++) {
3398 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3399 					&ss->rx_big.info[i].map);
3400 		if (err != 0) {
3401 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3402 				      err);
3403 			return err;
3404 		}
3405 	}
3406 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3407 				&ss->rx_big.extra_map);
3408 	if (err != 0) {
3409 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3410 			      err);
3411 		return err;
3412 	}
3413 
3414 	/* now allocate TX resources */
3415 
3416 #ifndef IFNET_BUF_RING
3417 	/* only use a single TX ring for now */
3418 	if (ss != ss->sc->ss)
3419 		return 0;
3420 #endif
3421 
3422 	ss->tx.mask = tx_ring_entries - 1;
3423 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3424 
3425 
3426 	/* allocate the tx request copy block */
3427 	bytes = 8 +
3428 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3429 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3430 	/* ensure req_list entries are aligned to 8 bytes */
3431 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3432 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
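	/*
	 * "(p + 7) & ~7" is the usual round-up-to-a-multiple-of-8
	 * idiom; the 8 spare bytes in the allocation above guarantee
	 * the aligned pointer still lands inside the block.
	 */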
3433 
3434 	/* allocate the tx busdma segment list */
3435 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3436 	ss->tx.seg_list = (bus_dma_segment_t *)
3437 		malloc(bytes, M_DEVBUF, M_WAITOK);
3438 
3439 	/* allocate the tx host info ring */
3440 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3441 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3442 
3443 	/* allocate the tx busdma resources */
3444 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3445 				 1,			/* alignment */
3446 				 sc->tx_boundary,	/* boundary */
3447 				 BUS_SPACE_MAXADDR,	/* low */
3448 				 BUS_SPACE_MAXADDR,	/* high */
3449 				 NULL, NULL,		/* filter */
3450 				 65536 + 256,		/* maxsize */
3451 				 ss->tx.max_desc - 2,	/* num segs */
3452 				 sc->tx_boundary,	/* maxsegsz */
3453 				 BUS_DMA_ALLOCNOW,	/* flags */
3454 				 NULL, NULL,		/* lock */
3455 				 &ss->tx.dmat);		/* tag */
3456 
3457 	if (err != 0) {
3458 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3459 			      err);
3460 		return err;
3461 	}
3462 
3463 	/* now use these tags to set up dmamaps for each slot
3464 	   in the ring */
3465 	for (i = 0; i <= ss->tx.mask; i++) {
3466 		err = bus_dmamap_create(ss->tx.dmat, 0,
3467 					&ss->tx.info[i].map);
3468 		if (err != 0) {
3469 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3470 				      err);
3471 			return err;
3472 		}
3473 	}
3474 	return 0;
3475 
3476 }
3477 
3478 static int
3479 mxge_alloc_rings(mxge_softc_t *sc)
3480 {
3481 	mxge_cmd_t cmd;
3482 	int tx_ring_size;
3483 	int tx_ring_entries, rx_ring_entries;
3484 	int err, slice;
3485 
3486 	/* get ring sizes */
3487 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3488 	tx_ring_size = cmd.data0;
3489 	if (err != 0) {
3490 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3491 		goto abort;
3492 	}
3493 
3494 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3495 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3496 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3497 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3498 	IFQ_SET_READY(&sc->ifp->if_snd);
3499 
3500 	for (slice = 0; slice < sc->num_slices; slice++) {
3501 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3502 					     rx_ring_entries,
3503 					     tx_ring_entries);
3504 		if (err != 0)
3505 			goto abort;
3506 	}
3507 	return 0;
3508 
3509 abort:
3510 	mxge_free_rings(sc);
3511 	return err;
3512 
3513 }
3514 
3515 
3516 static void
3517 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3518 {
3519 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3520 
3521 	if (bufsize < MCLBYTES) {
3522 		/* easy, everything fits in a single buffer */
3523 		*big_buf_size = MCLBYTES;
3524 		*cl_size = MCLBYTES;
3525 		*nbufs = 1;
3526 		return;
3527 	}
3528 
3529 	if (bufsize < MJUMPAGESIZE) {
3530 		/* still easy, everything still fits in a single buffer */
3531 		*big_buf_size = MJUMPAGESIZE;
3532 		*cl_size = MJUMPAGESIZE;
3533 		*nbufs = 1;
3534 		return;
3535 	}
3536 #if MXGE_VIRT_JUMBOS
3537 	/* now we need to use virtually contiguous buffers */
3538 	*cl_size = MJUM9BYTES;
3539 	*big_buf_size = 4096;
3540 	*nbufs = mtu / 4096 + 1;
3541 	/* needs to be a power of two, so round up */
3542 	if (*nbufs == 3)
3543 		*nbufs = 4;
3544 #else
3545 	*cl_size = MJUM9BYTES;
3546 	*big_buf_size = MJUM9BYTES;
3547 	*nbufs = 1;
3548 #endif
3549 }
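/*
 * A worked example of the sizing above: with the standard 1500-byte
 * MTU, bufsize = 1500 + 14 + 4 + MXGEFW_PAD, comfortably under
 * MCLBYTES (2048), so one 2KB cluster holds a full frame.  A
 * 9000-byte jumbo MTU falls through to the 9KB-cluster case (or,
 * with MXGE_VIRT_JUMBOS, to multiple 4KB buffers per frame).
 */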
3550 
3551 static int
3552 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3553 {
3554 	mxge_softc_t *sc;
3555 	mxge_cmd_t cmd;
3556 	bus_dmamap_t map;
3557 	int err, i, slice;
3558 
3559 
3560 	sc = ss->sc;
3561 	slice = ss - sc->ss;
3562 
3563 #if defined(INET) || defined(INET6)
3564 	(void)tcp_lro_init(&ss->lc);
3565 #endif
3566 	ss->lc.ifp = sc->ifp;
3567 
3568 	/* get the lanai pointers to the send and receive rings */
3569 
3570 	err = 0;
3571 #ifndef IFNET_BUF_RING
3572 	/* We currently only send from the first slice */
3573 	if (slice == 0) {
3574 #endif
3575 		cmd.data0 = slice;
3576 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3577 		ss->tx.lanai =
3578 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3579 		ss->tx.send_go = (volatile uint32_t *)
3580 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3581 		ss->tx.send_stop = (volatile uint32_t *)
3582 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3583 #ifndef IFNET_BUF_RING
3584 	}
3585 #endif
3586 	cmd.data0 = slice;
3587 	err |= mxge_send_cmd(sc,
3588 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3589 	ss->rx_small.lanai =
3590 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3591 	cmd.data0 = slice;
3592 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3593 	ss->rx_big.lanai =
3594 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3595 
3596 	if (err != 0) {
3597 		device_printf(sc->dev,
3598 			      "failed to get ring sizes or locations\n");
3599 		return EIO;
3600 	}
3601 
3602 	/* stock receive rings */
3603 	for (i = 0; i <= ss->rx_small.mask; i++) {
3604 		map = ss->rx_small.info[i].map;
3605 		err = mxge_get_buf_small(ss, map, i);
3606 		if (err) {
3607 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3608 				      i, ss->rx_small.mask + 1);
3609 			return ENOMEM;
3610 		}
3611 	}
3612 	for (i = 0; i <= ss->rx_big.mask; i++) {
3613 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3614 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3615 	}
3616 	ss->rx_big.nbufs = nbufs;
3617 	ss->rx_big.cl_size = cl_size;
3618 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3619 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3620 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3621 		map = ss->rx_big.info[i].map;
3622 		err = mxge_get_buf_big(ss, map, i);
3623 		if (err) {
3624 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3625 				      i, ss->rx_big.mask + 1);
3626 			return ENOMEM;
3627 		}
3628 	}
3629 	return 0;
3630 }
3631 
3632 static int
3633 mxge_open(mxge_softc_t *sc)
3634 {
3635 	mxge_cmd_t cmd;
3636 	int err, big_bytes, nbufs, slice, cl_size, i;
3637 	bus_addr_t bus;
3638 	volatile uint8_t *itable;
3639 	struct mxge_slice_state *ss;
3640 
3641 	/* Copy the MAC address in case it was overridden */
3642 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3643 
3644 	err = mxge_reset(sc, 1);
3645 	if (err != 0) {
3646 		device_printf(sc->dev, "failed to reset\n");
3647 		return EIO;
3648 	}
3649 
3650 	if (sc->num_slices > 1) {
3651 		/* setup the indirection table */
3652 		cmd.data0 = sc->num_slices;
3653 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3654 				    &cmd);
3655 
3656 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3657 				     &cmd);
3658 		if (err != 0) {
3659 			device_printf(sc->dev,
3660 				      "failed to setup rss tables\n");
3661 			return err;
3662 		}
3663 
3664 		/* just enable an identity mapping */
3665 		itable = sc->sram + cmd.data0;
3666 		for (i = 0; i < sc->num_slices; i++)
3667 			itable[i] = (uint8_t)i;
3668 
3669 		cmd.data0 = 1;
3670 		cmd.data1 = mxge_rss_hash_type;
3671 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3672 		if (err != 0) {
3673 			device_printf(sc->dev, "failed to enable slices\n");
3674 			return err;
3675 		}
3676 	}
3677 
3678 
3679 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3680 
3681 	cmd.data0 = nbufs;
3682 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3683 			    &cmd);
3684 	/* error is only meaningful if we're trying to set
3685 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3686 	if (err && nbufs > 1) {
3687 		device_printf(sc->dev,
3688 			      "Failed to set alway-use-n to %d\n",
3689 			      nbufs);
3690 		return EIO;
3691 	}
3692 	/* Give the firmware the mtu and the big and small buffer
3693 	   sizes.  The firmware wants the big buf size to be a power
3694 	   of two. Luckily, FreeBSD's clusters are powers of two */
3695 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3696 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3697 	cmd.data0 = MHLEN - MXGEFW_PAD;
3698 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3699 			     &cmd);
3700 	cmd.data0 = big_bytes;
3701 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3702 
3703 	if (err != 0) {
3704 		device_printf(sc->dev, "failed to setup params\n");
3705 		goto abort;
3706 	}
3707 
3708 	/* Now give the firmware the pointer to the stats block */
3709 	for (slice = 0;
3710 #ifdef IFNET_BUF_RING
3711 	     slice < sc->num_slices;
3712 #else
3713 	     slice < 1;
3714 #endif
3715 	     slice++) {
3716 		ss = &sc->ss[slice];
3717 		cmd.data0 =
3718 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3719 		cmd.data1 =
3720 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3721 		cmd.data2 = sizeof(struct mcp_irq_data);
3722 		cmd.data2 |= (slice << 16);
3723 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3724 	}
3725 
3726 	if (err != 0) {
3727 		bus = sc->ss->fw_stats_dma.bus_addr;
3728 		bus += offsetof(struct mcp_irq_data, send_done_count);
3729 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3730 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3731 		err = mxge_send_cmd(sc,
3732 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3733 				    &cmd);
3734 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3735 		sc->fw_multicast_support = 0;
3736 	} else {
3737 		sc->fw_multicast_support = 1;
3738 	}
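	/*
	 * In the V2 loop above, cmd.data2 carries the stats block size
	 * in its low 16 bits and the slice number in its high 16 bits.
	 * The obsolete fallback can only DMA the lone send_done_count
	 * word, hence the offsetof() and the loss of multicast support.
	 */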
3739 
3740 	if (err != 0) {
3741 		device_printf(sc->dev, "failed to setup stats DMA\n");
3742 		goto abort;
3743 	}
3744 
3745 	for (slice = 0; slice < sc->num_slices; slice++) {
3746 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3747 		if (err != 0) {
3748 			device_printf(sc->dev, "couldn't open slice %d\n",
3749 				      slice);
3750 			goto abort;
3751 		}
3752 	}
3753 
3754 	/* Finally, start the firmware running */
3755 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3756 	if (err) {
3757 		device_printf(sc->dev, "Couldn't bring up link\n");
3758 		goto abort;
3759 	}
3760 #ifdef IFNET_BUF_RING
3761 	for (slice = 0; slice < sc->num_slices; slice++) {
3762 		ss = &sc->ss[slice];
3763 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3764 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3765 	}
3766 #endif
3767 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3768 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3769 
3770 	return 0;
3771 
3772 
3773 abort:
3774 	mxge_free_mbufs(sc);
3775 
3776 	return err;
3777 }
3778 
3779 static int
3780 mxge_close(mxge_softc_t *sc, int down)
3781 {
3782 	mxge_cmd_t cmd;
3783 	int err, old_down_cnt;
3784 #ifdef IFNET_BUF_RING
3785 	struct mxge_slice_state *ss;
3786 	int slice;
3787 #endif
3788 
3789 #ifdef IFNET_BUF_RING
3790 	for (slice = 0; slice < sc->num_slices; slice++) {
3791 		ss = &sc->ss[slice];
3792 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3793 	}
3794 #endif
3795 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
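	/*
	 * The firmware acknowledges ETHERNET_DOWN with an interrupt
	 * that increments sc->down_cnt; snapshotting the counter before
	 * the command and re-checking it afterwards is how we tell
	 * whether the down irq actually arrived.
	 */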
3796 	if (!down) {
3797 		old_down_cnt = sc->down_cnt;
3798 		wmb();
3799 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3800 		if (err) {
3801 			device_printf(sc->dev,
3802 				      "Couldn't bring down link\n");
3803 		}
3804 		if (old_down_cnt == sc->down_cnt) {
3805 			/* wait for down irq */
3806 			DELAY(10 * sc->intr_coal_delay);
3807 		}
3808 		wmb();
3809 		if (old_down_cnt == sc->down_cnt) {
3810 			device_printf(sc->dev, "never got down irq\n");
3811 		}
3812 	}
3813 	mxge_free_mbufs(sc);
3814 
3815 	return 0;
3816 }
3817 
3818 static void
3819 mxge_setup_cfg_space(mxge_softc_t *sc)
3820 {
3821 	device_t dev = sc->dev;
3822 	int reg;
3823 	uint16_t lnk, pectl;
3824 
3825 	/* find the PCIe link width and set max read request to 4KB */
3826 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
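		/*
		 * Offsets are relative to the PCIe capability: 0x12 is
		 * the Link Status register (negotiated width in bits
		 * 9:4) and 0x8 is Device Control, whose bits 14:12
		 * encode the max read request size (5 => 4096 bytes).
		 */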
3827 		lnk = pci_read_config(dev, reg + 0x12, 2);
3828 		sc->link_width = (lnk >> 4) & 0x3f;
3829 
3830 		if (sc->pectl == 0) {
3831 			pectl = pci_read_config(dev, reg + 0x8, 2);
3832 			pectl = (pectl & ~0x7000) | (5 << 12);
3833 			pci_write_config(dev, reg + 0x8, pectl, 2);
3834 			sc->pectl = pectl;
3835 		} else {
3836 			/* restore saved pectl after watchdog reset */
3837 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3838 		}
3839 	}
3840 
3841 	/* Enable DMA and Memory space access */
3842 	pci_enable_busmaster(dev);
3843 }
3844 
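/*
 * Fetch the firmware reboot-status word through the Myricom
 * vendor-specific capability, which provides an indirect register
 * window: vs+0x10 selects the access mode, vs+0x18 holds the target
 * address (0xfffffff0 appears to be the reboot-status register), and
 * vs+0x14 returns the data.
 */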
3845 static uint32_t
3846 mxge_read_reboot(mxge_softc_t *sc)
3847 {
3848 	device_t dev = sc->dev;
3849 	uint32_t vs;
3850 
3851 	/* find the vendor specific offset */
3852 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3853 		device_printf(sc->dev,
3854 			      "could not find vendor specific offset\n");
3855 		return (uint32_t)-1;
3856 	}
3857 	/* enable read32 mode */
3858 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3859 	/* tell NIC which register to read */
3860 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3861 	return (pci_read_config(dev, vs + 0x14, 4));
3862 }
3863 
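/*
 * Recovery path run from the watchdog task: if the NIC rebooted (its
 * PCI busmaster bit reads back clear), quiesce transmit by taking
 * every tx lock, close the interface, restore PCI config space,
 * reload the firmware and reopen.  If busmastering is still enabled,
 * the NIC never rebooted and nothing is reset.
 */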
3864 static void
3865 mxge_watchdog_reset(mxge_softc_t *sc)
3866 {
3867 	struct pci_devinfo *dinfo;
3868 	struct mxge_slice_state *ss;
3869 	int err, running, s, num_tx_slices = 1;
3870 	uint32_t reboot;
3871 	uint16_t cmd;
3872 
3873 	err = ENXIO;
3874 
3875 	device_printf(sc->dev, "Watchdog reset!\n");
3876 
3877 	/*
3878 	 * check to see if the NIC rebooted.  If it did, then all of
3879 	 * PCI config space has been reset, and things like the
3880 	 * busmaster bit will be zero.  If this is the case, then we
3881 	 * must restore PCI config space before the NIC can be used
3882 	 * again
3883 	 */
3884 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3885 	if (cmd == 0xffff) {
3886 		/*
3887 		 * maybe the watchdog caught the NIC rebooting; wait
3888 		 * up to 100ms for it to finish.  If it does not come
3889 		 * back, then give up
3890 		 */
3891 		DELAY(1000*100);
3892 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3893 		if (cmd == 0xffff) {
3894 			device_printf(sc->dev, "NIC disappeared!\n");
3895 		}
3896 	}
3897 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3898 		/* print the reboot status */
3899 		reboot = mxge_read_reboot(sc);
3900 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3901 			      reboot);
3902 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3903 		if (running) {
3904 
3905 			/*
3906 			 * quiesce NIC so that TX routines will not try to
3907 			 * xmit after restoration of BAR
3908 			 */
3909 
3910 			/* Mark the link as down */
3911 			if (sc->link_state) {
3912 				sc->link_state = 0;
3913 				if_link_state_change(sc->ifp,
3914 						     LINK_STATE_DOWN);
3915 			}
3916 #ifdef IFNET_BUF_RING
3917 			num_tx_slices = sc->num_slices;
3918 #endif
3919 			/* grab all TX locks to ensure no tx */
3920 			for (s = 0; s < num_tx_slices; s++) {
3921 				ss = &sc->ss[s];
3922 				mtx_lock(&ss->tx.mtx);
3923 			}
3924 			mxge_close(sc, 1);
3925 		}
3926 		/* restore PCI configuration space */
3927 		dinfo = device_get_ivars(sc->dev);
3928 		pci_cfg_restore(sc->dev, dinfo);
3929 
3930 		/* and redo any changes we made to our config space */
3931 		mxge_setup_cfg_space(sc);
3932 
3933 		/* reload f/w */
3934 		err = mxge_load_firmware(sc, 0);
3935 		if (err) {
3936 			device_printf(sc->dev,
3937 				      "Unable to re-load f/w\n");
3938 		}
3939 		if (running) {
3940 			if (!err)
3941 				err = mxge_open(sc);
3942 			/* release all TX locks */
3943 			for (s = 0; s < num_tx_slices; s++) {
3944 				ss = &sc->ss[s];
3945 #ifdef IFNET_BUF_RING
3946 				mxge_start_locked(ss);
3947 #endif
3948 				mtx_unlock(&ss->tx.mtx);
3949 			}
3950 		}
3951 		sc->watchdog_resets++;
3952 	} else {
3953 		device_printf(sc->dev,
3954 			      "NIC did not reboot, not resetting\n");
3955 		err = 0;
3956 	}
3957 	if (err) {
3958 		device_printf(sc->dev, "watchdog reset failed\n");
3959 	} else {
3960 		if (sc->dying == 2)
3961 			sc->dying = 0;
3962 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3963 	}
3964 }
3965 
3966 static void
3967 mxge_watchdog_task(void *arg, int pending)
3968 {
3969 	mxge_softc_t *sc = arg;
3970 
3971 
3972 	mtx_lock(&sc->driver_mtx);
3973 	mxge_watchdog_reset(sc);
3974 	mtx_unlock(&sc->driver_mtx);
3975 }
3976 
3977 static void
3978 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3979 {
3980 	tx = &sc->ss[slice].tx;
3981 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3982 	device_printf(sc->dev,
3983 		      "tx.req=%d, tx.done=%d, tx.queue_active=%d\n",
3984 		      tx->req, tx->done, tx->queue_active);
3985 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3986 			      tx->activate, tx->deactivate);
3987 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3988 		      tx->pkt_done,
3989 		      be32toh(sc->ss->fw_stats->send_done_count));
3990 }
3991 
3992 static int
3993 mxge_watchdog(mxge_softc_t *sc)
3994 {
3995 	mxge_tx_ring_t *tx;
3996 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3997 	int i, err = 0;
3998 
3999 	/* see if we have outstanding transmits, which
4000 	   have been pending for more than mxge_ticks */
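	/*
	 * tx->watchdog_{req,done,rx_pause} are snapshots taken on the
	 * previous pass; a slice is considered stuck only if sends were
	 * pending then, are still pending now, and no completions have
	 * arrived in between (tx->done unchanged).
	 */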
4001 	for (i = 0;
4002 #ifdef IFNET_BUF_RING
4003 	     (i < sc->num_slices) && (err == 0);
4004 #else
4005 	     (i < 1) && (err == 0);
4006 #endif
4007 	     i++) {
4008 		tx = &sc->ss[i].tx;
4009 		if (tx->req != tx->done &&
4010 		    tx->watchdog_req != tx->watchdog_done &&
4011 		    tx->done == tx->watchdog_done) {
4012 			/* check for pause blocking before resetting */
4013 			if (tx->watchdog_rx_pause == rx_pause) {
4014 				mxge_warn_stuck(sc, tx, i);
4015 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4016 				return (ENXIO);
4017 			}
4018 			else
4019 				device_printf(sc->dev, "Flow control blocking "
4020 					      "xmits, check link partner\n");
4021 		}
4022 
4023 		tx->watchdog_req = tx->req;
4024 		tx->watchdog_done = tx->done;
4025 		tx->watchdog_rx_pause = rx_pause;
4026 	}
4027 
4028 	if (sc->need_media_probe)
4029 		mxge_media_probe(sc);
4030 	return (err);
4031 }
4032 
4033 static uint64_t
4034 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4035 {
4036 	struct mxge_softc *sc;
4037 	uint64_t rv;
4038 
4039 	sc = if_getsoftc(ifp);
4040 	rv = 0;
4041 
4042 	switch (cnt) {
4043 	case IFCOUNTER_IPACKETS:
4044 		for (int s = 0; s < sc->num_slices; s++)
4045 			rv += sc->ss[s].ipackets;
4046 		return (rv);
4047 	case IFCOUNTER_OPACKETS:
4048 		for (int s = 0; s < sc->num_slices; s++)
4049 			rv += sc->ss[s].opackets;
4050 		return (rv);
4051 	case IFCOUNTER_OERRORS:
4052 		for (int s = 0; s < sc->num_slices; s++)
4053 			rv += sc->ss[s].oerrors;
4054 		return (rv);
4055 #ifdef IFNET_BUF_RING
4056 	case IFCOUNTER_OBYTES:
4057 		for (int s = 0; s < sc->num_slices; s++)
4058 			rv += sc->ss[s].obytes;
4059 		return (rv);
4060 	case IFCOUNTER_OMCASTS:
4061 		for (int s = 0; s < sc->num_slices; s++)
4062 			rv += sc->ss[s].omcasts;
4063 		return (rv);
4064 	case IFCOUNTER_OQDROPS:
4065 		for (int s = 0; s < sc->num_slices; s++)
4066 			rv += sc->ss[s].tx.br->br_drops;
4067 		return (rv);
4068 #endif
4069 	default:
4070 		return (if_get_counter_default(ifp, cnt));
4071 	}
4072 }
4073 
4074 static void
4075 mxge_tick(void *arg)
4076 {
4077 	mxge_softc_t *sc = arg;
4078 	u_long pkts = 0;
4079 	int err = 0;
4080 	int running, ticks;
4081 	uint16_t cmd;
4082 
4083 	ticks = mxge_ticks;
4084 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4085 	if (running) {
4086 		if (!sc->watchdog_countdown) {
4087 			err = mxge_watchdog(sc);
4088 			sc->watchdog_countdown = 4;
4089 		}
4090 		sc->watchdog_countdown--;
4091 	}
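	/*
	 * Note that pkts is never updated here, so the h/w fault check
	 * and the 4x slower callout below currently run unconditionally;
	 * the watchdog proper only fires on every 4th tick, via
	 * watchdog_countdown.
	 */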
4092 	if (pkts == 0) {
4093 		/* ensure NIC did not suffer h/w fault while idle */
4094 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4095 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4096 			sc->dying = 2;
4097 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4098 			err = ENXIO;
4099 		}
4100 		/* look less often if NIC is idle */
4101 		ticks *= 4;
4102 	}
4103 
4104 	if (err == 0)
4105 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4106 
4107 }
4108 
4109 static int
4110 mxge_media_change(struct ifnet *ifp)
4111 {
4112 	return EINVAL;
4113 }
4114 
4115 static int
4116 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4117 {
4118 	struct ifnet *ifp = sc->ifp;
4119 	int real_mtu, old_mtu;
4120 	int err = 0;
4121 
4122 
4123 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4124 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4125 		return EINVAL;
4126 	mtx_lock(&sc->driver_mtx);
4127 	old_mtu = ifp->if_mtu;
4128 	ifp->if_mtu = mtu;
4129 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
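		/*
		 * The receive buffer geometry depends on the MTU (see
		 * mxge_choose_params() in mxge_open()), so a running
		 * interface must go through a full close/open cycle; if
		 * the reopen fails, fall back to the old MTU.
		 */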
4130 		mxge_close(sc, 0);
4131 		err = mxge_open(sc);
4132 		if (err != 0) {
4133 			ifp->if_mtu = old_mtu;
4134 			mxge_close(sc, 0);
4135 			(void) mxge_open(sc);
4136 		}
4137 	}
4138 	mtx_unlock(&sc->driver_mtx);
4139 	return err;
4140 }
4141 
4142 static void
4143 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4144 {
4145 	mxge_softc_t *sc = ifp->if_softc;
4146 
4147 
4148 	if (sc == NULL)
4149 		return;
4150 	ifmr->ifm_status = IFM_AVALID;
4151 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4152 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4153 	ifmr->ifm_active |= sc->current_media;
4154 }
4155 
4156 static int
4157 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4158 {
4159 	mxge_softc_t *sc = ifp->if_softc;
4160 	struct ifreq *ifr = (struct ifreq *)data;
4161 	int err, mask;
4162 
4163 	err = 0;
4164 	switch (command) {
4165 	case SIOCSIFADDR:
4166 	case SIOCGIFADDR:
4167 		err = ether_ioctl(ifp, command, data);
4168 		break;
4169 
4170 	case SIOCSIFMTU:
4171 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4172 		break;
4173 
4174 	case SIOCSIFFLAGS:
4175 		mtx_lock(&sc->driver_mtx);
4176 		if (sc->dying) {
4177 			mtx_unlock(&sc->driver_mtx);
4178 			return EINVAL;
4179 		}
4180 		if (ifp->if_flags & IFF_UP) {
4181 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4182 				err = mxge_open(sc);
4183 			} else {
4184 				/* take care of promisc and allmulti
4185 				   flag changes */
4186 				mxge_change_promisc(sc,
4187 						    ifp->if_flags & IFF_PROMISC);
4188 				mxge_set_multicast_list(sc);
4189 			}
4190 		} else {
4191 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4192 				mxge_close(sc, 0);
4193 			}
4194 		}
4195 		mtx_unlock(&sc->driver_mtx);
4196 		break;
4197 
4198 	case SIOCADDMULTI:
4199 	case SIOCDELMULTI:
4200 		mtx_lock(&sc->driver_mtx);
4201 		mxge_set_multicast_list(sc);
4202 		mtx_unlock(&sc->driver_mtx);
4203 		break;
4204 
4205 	case SIOCSIFCAP:
4206 		mtx_lock(&sc->driver_mtx);
4207 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
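		/*
		 * mask holds the capability bits whose requested state
		 * differs from the current state; each block below
		 * toggles one, keeping dependent features consistent
		 * (e.g. TSO requires tx checksum offload).
		 */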
4208 		if (mask & IFCAP_TXCSUM) {
4209 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4210 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4211 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4212 			} else {
4213 				ifp->if_capenable |= IFCAP_TXCSUM;
4214 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4215 			}
4216 		} else if (mask & IFCAP_RXCSUM) {
4217 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4218 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4219 			} else {
4220 				ifp->if_capenable |= IFCAP_RXCSUM;
4221 			}
4222 		}
4223 		if (mask & IFCAP_TSO4) {
4224 			if (IFCAP_TSO4 & ifp->if_capenable) {
4225 				ifp->if_capenable &= ~IFCAP_TSO4;
4226 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4227 				ifp->if_capenable |= IFCAP_TSO4;
4228 				ifp->if_hwassist |= CSUM_TSO;
4229 			} else {
4230 				printf("mxge requires tx checksum offload"
4231 				       " be enabled to use TSO\n");
4232 				err = EINVAL;
4233 			}
4234 		}
4235 #if IFCAP_TSO6
4236 		if (mask & IFCAP_TXCSUM_IPV6) {
4237 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4238 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4239 						       | IFCAP_TSO6);
4240 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4241 						      | CSUM_UDP);
4242 			} else {
4243 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4244 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4245 						     | CSUM_UDP_IPV6);
4246 			}
4247 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4248 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4249 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4250 			} else {
4251 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4252 			}
4253 		}
4254 		if (mask & IFCAP_TSO6) {
4255 			if (IFCAP_TSO6 & ifp->if_capenable) {
4256 				ifp->if_capenable &= ~IFCAP_TSO6;
4257 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4258 				ifp->if_capenable |= IFCAP_TSO6;
4259 				ifp->if_hwassist |= CSUM_TSO;
4260 			} else {
4261 				printf("mxge requires tx checksum offload"
4262 				       " be enabled to use TSO\n");
4263 				err = EINVAL;
4264 			}
4265 		}
4266 #endif /*IFCAP_TSO6 */
4267 
4268 		if (mask & IFCAP_LRO)
4269 			ifp->if_capenable ^= IFCAP_LRO;
4270 		if (mask & IFCAP_VLAN_HWTAGGING)
4271 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4272 		if (mask & IFCAP_VLAN_HWTSO)
4273 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4274 
4275 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4276 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4277 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4278 
4279 		mtx_unlock(&sc->driver_mtx);
4280 		VLAN_CAPABILITIES(ifp);
4281 
4282 		break;
4283 
4284 	case SIOCGIFMEDIA:
4285 		mtx_lock(&sc->driver_mtx);
4286 		mxge_media_probe(sc);
4287 		mtx_unlock(&sc->driver_mtx);
4288 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4289 				    &sc->media, command);
4290 		break;
4291 
4292 	default:
4293 		err = ENOTTY;
4294 	}
4295 	return err;
4296 }
4297 
4298 static void
4299 mxge_fetch_tunables(mxge_softc_t *sc)
4300 {
4301 
4302 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4303 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4304 			  &mxge_flow_control);
4305 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4306 			  &mxge_intr_coal_delay);
4307 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4308 			  &mxge_nvidia_ecrc_enable);
4309 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4310 			  &mxge_force_firmware);
4311 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4312 			  &mxge_deassert_wait);
4313 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4314 			  &mxge_verbose);
4315 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4316 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4317 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4318 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4319 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4320 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
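	/*
	 * All of the above are fetched from the kernel environment and
	 * so can be set at boot time, e.g. in /boot/loader.conf:
	 *
	 *	hw.mxge.max_slices=4
	 *	hw.mxge.intr_coal_delay=30
	 *
	 * (both rss_hash_type spellings feed the same variable).
	 */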
4321 
4322 	if (bootverbose)
4323 		mxge_verbose = 1;
4324 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4325 		mxge_intr_coal_delay = 30;
4326 	if (mxge_ticks == 0)
4327 		mxge_ticks = hz / 2;
4328 	sc->pause = mxge_flow_control;
4329 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4330 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4331 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4332 	}
4333 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4334 	    mxge_initial_mtu < ETHER_MIN_LEN)
4335 		mxge_initial_mtu = ETHERMTU_JUMBO;
4336 
4337 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4338 		mxge_throttle = MXGE_MAX_THROTTLE;
4339 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4340 		mxge_throttle = MXGE_MIN_THROTTLE;
4341 	sc->throttle = mxge_throttle;
4342 }
4343 
4344 
4345 static void
4346 mxge_free_slices(mxge_softc_t *sc)
4347 {
4348 	struct mxge_slice_state *ss;
4349 	int i;
4350 
4351 
4352 	if (sc->ss == NULL)
4353 		return;
4354 
4355 	for (i = 0; i < sc->num_slices; i++) {
4356 		ss = &sc->ss[i];
4357 		if (ss->fw_stats != NULL) {
4358 			mxge_dma_free(&ss->fw_stats_dma);
4359 			ss->fw_stats = NULL;
4360 #ifdef IFNET_BUF_RING
4361 			if (ss->tx.br != NULL) {
4362 				drbr_free(ss->tx.br, M_DEVBUF);
4363 				ss->tx.br = NULL;
4364 			}
4365 #endif
4366 			mtx_destroy(&ss->tx.mtx);
4367 		}
4368 		if (ss->rx_done.entry != NULL) {
4369 			mxge_dma_free(&ss->rx_done.dma);
4370 			ss->rx_done.entry = NULL;
4371 		}
4372 	}
4373 	free(sc->ss, M_DEVBUF);
4374 	sc->ss = NULL;
4375 }
4376 
4377 static int
4378 mxge_alloc_slices(mxge_softc_t *sc)
4379 {
4380 	mxge_cmd_t cmd;
4381 	struct mxge_slice_state *ss;
4382 	size_t bytes;
4383 	int err, i, max_intr_slots;
4384 
4385 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4386 	if (err != 0) {
4387 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4388 		return err;
4389 	}
4390 	sc->rx_ring_size = cmd.data0;
4391 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
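	/*
	 * The factor of two presumably accounts for the two receive
	 * rings per slice (small and big): the interrupt queue must be
	 * able to hold a completion for every posted receive buffer.
	 */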
4392 
4393 	bytes = sizeof (*sc->ss) * sc->num_slices;
4394 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4395 	if (sc->ss == NULL)
4396 		return (ENOMEM);
4397 	for (i = 0; i < sc->num_slices; i++) {
4398 		ss = &sc->ss[i];
4399 
4400 		ss->sc = sc;
4401 
4402 		/* allocate per-slice rx interrupt queues */
4403 
4404 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4405 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4406 		if (err != 0)
4407 			goto abort;
4408 		ss->rx_done.entry = ss->rx_done.dma.addr;
4409 		bzero(ss->rx_done.entry, bytes);
4410 
4411 		/*
4412 		 * allocate the per-slice firmware stats; stats
4413 		 * (including tx) are used only on the first
4414 		 * slice for now
4415 		 */
4416 #ifndef IFNET_BUF_RING
4417 		if (i > 0)
4418 			continue;
4419 #endif
4420 
4421 		bytes = sizeof (*ss->fw_stats);
4422 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4423 				     sizeof (*ss->fw_stats), 64);
4424 		if (err != 0)
4425 			goto abort;
4426 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4427 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4428 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4429 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4430 #ifdef IFNET_BUF_RING
4431 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4432 					   &ss->tx.mtx);
4433 #endif
4434 	}
4435 
4436 	return (0);
4437 
4438 abort:
4439 	mxge_free_slices(sc);
4440 	return (ENOMEM);
4441 }
4442 
4443 static void
4444 mxge_slice_probe(mxge_softc_t *sc)
4445 {
4446 	mxge_cmd_t cmd;
4447 	char *old_fw;
4448 	int msix_cnt, status, max_intr_slots;
4449 
4450 	sc->num_slices = 1;
4451 	/*
4452 	 *  don't enable multiple slices if they have been disabled
4453 	 *  by the hw.mxge.max_slices tunable, or if this is not an SMP system
4454 	 */
4455 
4456 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4457 		return;
4458 
4459 	/* see how many MSI-X interrupts are available */
4460 	msix_cnt = pci_msix_count(sc->dev);
4461 	if (msix_cnt < 2)
4462 		return;
4463 
4464 	/* now load the slice-aware firmware and see what it supports */
4465 	old_fw = sc->fw_name;
4466 	if (old_fw == mxge_fw_aligned)
4467 		sc->fw_name = mxge_fw_rss_aligned;
4468 	else
4469 		sc->fw_name = mxge_fw_rss_unaligned;
4470 	status = mxge_load_firmware(sc, 0);
4471 	if (status != 0) {
4472 		device_printf(sc->dev, "Falling back to a single slice\n");
4473 		return;
4474 	}
4475 
4476 	/* try to send a reset command to the card to see if it
4477 	   is alive */
4478 	memset(&cmd, 0, sizeof (cmd));
4479 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4480 	if (status != 0) {
4481 		device_printf(sc->dev, "failed reset\n");
4482 		goto abort_with_fw;
4483 	}
4484 
4485 	/* get rx ring size */
4486 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4487 	if (status != 0) {
4488 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4489 		goto abort_with_fw;
4490 	}
4491 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4492 
4493 	/* tell it the size of the interrupt queues */
4494 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4495 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4496 	if (status != 0) {
4497 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4498 		goto abort_with_fw;
4499 	}
4500 
4501 	/* ask for the maximum number of slices it supports */
4502 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4503 	if (status != 0) {
4504 		device_printf(sc->dev,
4505 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4506 		goto abort_with_fw;
4507 	}
4508 	sc->num_slices = cmd.data0;
4509 	if (sc->num_slices > msix_cnt)
4510 		sc->num_slices = msix_cnt;
4511 
4512 	if (mxge_max_slices == -1) {
4513 		/* cap to number of CPUs in system */
4514 		if (sc->num_slices > mp_ncpus)
4515 			sc->num_slices = mp_ncpus;
4516 	} else {
4517 		if (sc->num_slices > mxge_max_slices)
4518 			sc->num_slices = mxge_max_slices;
4519 	}
4520 	/* make sure it is a power of two */
4521 	while (sc->num_slices & (sc->num_slices - 1))
4522 		sc->num_slices--;
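	/*
	 * i.e. round num_slices down to the nearest power of two, one
	 * decrement at a time; presumably this lets the firmware derive
	 * the slice from the RSS hash with a simple mask.
	 */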
4523 
4524 	if (mxge_verbose)
4525 		device_printf(sc->dev, "using %d slices\n",
4526 			      sc->num_slices);
4527 
4528 	return;
4529 
4530 abort_with_fw:
4531 	sc->fw_name = old_fw;
4532 	(void) mxge_load_firmware(sc, 0);
4533 }
4534 
4535 static int
4536 mxge_add_msix_irqs(mxge_softc_t *sc)
4537 {
4538 	size_t bytes;
4539 	int count, err, i, rid;
4540 
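	/*
	 * The MSI-X table lives in BAR 2 on this NIC; the resource must
	 * be allocated (and stay allocated) before pci_alloc_msix() can
	 * program the table entries.
	 */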
4541 	rid = PCIR_BAR(2);
4542 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4543 						    &rid, RF_ACTIVE);
4544 
4545 	if (sc->msix_table_res == NULL) {
4546 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4547 		return ENXIO;
4548 	}
4549 
4550 	count = sc->num_slices;
4551 	err = pci_alloc_msix(sc->dev, &count);
4552 	if (err != 0) {
4553 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4554 			      "err = %d\n", sc->num_slices, err);
4555 		goto abort_with_msix_table;
4556 	}
4557 	if (count < sc->num_slices) {
4558 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4559 			      sc->num_slices, count);
4560 		device_printf(sc->dev,
4561 			      "Try setting hw.mxge.max_slices to %d\n",
4562 			      count);
4563 		err = ENOSPC;
4564 		goto abort_with_msix;
4565 	}
4566 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4567 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4568 	if (sc->msix_irq_res == NULL) {
4569 		err = ENOMEM;
4570 		goto abort_with_msix;
4571 	}
4572 
4573 	for (i = 0; i < sc->num_slices; i++) {
4574 		rid = i + 1;
4575 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4576 							  SYS_RES_IRQ,
4577 							  &rid, RF_ACTIVE);
4578 		if (sc->msix_irq_res[i] == NULL) {
4579 			device_printf(sc->dev, "couldn't allocate IRQ res"
4580 				      " for message %d\n", i);
4581 			err = ENXIO;
4582 			goto abort_with_res;
4583 		}
4584 	}
4585 
4586 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4587 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	/* M_NOWAIT allocation can fail; bail out cleanly if it does */
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4588 
4589 	for (i = 0; i < sc->num_slices; i++) {
4590 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4591 				     INTR_TYPE_NET | INTR_MPSAFE,
4592 #if __FreeBSD_version > 700030
4593 				     NULL,
4594 #endif
4595 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4596 		if (err != 0) {
4597 			device_printf(sc->dev, "couldn't setup intr for "
4598 				      "message %d\n", i);
4599 			goto abort_with_intr;
4600 		}
4601 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4602 				  sc->msix_ih[i], "s%d", i);
4603 	}
4604 
4605 	if (mxge_verbose) {
4606 		device_printf(sc->dev, "using %d msix IRQs:",
4607 			      sc->num_slices);
4608 		for (i = 0; i < sc->num_slices; i++)
4609 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4610 		printf("\n");
4611 	}
4612 	return (0);
4613 
4614 abort_with_intr:
4615 	for (i = 0; i < sc->num_slices; i++) {
4616 		if (sc->msix_ih[i] != NULL) {
4617 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4618 					  sc->msix_ih[i]);
4619 			sc->msix_ih[i] = NULL;
4620 		}
4621 	}
4622 	free(sc->msix_ih, M_DEVBUF);
4623 
4624 
4625 abort_with_res:
4626 	for (i = 0; i < sc->num_slices; i++) {
4627 		rid = i + 1;
4628 		if (sc->msix_irq_res[i] != NULL)
4629 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4630 					     sc->msix_irq_res[i]);
4631 		sc->msix_irq_res[i] = NULL;
4632 	}
4633 	free(sc->msix_irq_res, M_DEVBUF);
4634 
4635 
4636 abort_with_msix:
4637 	pci_release_msi(sc->dev);
4638 
4639 abort_with_msix_table:
4640 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4641 			     sc->msix_table_res);
4642 
4643 	return err;
4644 }
4645 
4646 static int
4647 mxge_add_single_irq(mxge_softc_t *sc)
4648 {
4649 	int count, err, rid;
4650 
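	/*
	 * Prefer a single MSI vector (SYS_RES_IRQ rid 1) when the
	 * device offers exactly one; otherwise fall back to the legacy
	 * INTx line, which is always rid 0.
	 */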
4651 	count = pci_msi_count(sc->dev);
4652 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4653 		rid = 1;
4654 	} else {
4655 		rid = 0;
4656 		sc->legacy_irq = 1;
4657 	}
4658 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4659 					     RF_SHAREABLE | RF_ACTIVE);
4660 	if (sc->irq_res == NULL) {
4661 		device_printf(sc->dev, "could not alloc interrupt\n");
4662 		return ENXIO;
4663 	}
4664 	if (mxge_verbose)
4665 		device_printf(sc->dev, "using %s irq %jd\n",
4666 			      sc->legacy_irq ? "INTx" : "MSI",
4667 			      rman_get_start(sc->irq_res));
4668 	err = bus_setup_intr(sc->dev, sc->irq_res,
4669 			     INTR_TYPE_NET | INTR_MPSAFE,
4670 #if __FreeBSD_version > 700030
4671 			     NULL,
4672 #endif
4673 			     mxge_intr, &sc->ss[0], &sc->ih);
4674 	if (err != 0) {
4675 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4676 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4677 		if (!sc->legacy_irq)
4678 			pci_release_msi(sc->dev);
4679 	}
4680 	return err;
4681 }
4682 
4683 static void
4684 mxge_rem_msix_irqs(mxge_softc_t *sc)
4685 {
4686 	int i, rid;
4687 
4688 	for (i = 0; i < sc->num_slices; i++) {
4689 		if (sc->msix_ih[i] != NULL) {
4690 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4691 					  sc->msix_ih[i]);
4692 			sc->msix_ih[i] = NULL;
4693 		}
4694 	}
4695 	free(sc->msix_ih, M_DEVBUF);
4696 
4697 	for (i = 0; i < sc->num_slices; i++) {
4698 		rid = i + 1;
4699 		if (sc->msix_irq_res[i] != NULL)
4700 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4701 					     sc->msix_irq_res[i]);
4702 		sc->msix_irq_res[i] = NULL;
4703 	}
4704 	free(sc->msix_irq_res, M_DEVBUF);
4705 
4706 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4707 			     sc->msix_table_res);
4708 
4709 	pci_release_msi(sc->dev);
4710 	return;
4711 }
4712 
4713 static void
4714 mxge_rem_single_irq(mxge_softc_t *sc)
4715 {
4716 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4717 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4718 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4719 	if (!sc->legacy_irq)
4720 		pci_release_msi(sc->dev);
4721 }
4722 
4723 static void
4724 mxge_rem_irq(mxge_softc_t *sc)
4725 {
4726 	if (sc->num_slices > 1)
4727 		mxge_rem_msix_irqs(sc);
4728 	else
4729 		mxge_rem_single_irq(sc);
4730 }
4731 
4732 static int
4733 mxge_add_irq(mxge_softc_t *sc)
4734 {
4735 	int err;
4736 
4737 	if (sc->num_slices > 1)
4738 		err = mxge_add_msix_irqs(sc);
4739 	else
4740 		err = mxge_add_single_irq(sc);
4741 
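	/*
	 * The branch below is compiled out (note the "0 &&"); it looks
	 * like a leftover debugging hook for exercising the MSI-X
	 * teardown/re-setup path.
	 */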
4742 	if (0 && err == 0 && sc->num_slices > 1) {
4743 		mxge_rem_msix_irqs(sc);
4744 		err = mxge_add_msix_irqs(sc);
4745 	}
4746 	return err;
4747 }
4748 
4749 
4750 static int
4751 mxge_attach(device_t dev)
4752 {
4753 	mxge_cmd_t cmd;
4754 	mxge_softc_t *sc = device_get_softc(dev);
4755 	struct ifnet *ifp;
4756 	int err, rid;
4757 
4758 	sc->dev = dev;
4759 	mxge_fetch_tunables(sc);
4760 
4761 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4762 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4763 				  taskqueue_thread_enqueue, &sc->tq);
4764 	if (sc->tq == NULL) {
4765 		err = ENOMEM;
4766 		goto abort_with_nothing;
4767 	}
4768 
4769 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4770 				 1,			/* alignment */
4771 				 0,			/* boundary */
4772 				 BUS_SPACE_MAXADDR,	/* low */
4773 				 BUS_SPACE_MAXADDR,	/* high */
4774 				 NULL, NULL,		/* filter */
4775 				 65536 + 256,		/* maxsize */
4776 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4777 				 65536,			/* maxsegsize */
4778 				 0,			/* flags */
4779 				 NULL, NULL,		/* lock */
4780 				 &sc->parent_dmat);	/* tag */
4781 
4782 	if (err != 0) {
4783 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4784 			      err);
4785 		goto abort_with_tq;
4786 	}
4787 
4788 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4789 	if (ifp == NULL) {
4790 		device_printf(dev, "can not if_alloc()\n");
4791 		err = ENOSPC;
4792 		goto abort_with_parent_dmat;
4793 	}
4794 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4795 
4796 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4797 		 device_get_nameunit(dev));
4798 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4799 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4800 		 "%s:drv", device_get_nameunit(dev));
4801 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4802 		 MTX_NETWORK_LOCK, MTX_DEF);
4803 
4804 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4805 
4806 	mxge_setup_cfg_space(sc);
4807 
4808 	/* Map the board into the kernel */
4809 	rid = PCIR_BARS;
4810 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4811 					     RF_ACTIVE);
4812 	if (sc->mem_res == NULL) {
4813 		device_printf(dev, "could not map memory\n");
4814 		err = ENXIO;
4815 		goto abort_with_lock;
4816 	}
4817 	sc->sram = rman_get_virtual(sc->mem_res);
4818 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
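	/*
	 * Total board SRAM is 2MB; the subtracted regions appear to be
	 * carve-outs reserved for the running firmware/MCP, leaving
	 * sram_size bytes of the BAR usable by the host.
	 */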
4819 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4820 		device_printf(dev, "impossible memory region size %jd\n",
4821 			      rman_get_size(sc->mem_res));
4822 		err = ENXIO;
4823 		goto abort_with_mem_res;
4824 	}
4825 
4826 	/* make NULL terminated copy of the EEPROM strings section of
4827 	   lanai SRAM */
4828 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4829 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4830 				rman_get_bushandle(sc->mem_res),
4831 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4832 				sc->eeprom_strings,
4833 				MXGE_EEPROM_STRINGS_SIZE - 2);
4834 	err = mxge_parse_strings(sc);
4835 	if (err != 0)
4836 		goto abort_with_mem_res;
4837 
4838 	/* Enable write combining for efficient use of PCIe bus */
4839 	mxge_enable_wc(sc);
4840 
4841 	/* Allocate the out of band dma memory */
4842 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4843 			     sizeof (mxge_cmd_t), 64);
4844 	if (err != 0)
4845 		goto abort_with_mem_res;
4846 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4847 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4848 	if (err != 0)
4849 		goto abort_with_cmd_dma;
4850 
4851 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4852 	if (err != 0)
4853 		goto abort_with_zeropad_dma;
4854 
4855 	/* select & load the firmware */
4856 	err = mxge_select_firmware(sc);
4857 	if (err != 0)
4858 		goto abort_with_dmabench;
4859 	sc->intr_coal_delay = mxge_intr_coal_delay;
4860 
4861 	mxge_slice_probe(sc);
4862 	err = mxge_alloc_slices(sc);
4863 	if (err != 0)
4864 		goto abort_with_dmabench;
4865 
4866 	err = mxge_reset(sc, 0);
4867 	if (err != 0)
4868 		goto abort_with_slices;
4869 
4870 	err = mxge_alloc_rings(sc);
4871 	if (err != 0) {
4872 		device_printf(sc->dev, "failed to allocate rings\n");
4873 		goto abort_with_slices;
4874 	}
4875 
4876 	err = mxge_add_irq(sc);
4877 	if (err != 0) {
4878 		device_printf(sc->dev, "failed to add irq\n");
4879 		goto abort_with_rings;
4880 	}
4881 
4882 	ifp->if_baudrate = IF_Gbps(10);
4883 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4884 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4885 		IFCAP_RXCSUM_IPV6;
4886 #if defined(INET) || defined(INET6)
4887 	ifp->if_capabilities |= IFCAP_LRO;
4888 #endif
4889 
4890 #ifdef MXGE_NEW_VLAN_API
4891 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4892 
4893 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4894 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4895 	    sc->fw_ver_tiny >= 32)
4896 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4897 #endif
4898 	sc->max_mtu = mxge_max_mtu(sc);
4899 	if (sc->max_mtu >= 9000)
4900 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4901 	else
4902 		device_printf(dev, "MTU limited to %d.  Install "
4903 			      "latest firmware for 9000 byte jumbo support\n",
4904 			      sc->max_mtu - ETHER_HDR_LEN);
4905 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4906 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4907 	/* check to see if f/w supports TSO for IPv6 */
4908 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4909 		if (CSUM_TCP_IPV6)
4910 			ifp->if_capabilities |= IFCAP_TSO6;
4911 		sc->max_tso6_hlen = min(cmd.data0,
4912 					sizeof (sc->ss[0].scratch));
4913 	}
4914 	ifp->if_capenable = ifp->if_capabilities;
4915 	if (sc->lro_cnt == 0)
4916 		ifp->if_capenable &= ~IFCAP_LRO;
4917 	ifp->if_init = mxge_init;
4918 	ifp->if_softc = sc;
4919 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4920 	ifp->if_ioctl = mxge_ioctl;
4921 	ifp->if_start = mxge_start;
4922 	ifp->if_get_counter = mxge_get_counter;
4923 	/* Initialise the ifmedia structure */
4924 	ifmedia_init(&sc->media, 0, mxge_media_change,
4925 		     mxge_media_status);
4926 	mxge_media_init(sc);
4927 	mxge_media_probe(sc);
4928 	sc->dying = 0;
4929 	ether_ifattach(ifp, sc->mac_addr);
4930 	/* ether_ifattach sets mtu to ETHERMTU */
4931 	if (mxge_initial_mtu != ETHERMTU)
4932 		mxge_change_mtu(sc, mxge_initial_mtu);
4933 
4934 	mxge_add_sysctls(sc);
4935 #ifdef IFNET_BUF_RING
4936 	ifp->if_transmit = mxge_transmit;
4937 	ifp->if_qflush = mxge_qflush;
4938 #endif
4939 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4940 				device_get_nameunit(sc->dev));
4941 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4942 	return 0;
4943 
4944 abort_with_rings:
4945 	mxge_free_rings(sc);
4946 abort_with_slices:
4947 	mxge_free_slices(sc);
4948 abort_with_dmabench:
4949 	mxge_dma_free(&sc->dmabench_dma);
4950 abort_with_zeropad_dma:
4951 	mxge_dma_free(&sc->zeropad_dma);
4952 abort_with_cmd_dma:
4953 	mxge_dma_free(&sc->cmd_dma);
4954 abort_with_mem_res:
4955 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4956 abort_with_lock:
4957 	pci_disable_busmaster(dev);
4958 	mtx_destroy(&sc->cmd_mtx);
4959 	mtx_destroy(&sc->driver_mtx);
4960 	if_free(ifp);
4961 abort_with_parent_dmat:
4962 	bus_dma_tag_destroy(sc->parent_dmat);
4963 abort_with_tq:
4964 	if (sc->tq != NULL) {
4965 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4966 		taskqueue_free(sc->tq);
4967 		sc->tq = NULL;
4968 	}
4969 abort_with_nothing:
4970 	return err;
4971 }
4972 
4973 static int
4974 mxge_detach(device_t dev)
4975 {
4976 	mxge_softc_t *sc = device_get_softc(dev);
4977 
4978 	if (mxge_vlans_active(sc)) {
4979 		device_printf(sc->dev,
4980 			      "Detach vlans before removing module\n");
4981 		return EBUSY;
4982 	}
4983 	mtx_lock(&sc->driver_mtx);
4984 	sc->dying = 1;
4985 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4986 		mxge_close(sc, 0);
4987 	mtx_unlock(&sc->driver_mtx);
4988 	ether_ifdetach(sc->ifp);
4989 	if (sc->tq != NULL) {
4990 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4991 		taskqueue_free(sc->tq);
4992 		sc->tq = NULL;
4993 	}
4994 	callout_drain(&sc->co_hdl);
4995 	ifmedia_removeall(&sc->media);
4996 	mxge_dummy_rdma(sc, 0);
4997 	mxge_rem_sysctls(sc);
4998 	mxge_rem_irq(sc);
4999 	mxge_free_rings(sc);
5000 	mxge_free_slices(sc);
5001 	mxge_dma_free(&sc->dmabench_dma);
5002 	mxge_dma_free(&sc->zeropad_dma);
5003 	mxge_dma_free(&sc->cmd_dma);
5004 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5005 	pci_disable_busmaster(dev);
5006 	mtx_destroy(&sc->cmd_mtx);
5007 	mtx_destroy(&sc->driver_mtx);
5008 	if_free(sc->ifp);
5009 	bus_dma_tag_destroy(sc->parent_dmat);
5010 	return 0;
5011 }
5012 
5013 static int
5014 mxge_shutdown(device_t dev)
5015 {
5016 	return 0;
5017 }
5018 
5019 /*
5020   This file uses Myri10GE driver indentation.
5021 
5022   Local Variables:
5023   c-file-style:"linux"
5024   tab-width:8
5025   End:
5026 */
5027