xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 3cc3c1eb7961cf93306e0a0f79f3e2309f5b7011)
1 /******************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 
4 Copyright (c) 2006-2013, Myricom Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/sockio.h>
40 #include <sys/mbuf.h>
41 #include <sys/malloc.h>
42 #include <sys/kdb.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/module.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/sx.h>
49 #include <sys/taskqueue.h>
50 #include <contrib/zlib/zlib.h>
51 #include <dev/zlib/zcalloc.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 #include <net/if_arp.h>
56 #include <net/ethernet.h>
57 #include <net/if_dl.h>
58 #include <net/if_media.h>
59 
60 #include <net/bpf.h>
61 
62 #include <net/if_types.h>
63 #include <net/if_vlan_var.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip6.h>
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_lro.h>
71 #include <netinet6/ip6_var.h>
72 
73 #include <machine/bus.h>
74 #include <machine/in_cksum.h>
75 #include <machine/resource.h>
76 #include <sys/bus.h>
77 #include <sys/rman.h>
78 #include <sys/smp.h>
79 
80 #include <dev/pci/pcireg.h>
81 #include <dev/pci/pcivar.h>
82 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
83 
84 #include <vm/vm.h>		/* for pmap_mapdev() */
85 #include <vm/pmap.h>
86 
87 #if defined(__i386) || defined(__amd64)
88 #include <machine/specialreg.h>
89 #endif
90 
91 #include <dev/mxge/mxge_mcp.h>
92 #include <dev/mxge/mcp_gen_header.h>
93 /*#define MXGE_FAKE_IFP*/
94 #include <dev/mxge/if_mxge_var.h>
95 #include <sys/buf_ring.h>
96 
97 #include "opt_inet.h"
98 #include "opt_inet6.h"
99 
100 /* tunable params */
101 static int mxge_nvidia_ecrc_enable = 1;
102 static int mxge_force_firmware = 0;
103 static int mxge_intr_coal_delay = 30;
104 static int mxge_deassert_wait = 1;
105 static int mxge_flow_control = 1;
106 static int mxge_verbose = 0;
107 static int mxge_ticks;
108 static int mxge_max_slices = 1;
109 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
110 static int mxge_always_promisc = 0;
111 static int mxge_initial_mtu = ETHERMTU_JUMBO;
112 static int mxge_throttle = 0;
113 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
114 static char *mxge_fw_aligned = "mxge_eth_z8e";
115 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
116 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
117 
118 static int mxge_probe(device_t dev);
119 static int mxge_attach(device_t dev);
120 static int mxge_detach(device_t dev);
121 static int mxge_shutdown(device_t dev);
122 static void mxge_intr(void *arg);
123 
124 static device_method_t mxge_methods[] =
125 {
126   /* Device interface */
127   DEVMETHOD(device_probe, mxge_probe),
128   DEVMETHOD(device_attach, mxge_attach),
129   DEVMETHOD(device_detach, mxge_detach),
130   DEVMETHOD(device_shutdown, mxge_shutdown),
131 
132   DEVMETHOD_END
133 };
134 
135 static driver_t mxge_driver =
136 {
137   "mxge",
138   mxge_methods,
139   sizeof(mxge_softc_t),
140 };
141 
142 /* Declare ourselves to be a child of the PCI bus.*/
143 DRIVER_MODULE(mxge, pci, mxge_driver, 0, 0);
144 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
145 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
146 
147 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
148 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
149 static int mxge_close(mxge_softc_t *sc, int down);
150 static int mxge_open(mxge_softc_t *sc);
151 static void mxge_tick(void *arg);
152 
153 static int
154 mxge_probe(device_t dev)
155 {
156 	int rev;
157 
158 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
159 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
160 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
161 		rev = pci_get_revid(dev);
162 		switch (rev) {
163 		case MXGE_PCI_REV_Z8E:
164 			device_set_desc(dev, "Myri10G-PCIE-8A");
165 			break;
166 		case MXGE_PCI_REV_Z8ES:
167 			device_set_desc(dev, "Myri10G-PCIE-8B");
168 			break;
169 		default:
170 			device_set_desc(dev, "Myri10G-PCIE-8??");
171 			device_printf(dev, "Unrecognized rev %d NIC\n",
172 				      rev);
173 			break;
174 		}
175 		return 0;
176 	}
177 	return ENXIO;
178 }
179 
180 static void
181 mxge_enable_wc(mxge_softc_t *sc)
182 {
183 #if defined(__i386) || defined(__amd64)
184 	vm_offset_t len;
185 	int err;
186 
187 	sc->wc = 1;
188 	len = rman_get_size(sc->mem_res);
189 	err = pmap_change_attr((vm_offset_t) sc->sram,
190 			       len, PAT_WRITE_COMBINING);
191 	if (err != 0) {
192 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 			      err);
194 		sc->wc = 0;
195 	}
196 #endif
197 }
198 
199 /* callback to get our DMA address */
200 static void
201 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
202 			 int error)
203 {
204 	if (error == 0) {
205 		*(bus_addr_t *) arg = segs->ds_addr;
206 	}
207 }
208 
209 static int
210 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
211 		   bus_size_t alignment)
212 {
213 	int err;
214 	device_t dev = sc->dev;
215 	bus_size_t boundary, maxsegsize;
216 
217 	if (bytes > 4096 && alignment == 4096) {
218 		boundary = 0;
219 		maxsegsize = bytes;
220 	} else {
221 		boundary = 4096;
222 		maxsegsize = 4096;
223 	}
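	/*
	 * Note (a derived observation, not from the original comments):
	 * with the settings above, a request either fits entirely within
	 * one 4KB region (boundary = 4096) or is itself a 4KB-aligned
	 * block larger than a page (boundary = 0, i.e. unrestricted), so
	 * a single DMA segment never straddles a 4KB boundary
	 * unexpectedly.
	 */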
224 
225 	/* allocate DMAable memory tags */
226 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
227 				 alignment,		/* alignment */
228 				 boundary,		/* boundary */
229 				 BUS_SPACE_MAXADDR,	/* low */
230 				 BUS_SPACE_MAXADDR,	/* high */
231 				 NULL, NULL,		/* filter */
232 				 bytes,			/* maxsize */
233 				 1,			/* num segs */
234 				 maxsegsize,		/* maxsegsize */
235 				 BUS_DMA_COHERENT,	/* flags */
236 				 NULL, NULL,		/* lock */
237 				 &dma->dmat);		/* tag */
238 	if (err != 0) {
239 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
240 		return err;
241 	}
242 
243 	/* allocate DMAable memory & map */
244 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
245 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
246 				| BUS_DMA_ZERO),  &dma->map);
247 	if (err != 0) {
248 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
249 		goto abort_with_dmat;
250 	}
251 
252 	/* load the memory */
253 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
254 			      mxge_dmamap_callback,
255 			      (void *)&dma->bus_addr, 0);
256 	if (err != 0) {
257 		device_printf(dev, "couldn't load map (err = %d)\n", err);
258 		goto abort_with_mem;
259 	}
260 	return 0;
261 
262 abort_with_mem:
263 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
264 abort_with_dmat:
265 	(void)bus_dma_tag_destroy(dma->dmat);
266 	return err;
267 }
268 
269 static void
270 mxge_dma_free(mxge_dma_t *dma)
271 {
272 	bus_dmamap_unload(dma->dmat, dma->map);
273 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
274 	(void)bus_dma_tag_destroy(dma->dmat);
275 }
276 
277 /*
278  * The eeprom strings on the lanaiX have the format
279  * SN=x\0
280  * MAC=x:x:x:x:x:x\0
281  * PC=text\0
282  */
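/*
 * For example (illustrative values only), the string block might look
 * like "SN=123456\0MAC=00:60:dd:43:a5:12\0PC=10G-PCIE-8A-C\0\0", i.e.
 * a sequence of NUL-terminated strings ending in an empty string.
 */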
283 
284 static int
285 mxge_parse_strings(mxge_softc_t *sc)
286 {
287 	char *ptr;
288 	int i, found_mac, found_sn2;
289 	char *endptr;
290 
291 	ptr = sc->eeprom_strings;
292 	found_mac = 0;
293 	found_sn2 = 0;
294 	while (*ptr != '\0') {
295 		if (strncmp(ptr, "MAC=", 4) == 0) {
296 			ptr += 4;
297 			for (i = 0;;) {
298 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
299 				if (endptr - ptr != 2)
300 					goto abort;
301 				ptr = endptr;
302 				if (++i == 6)
303 					break;
304 				if (*ptr++ != ':')
305 					goto abort;
306 			}
307 			found_mac = 1;
308 		} else if (strncmp(ptr, "PC=", 3) == 0) {
309 			ptr += 3;
310 			strlcpy(sc->product_code_string, ptr,
311 			    sizeof(sc->product_code_string));
312 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
313 			ptr += 3;
314 			strlcpy(sc->serial_number_string, ptr,
315 			    sizeof(sc->serial_number_string));
316 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
317 			/* SN2 takes precedence over SN */
318 			ptr += 4;
319 			found_sn2 = 1;
320 			strlcpy(sc->serial_number_string, ptr,
321 			    sizeof(sc->serial_number_string));
322 		}
323 		while (*ptr++ != '\0') {}
324 	}
325 
326 	if (found_mac)
327 		return 0;
328 
329  abort:
330 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
331 
332 	return ENXIO;
333 }
334 
335 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
336 static void
337 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
338 {
339 	uint32_t val;
340 	unsigned long base, off;
341 	char *va, *cfgptr;
342 	device_t pdev, mcp55;
343 	uint16_t vendor_id, device_id, word;
344 	uintptr_t bus, slot, func, ivend, idev;
345 	uint32_t *ptr32;
346 
347 	if (!mxge_nvidia_ecrc_enable)
348 		return;
349 
350 	pdev = device_get_parent(device_get_parent(sc->dev));
351 	if (pdev == NULL) {
352 		device_printf(sc->dev, "could not find parent?\n");
353 		return;
354 	}
355 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
356 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
357 
358 	if (vendor_id != 0x10de)
359 		return;
360 
361 	base = 0;
362 
363 	if (device_id == 0x005d) {
364 		/* ck804, base address is magic */
365 		base = 0xe0000000UL;
366 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
367 		/* mcp55, base address stored in chipset */
368 		mcp55 = pci_find_bsf(0, 0, 0);
369 		if (mcp55 &&
370 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
371 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
372 			word = pci_read_config(mcp55, 0x90, 2);
373 			base = ((unsigned long)word & 0x7ffeU) << 25;
374 		}
375 	}
376 	if (!base)
377 		return;
378 
379 	/* XXXX
380 	   The test below is commented out because it is believed that
381 	   doing a config read/write beyond 0xff will access the config
382 	   space of the next higher function.  Uncomment this and remove
383 	   the hacky pmap_mapdev() way of accessing config space once
384 	   FreeBSD grows support for extended PCIe config space access.
385 	*/
386 #if 0
387 	/* See if we can, by some miracle, access the extended
388 	   config space */
389 	val = pci_read_config(pdev, 0x178, 4);
390 	if (val != 0xffffffff) {
391 		val |= 0x40;
392 		pci_write_config(pdev, 0x178, val, 4);
393 		return;
394 	}
395 #endif
396 	/* Rather than using normal pci config space writes, we must
397 	 * map the Nvidia config space ourselves.  This is because on
398 	 * opteron/nvidia class machines the 0xe0000000 mapping is
399 	 * handled by the nvidia chipset; that means the internal PCI
400 	 * device (the on-chip northbridge), or the amd-8131 bridge,
401 	 * and things behind them are not visible via this method.
402 	 */
403 
404 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
405 		      PCI_IVAR_BUS, &bus);
406 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
407 		      PCI_IVAR_SLOT, &slot);
408 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
409 		      PCI_IVAR_FUNCTION, &func);
410 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
411 		      PCI_IVAR_VENDOR, &ivend);
412 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
413 		      PCI_IVAR_DEVICE, &idev);
414 
415 	off =  base
416 		+ 0x00100000UL * (unsigned long)bus
417 		+ 0x00001000UL * (unsigned long)(func
418 						 + 8 * slot);
419 
420 	/* map it into the kernel */
421 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
422 
423 	if (va == NULL) {
424 		device_printf(sc->dev, "pmap_mapdev() failed\n");
425 		return;
426 	}
427 	/* get a pointer to the config space mapped into the kernel */
428 	cfgptr = va + (off & PAGE_MASK);
429 
430 	/* make sure that we can really access it */
431 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
432 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
433 	if (! (vendor_id == ivend && device_id == idev)) {
434 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
435 			      vendor_id, device_id);
436 		pmap_unmapdev(va, PAGE_SIZE);
437 		return;
438 	}
439 
440 	ptr32 = (uint32_t*)(cfgptr + 0x178);
441 	val = *ptr32;
442 
443 	if (val == 0xffffffff) {
444 		device_printf(sc->dev, "extended mapping failed\n");
445 		pmap_unmapdev(va, PAGE_SIZE);
446 		return;
447 	}
448 	*ptr32 = val | 0x40;
449 	pmap_unmapdev(va, PAGE_SIZE);
450 	if (mxge_verbose)
451 		device_printf(sc->dev,
452 			      "Enabled ECRC on upstream Nvidia bridge "
453 			      "at %d:%d:%d\n",
454 			      (int)bus, (int)slot, (int)func);
455 	return;
456 }
457 #else
458 static void
459 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
460 {
461 	device_printf(sc->dev,
462 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
463 	return;
464 }
465 #endif
466 
467 static int
468 mxge_dma_test(mxge_softc_t *sc, int test_type)
469 {
470 	mxge_cmd_t cmd;
471 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
472 	int status;
473 	uint32_t len;
474 	char *test = " ";
475 
476 	/* Run a small DMA test.
477 	 * The magic multipliers to the length tell the firmware
478 	 * to do DMA read, write, or read+write tests.  The
479 	 * results are returned in cmd.data0.  The upper 16
480 	 * bits of the return is the number of transfers completed.
481 	 * The lower 16 bits is the time in 0.5us ticks that the
482 	 * transfers took to complete.
483 	 */
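	/*
	 * Working out the units: (cmd.data0 >> 16) transfers of "len"
	 * bytes complete in (cmd.data0 & 0xffff) half-microsecond ticks,
	 * so the throughput in bytes/us (= MB/s) comes out to
	 *
	 *   (transfers * len) / (ticks * 0.5) = (transfers * len * 2) / ticks
	 *
	 * which matches the read_dma/write_dma computations below.  The
	 * read/write test moves each byte twice, hence its extra factor
	 * of 2.
	 */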
484 
485 	len = sc->tx_boundary;
486 
487 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
488 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
489 	cmd.data2 = len * 0x10000;
490 	status = mxge_send_cmd(sc, test_type, &cmd);
491 	if (status != 0) {
492 		test = "read";
493 		goto abort;
494 	}
495 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
496 		(cmd.data0 & 0xffff);
497 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
498 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
499 	cmd.data2 = len * 0x1;
500 	status = mxge_send_cmd(sc, test_type, &cmd);
501 	if (status != 0) {
502 		test = "write";
503 		goto abort;
504 	}
505 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
506 		(cmd.data0 & 0xffff);
507 
508 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
509 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
510 	cmd.data2 = len * 0x10001;
511 	status = mxge_send_cmd(sc, test_type, &cmd);
512 	if (status != 0) {
513 		test = "read/write";
514 		goto abort;
515 	}
516 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
517 		(cmd.data0 & 0xffff);
518 
519 abort:
520 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
521 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
522 			      test, status);
523 
524 	return status;
525 }
526 
527 /*
528  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
529  * when the PCI-E Completion packets are aligned on an 8-byte
530  * boundary.  Some PCI-E chip sets always align Completion packets; on
531  * the ones that do not, the alignment can be enforced by enabling
532  * ECRC generation (if supported).
533  *
534  * When PCI-E Completion packets are not aligned, it is actually more
535  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
536  *
537  * If the driver can neither enable ECRC nor verify that it has
538  * already been enabled, then it must use a firmware image which works
539  * around unaligned completion packets (ethp_z8e.dat), and it should
540  * also ensure that it never gives the device a Read-DMA which is
541  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
542  * enabled, then the driver should use the aligned (eth_z8e.dat)
543  * firmware image, and set tx_boundary to 4KB.
544  */
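/*
 * In short (a summary of the policy above):
 *
 *   completions aligned (or ECRC on):  eth_z8e,  tx_boundary = 4096
 *   completions unaligned:             ethp_z8e, tx_boundary = 2048
 */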
545 
546 static int
547 mxge_firmware_probe(mxge_softc_t *sc)
548 {
549 	device_t dev = sc->dev;
550 	int reg, status;
551 	uint16_t pectl;
552 
553 	sc->tx_boundary = 4096;
554 	/*
555 	 * Verify the max read request size was set to 4KB
556 	 * before trying the test with 4KB.
557 	 */
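	/*
	 * Bits 14:12 of the PCIe Device Control register (capability
	 * offset + 0x8) encode Max_Read_Request_Size as 128 << n, so the
	 * value 5 tested below corresponds to 4096 bytes.
	 */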
558 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
559 		pectl = pci_read_config(dev, reg + 0x8, 2);
560 		if ((pectl & (5 << 12)) != (5 << 12)) {
561 		device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
562 				      pectl);
563 			sc->tx_boundary = 2048;
564 		}
565 	}
566 
567 	/*
568 	 * load the optimized firmware (which assumes aligned PCIe
569 	 * completions) in order to see if it works on this host.
570 	 */
571 	sc->fw_name = mxge_fw_aligned;
572 	status = mxge_load_firmware(sc, 1);
573 	if (status != 0) {
574 		return status;
575 	}
576 
577 	/*
578 	 * Enable ECRC if possible
579 	 */
580 	mxge_enable_nvidia_ecrc(sc);
581 
582 	/*
583 	 * Run a DMA test which watches for unaligned completions and
584 	 * aborts on the first one seen.  Not required on Z8ES or newer.
585 	 */
586 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
587 		return 0;
588 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
589 	if (status == 0)
590 		return 0; /* keep the aligned firmware */
591 
592 	if (status != E2BIG)
593 		device_printf(dev, "DMA test failed: %d\n", status);
594 	if (status == ENOSYS)
595 		device_printf(dev, "Falling back to ethp! "
596 			      "Please install up-to-date firmware\n");
597 	return status;
598 }
599 
600 static int
601 mxge_select_firmware(mxge_softc_t *sc)
602 {
603 	int aligned = 0;
604 	int force_firmware = mxge_force_firmware;
605 
606 	if (sc->throttle)
607 		force_firmware = sc->throttle;
608 
609 	if (force_firmware != 0) {
610 		if (force_firmware == 1)
611 			aligned = 1;
612 		else
613 			aligned = 0;
614 		if (mxge_verbose)
615 			device_printf(sc->dev,
616 				      "Assuming %s completions (forced)\n",
617 				      aligned ? "aligned" : "unaligned");
618 		goto abort;
619 	}
620 
621 	/* if the PCIe link width is 4 or less, we can use the aligned
622 	   firmware and skip any checks */
623 	if (sc->link_width != 0 && sc->link_width <= 4) {
624 		device_printf(sc->dev,
625 			      "PCIe x%d Link, expect reduced performance\n",
626 			      sc->link_width);
627 		aligned = 1;
628 		goto abort;
629 	}
630 
631 	if (0 == mxge_firmware_probe(sc))
632 		return 0;
633 
634 abort:
635 	if (aligned) {
636 		sc->fw_name = mxge_fw_aligned;
637 		sc->tx_boundary = 4096;
638 	} else {
639 		sc->fw_name = mxge_fw_unaligned;
640 		sc->tx_boundary = 2048;
641 	}
642 	return (mxge_load_firmware(sc, 0));
643 }
644 
645 static int
646 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
647 {
648 
649 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
650 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
651 			      be32toh(hdr->mcp_type));
652 		return EIO;
653 	}
654 
655 	/* save firmware version for sysctl */
656 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
657 	if (mxge_verbose)
658 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
659 
660 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
661 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
662 
663 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
664 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
665 		device_printf(sc->dev, "Found firmware version %s\n",
666 			      sc->fw_version);
667 		device_printf(sc->dev, "Driver needs %d.%d\n",
668 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
669 		return EINVAL;
670 	}
671 	return 0;
672 
673 }
674 
675 static int
676 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
677 {
678 	z_stream zs;
679 	char *inflate_buffer;
680 	const struct firmware *fw;
681 	const mcp_gen_header_t *hdr;
682 	unsigned hdr_offset;
683 	int status;
684 	unsigned int i;
685 	size_t fw_len;
686 
687 	fw = firmware_get(sc->fw_name);
688 	if (fw == NULL) {
689 		device_printf(sc->dev, "Could not find firmware image %s\n",
690 			      sc->fw_name);
691 		return ENOENT;
692 	}
693 
694 	/* setup zlib and decompress f/w */
695 	bzero(&zs, sizeof (zs));
696 	zs.zalloc = zcalloc_nowait;
697 	zs.zfree = zcfree;
698 	status = inflateInit(&zs);
699 	if (status != Z_OK) {
700 		status = EIO;
701 		goto abort_with_fw;
702 	}
703 
704 	/* the uncompressed size is stored as the firmware version,
705 	   which would otherwise go unused */
706 	fw_len = (size_t) fw->version;
707 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
708 	if (inflate_buffer == NULL)
709 		goto abort_with_zs;
710 	zs.avail_in = fw->datasize;
711 	zs.next_in = __DECONST(char *, fw->data);
712 	zs.avail_out = fw_len;
713 	zs.next_out = inflate_buffer;
714 	status = inflate(&zs, Z_FINISH);
715 	if (status != Z_STREAM_END) {
716 		device_printf(sc->dev, "zlib %d\n", status);
717 		status = EIO;
718 		goto abort_with_buffer;
719 	}
720 
721 	/* check id */
722 	hdr_offset = htobe32(*(const uint32_t *)
723 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
724 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
725 		device_printf(sc->dev, "Bad firmware file\n");
726 		status = EIO;
727 		goto abort_with_buffer;
728 	}
729 	hdr = (const void*)(inflate_buffer + hdr_offset);
730 
731 	status = mxge_validate_firmware(sc, hdr);
732 	if (status != 0)
733 		goto abort_with_buffer;
734 
735 	/* Copy the inflated firmware to NIC SRAM. */
736 	for (i = 0; i < fw_len; i += 256) {
737 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
738 			      inflate_buffer + i,
739 			      min(256U, (unsigned)(fw_len - i)));
740 		wmb();
741 		(void)*sc->sram;
742 		wmb();
743 	}
744 
745 	*limit = fw_len;
746 	status = 0;
747 abort_with_buffer:
748 	free(inflate_buffer, M_TEMP);
749 abort_with_zs:
750 	inflateEnd(&zs);
751 abort_with_fw:
752 	firmware_put(fw, FIRMWARE_UNLOAD);
753 	return status;
754 }
755 
756 /*
757  * Enable or disable periodic RDMAs from the host to make certain
758  * chipsets resend dropped PCIe messages
759  */
760 
761 static void
762 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
763 {
764 	char buf_bytes[72];
765 	volatile uint32_t *confirm;
766 	volatile char *submit;
767 	uint32_t *buf, dma_low, dma_high;
768 	int i;
769 
770 	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
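	/*
	 * The expression above rounds the address up to the next multiple
	 * of 8: e.g. (illustrative) 0x...b + 7 = 0x...12, and masking off
	 * the low three bits yields 0x...10.  The same idiom is used
	 * wherever an 8-byte-aligned command buffer is carved out of
	 * stack storage in this driver.
	 */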
771 
772 	/* clear confirmation addr */
773 	confirm = (volatile uint32_t *)sc->cmd;
774 	*confirm = 0;
775 	wmb();
776 
777 	/* send an rdma command to the PCIe engine, and wait for the
778 	   response in the confirmation address.  The firmware should
779 	   write a -1 there to indicate it is alive and well
780 	*/
781 
782 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
783 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
784 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
785 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
786 	buf[2] = htobe32(0xffffffff);		/* confirm data */
787 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
788 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
789 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
790 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
791 	buf[5] = htobe32(enable);			/* enable? */
792 
793 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
794 
795 	mxge_pio_copy(submit, buf, 64);
796 	wmb();
797 	DELAY(1000);
798 	wmb();
799 	i = 0;
800 	while (*confirm != 0xffffffff && i < 20) {
801 		DELAY(1000);
802 		i++;
803 	}
804 	if (*confirm != 0xffffffff) {
805 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
806 			      (enable ? "enable" : "disable"), confirm,
807 			      *confirm);
808 	}
809 	return;
810 }
811 
812 static int
813 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
814 {
815 	mcp_cmd_t *buf;
816 	char buf_bytes[sizeof(*buf) + 8];
817 	volatile mcp_cmd_response_t *response = sc->cmd;
818 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
819 	uint32_t dma_low, dma_high;
820 	int err, sleep_total = 0;
821 
822 	/* ensure buf is aligned to 8 bytes */
823 	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
824 
825 	buf->data0 = htobe32(data->data0);
826 	buf->data1 = htobe32(data->data1);
827 	buf->data2 = htobe32(data->data2);
828 	buf->cmd = htobe32(cmd);
829 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
830 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
831 
832 	buf->response_addr.low = htobe32(dma_low);
833 	buf->response_addr.high = htobe32(dma_high);
834 	mtx_lock(&sc->cmd_mtx);
835 	response->result = 0xffffffff;
836 	wmb();
837 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
838 
839 	/* wait up to 20ms */
840 	err = EAGAIN;
841 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
842 		bus_dmamap_sync(sc->cmd_dma.dmat,
843 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
844 		wmb();
845 		switch (be32toh(response->result)) {
846 		case 0:
847 			data->data0 = be32toh(response->data);
848 			err = 0;
849 			break;
850 		case 0xffffffff:
851 			DELAY(1000);
852 			break;
853 		case MXGEFW_CMD_UNKNOWN:
854 			err = ENOSYS;
855 			break;
856 		case MXGEFW_CMD_ERROR_UNALIGNED:
857 			err = E2BIG;
858 			break;
859 		case MXGEFW_CMD_ERROR_BUSY:
860 			err = EBUSY;
861 			break;
862 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
863 			err = ENXIO;
864 			break;
865 		default:
866 			device_printf(sc->dev,
867 				      "mxge: command %d "
868 				      "failed, result = %d\n",
869 				      cmd, be32toh(response->result));
870 			err = ENXIO;
871 			break;
872 		}
873 		if (err != EAGAIN)
874 			break;
875 	}
876 	if (err == EAGAIN)
877 		device_printf(sc->dev, "mxge: command %d timed out, "
878 			      "result = %d\n",
879 			      cmd, be32toh(response->result));
880 	mtx_unlock(&sc->cmd_mtx);
881 	return err;
882 }
883 
884 static int
885 mxge_adopt_running_firmware(mxge_softc_t *sc)
886 {
887 	struct mcp_gen_header *hdr;
888 	const size_t bytes = sizeof (struct mcp_gen_header);
889 	size_t hdr_offset;
890 	int status;
891 
892 	/* find running firmware header */
893 	hdr_offset = htobe32(*(volatile uint32_t *)
894 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
895 
896 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
897 		device_printf(sc->dev,
898 			      "Running firmware has bad header offset (%d)\n",
899 			      (int)hdr_offset);
900 		return EIO;
901 	}
902 
903 	/* copy header of running firmware from SRAM to host memory to
904 	 * validate firmware */
905 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
906 	if (hdr == NULL) {
907 		device_printf(sc->dev, "could not malloc firmware hdr\n");
908 		return ENOMEM;
909 	}
910 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
911 				rman_get_bushandle(sc->mem_res),
912 				hdr_offset, (char *)hdr, bytes);
913 	status = mxge_validate_firmware(sc, hdr);
914 	free(hdr, M_DEVBUF);
915 
916 	/*
917 	 * check to see if adopted firmware has bug where adopting
918 	 * it will cause broadcasts to be filtered unless the NIC
919 	 * is kept in ALLMULTI mode
920 	 */
921 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
922 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
923 		sc->adopted_rx_filter_bug = 1;
924 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
925 			      "working around rx filter bug\n",
926 			      sc->fw_ver_major, sc->fw_ver_minor,
927 			      sc->fw_ver_tiny);
928 	}
929 
930 	return status;
931 }
932 
933 static int
934 mxge_load_firmware(mxge_softc_t *sc, int adopt)
935 {
936 	volatile uint32_t *confirm;
937 	volatile char *submit;
938 	char buf_bytes[72];
939 	uint32_t *buf, size, dma_low, dma_high;
940 	int status, i;
941 
942 	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
943 
944 	size = sc->sram_size;
945 	status = mxge_load_firmware_helper(sc, &size);
946 	if (status) {
947 		if (!adopt)
948 			return status;
949 		/* Try to use the currently running firmware, if
950 		   it is new enough */
951 		status = mxge_adopt_running_firmware(sc);
952 		if (status) {
953 			device_printf(sc->dev,
954 				      "failed to adopt running firmware\n");
955 			return status;
956 		}
957 		device_printf(sc->dev,
958 			      "Successfully adopted running firmware\n");
959 		if (sc->tx_boundary == 4096) {
960 			device_printf(sc->dev,
961 				"Using firmware currently running on NIC"
962 				 ".  For optimal\n");
963 			device_printf(sc->dev,
964 				 "performance consider loading optimized "
965 				 "firmware\n");
966 		}
967 		sc->fw_name = mxge_fw_unaligned;
968 		sc->tx_boundary = 2048;
969 		return 0;
970 	}
971 	/* clear confirmation addr */
972 	confirm = (volatile uint32_t *)sc->cmd;
973 	*confirm = 0;
974 	wmb();
975 	/* send a reload command to the bootstrap MCP, and wait for the
976 	   response in the confirmation address.  The firmware should
977 	   write a -1 there to indicate it is alive and well
978 	*/
979 
980 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
981 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
982 
983 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
984 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
985 	buf[2] = htobe32(0xffffffff);	/* confirm data */
986 
987 	/* FIX: All newest firmware should un-protect the bottom of
988 	   the sram before handoff. However, the very first interfaces
989 	   do not. Therefore the handoff copy must skip the first 8 bytes
990 	*/
991 					/* where the code starts*/
992 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
993 	buf[4] = htobe32(size - 8); 	/* length of code */
994 	buf[5] = htobe32(8);		/* where to copy to */
995 	buf[6] = htobe32(0);		/* where to jump to */
996 
997 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
998 	mxge_pio_copy(submit, buf, 64);
999 	wmb();
1000 	DELAY(1000);
1001 	wmb();
1002 	i = 0;
1003 	while (*confirm != 0xffffffff && i < 20) {
1004 		DELAY(1000*10);
1005 		i++;
1006 		bus_dmamap_sync(sc->cmd_dma.dmat,
1007 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1008 	}
1009 	if (*confirm != 0xffffffff) {
1010 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
1011 			confirm, *confirm);
1012 
1013 		return ENXIO;
1014 	}
1015 	return 0;
1016 }
1017 
1018 static int
1019 mxge_update_mac_address(mxge_softc_t *sc)
1020 {
1021 	mxge_cmd_t cmd;
1022 	uint8_t *addr = sc->mac_addr;
1023 	int status;
1024 
1025 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1026 		     | (addr[2] << 8) | addr[3]);
1027 
1028 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1029 
1030 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1031 	return status;
1032 }
1033 
1034 static int
1035 mxge_change_pause(mxge_softc_t *sc, int pause)
1036 {
1037 	mxge_cmd_t cmd;
1038 	int status;
1039 
1040 	if (pause)
1041 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1042 				       &cmd);
1043 	else
1044 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1045 				       &cmd);
1046 
1047 	if (status) {
1048 		device_printf(sc->dev, "Failed to set flow control mode\n");
1049 		return ENXIO;
1050 	}
1051 	sc->pause = pause;
1052 	return 0;
1053 }
1054 
1055 static void
1056 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1057 {
1058 	mxge_cmd_t cmd;
1059 	int status;
1060 
1061 	if (mxge_always_promisc)
1062 		promisc = 1;
1063 
1064 	if (promisc)
1065 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1066 				       &cmd);
1067 	else
1068 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1069 				       &cmd);
1070 
1071 	if (status) {
1072 		device_printf(sc->dev, "Failed to set promisc mode\n");
1073 	}
1074 }
1075 
1076 struct mxge_add_maddr_ctx {
1077 	mxge_softc_t *sc;
1078 	int error;
1079 };
1080 
1081 static u_int
1082 mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
1083 {
1084 	struct mxge_add_maddr_ctx *ctx = arg;
1085 	mxge_cmd_t cmd;
1086 
1087 	if (ctx->error != 0)
1088 		return (0);
1089 	bcopy(LLADDR(sdl), &cmd.data0, 4);
1090 	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
1091 	cmd.data0 = htonl(cmd.data0);
1092 	cmd.data1 = htonl(cmd.data1);
1093 
1094 	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1095 
1096 	return (1);
1097 }
1098 
1099 static void
1100 mxge_set_multicast_list(mxge_softc_t *sc)
1101 {
1102 	struct mxge_add_maddr_ctx ctx;
1103 	struct ifnet *ifp = sc->ifp;
1104 	mxge_cmd_t cmd;
1105 	int err;
1106 
1107 	/* This firmware is known to not support multicast */
1108 	if (!sc->fw_multicast_support)
1109 		return;
1110 
1111 	/* Disable multicast filtering while we play with the lists */
1112 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1113 	if (err != 0) {
1114 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1115 		       " error status: %d\n", err);
1116 		return;
1117 	}
1118 
1119 	if (sc->adopted_rx_filter_bug)
1120 		return;
1121 
1122 	if (ifp->if_flags & IFF_ALLMULTI)
1123 		/* request to disable multicast filtering, so quit here */
1124 		return;
1125 
1126 	/* Flush all the filters */
1127 
1128 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1129 	if (err != 0) {
1130 		device_printf(sc->dev,
1131 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1132 			      ", error status: %d\n", err);
1133 		return;
1134 	}
1135 
1136 	/* Walk the multicast list, and add each address */
1137 	ctx.sc = sc;
1138 	ctx.error = 0;
1139 	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
1140 	if (ctx.error != 0) {
1141 		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1142 		    "error status: %d\n", ctx.error);
1143 		/* abort, leaving multicast filtering off */
1144 		return;
1145 	}
1146 
1147 	/* Enable multicast filtering */
1148 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1149 	if (err != 0) {
1150 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1151 		       ", error status: %d\n", err);
1152 	}
1153 }
1154 
1155 static int
1156 mxge_max_mtu(mxge_softc_t *sc)
1157 {
1158 	mxge_cmd_t cmd;
1159 	int status;
1160 
1161 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1162 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1163 	/* try to set nbufs to see if we can
1164 	   use virtually contiguous jumbos */
1165 	   use virtually contiguous jumbos */
1166 	cmd.data0 = 0;
1167 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1168 			       &cmd);
1169 	if (status == 0)
1170 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1171 
1172 	/* otherwise, we're limited to MJUMPAGESIZE */
1173 	return MJUMPAGESIZE - MXGEFW_PAD;
1174 }
1175 
1176 static int
1177 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1178 {
1179 	struct mxge_slice_state *ss;
1180 	mxge_rx_done_t *rx_done;
1181 	volatile uint32_t *irq_claim;
1182 	mxge_cmd_t cmd;
1183 	int slice, status;
1184 
1185 	/* try to send a reset command to the card to see if it
1186 	   is alive */
1187 	memset(&cmd, 0, sizeof (cmd));
1188 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1189 	if (status != 0) {
1190 		device_printf(sc->dev, "failed reset\n");
1191 		return ENXIO;
1192 	}
1193 
1194 	mxge_dummy_rdma(sc, 1);
1195 
1196 	/* set the intrq size */
1197 	cmd.data0 = sc->rx_ring_size;
1198 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1199 
1200 	/*
1201 	 * Even though we already know how many slices are supported
1202 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1203 	 * has magic side effects, and must be called after a reset.
1204 	 * It must be called prior to calling any RSS related cmds,
1205 	 * including assigning an interrupt queue for anything but
1206 	 * slice 0.  It must also be called *after*
1207 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1208 	 * the firmware to compute offsets.
1209 	 */
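	/*
	 * The required ordering is therefore:
	 *   RESET -> SET_INTRQ_SIZE -> GET_MAX_RSS_QUEUES ->
	 *   ENABLE_RSS_QUEUES -> per-slice SET_INTRQ_DMA
	 */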
1210 
1211 	if (sc->num_slices > 1) {
1212 		/* ask for the maximum number of slices it supports */
1213 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1214 					   &cmd);
1215 		if (status != 0) {
1216 			device_printf(sc->dev,
1217 				      "failed to get number of slices\n");
1218 			return status;
1219 		}
1220 		/*
1221 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1222 		 * to setting up the interrupt queue DMA
1223 		 */
1224 		cmd.data0 = sc->num_slices;
1225 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1226 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1227 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1228 					   &cmd);
1229 		if (status != 0) {
1230 			device_printf(sc->dev,
1231 				      "failed to set number of slices\n");
1232 			return status;
1233 		}
1234 	}
1235 
1236 	if (interrupts_setup) {
1237 		/* Now exchange information about interrupts  */
1238 		for (slice = 0; slice < sc->num_slices; slice++) {
1239 			rx_done = &sc->ss[slice].rx_done;
1240 			memset(rx_done->entry, 0, sc->rx_ring_size);
1241 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1242 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1243 			cmd.data2 = slice;
1244 			status |= mxge_send_cmd(sc,
1245 						MXGEFW_CMD_SET_INTRQ_DMA,
1246 						&cmd);
1247 		}
1248 	}
1249 
1250 	status |= mxge_send_cmd(sc,
1251 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1252 
1253 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1254 
1255 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1256 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1257 
1258 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1259 				&cmd);
1260 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1261 	if (status != 0) {
1262 		device_printf(sc->dev, "failed to set interrupt parameters\n");
1263 		return status;
1264 	}
1265 
1266 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1267 
1268 	/* run a DMA benchmark */
1269 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1270 
1271 	for (slice = 0; slice < sc->num_slices; slice++) {
1272 		ss = &sc->ss[slice];
1273 
1274 		ss->irq_claim = irq_claim + (2 * slice);
1275 		/* reset mcp/driver shared state back to 0 */
1276 		ss->rx_done.idx = 0;
1277 		ss->rx_done.cnt = 0;
1278 		ss->tx.req = 0;
1279 		ss->tx.done = 0;
1280 		ss->tx.pkt_done = 0;
1281 		ss->tx.queue_active = 0;
1282 		ss->tx.activate = 0;
1283 		ss->tx.deactivate = 0;
1284 		ss->tx.wake = 0;
1285 		ss->tx.defrag = 0;
1286 		ss->tx.stall = 0;
1287 		ss->rx_big.cnt = 0;
1288 		ss->rx_small.cnt = 0;
1289 		ss->lc.lro_bad_csum = 0;
1290 		ss->lc.lro_queued = 0;
1291 		ss->lc.lro_flushed = 0;
1292 		if (ss->fw_stats != NULL) {
1293 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1294 		}
1295 	}
1296 	sc->rdma_tags_available = 15;
1297 	status = mxge_update_mac_address(sc);
1298 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1299 	mxge_change_pause(sc, sc->pause);
1300 	mxge_set_multicast_list(sc);
1301 	if (sc->throttle) {
1302 		cmd.data0 = sc->throttle;
1303 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1304 				  &cmd)) {
1305 			device_printf(sc->dev,
1306 				      "can't enable throttle\n");
1307 		}
1308 	}
1309 	return status;
1310 }
1311 
1312 static int
1313 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1314 {
1315 	mxge_cmd_t cmd;
1316 	mxge_softc_t *sc;
1317 	int err;
1318 	unsigned int throttle;
1319 
1320 	sc = arg1;
1321 	throttle = sc->throttle;
1322 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1323 	if (err != 0) {
1324 		return err;
1325 	}
1326 
1327 	if (throttle == sc->throttle)
1328 		return 0;
1329 
1330 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1331 		return EINVAL;
1332 
1333 	mtx_lock(&sc->driver_mtx);
1334 	cmd.data0 = throttle;
1335 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1336 	if (err == 0)
1337 		sc->throttle = throttle;
1338 	mtx_unlock(&sc->driver_mtx);
1339 	return err;
1340 }
1341 
1342 static int
1343 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1344 {
1345 	mxge_softc_t *sc;
1346 	unsigned int intr_coal_delay;
1347 	int err;
1348 
1349 	sc = arg1;
1350 	intr_coal_delay = sc->intr_coal_delay;
1351 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1352 	if (err != 0) {
1353 		return err;
1354 	}
1355 	if (intr_coal_delay == sc->intr_coal_delay)
1356 		return 0;
1357 
1358 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1359 		return EINVAL;
1360 
1361 	mtx_lock(&sc->driver_mtx);
1362 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1363 	sc->intr_coal_delay = intr_coal_delay;
1364 
1365 	mtx_unlock(&sc->driver_mtx);
1366 	return err;
1367 }
1368 
1369 static int
1370 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1371 {
1372 	mxge_softc_t *sc;
1373 	unsigned int enabled;
1374 	int err;
1375 
1376 	sc = arg1;
1377 	enabled = sc->pause;
1378 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1379 	if (err != 0) {
1380 		return err;
1381 	}
1382 	if (enabled == sc->pause)
1383 		return 0;
1384 
1385 	mtx_lock(&sc->driver_mtx);
1386 	err = mxge_change_pause(sc, enabled);
1387 	mtx_unlock(&sc->driver_mtx);
1388 	return err;
1389 }
1390 
1391 static int
1392 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1393 {
1394 	int err;
1395 
1396 	if (arg1 == NULL)
1397 		return EFAULT;
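	/*
	 * Pass the byte-swapped value via arg2 with arg1 cleared:
	 * sysctl_handle_int() then exports it as a read-only constant
	 * instead of dereferencing arg1, which points at the big-endian
	 * firmware counter.
	 */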
1398 	arg2 = be32toh(*(int *)arg1);
1399 	arg1 = NULL;
1400 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1401 
1402 	return err;
1403 }
1404 
1405 static void
1406 mxge_rem_sysctls(mxge_softc_t *sc)
1407 {
1408 	struct mxge_slice_state *ss;
1409 	int slice;
1410 
1411 	if (sc->slice_sysctl_tree == NULL)
1412 		return;
1413 
1414 	for (slice = 0; slice < sc->num_slices; slice++) {
1415 		ss = &sc->ss[slice];
1416 		if (ss == NULL || ss->sysctl_tree == NULL)
1417 			continue;
1418 		sysctl_ctx_free(&ss->sysctl_ctx);
1419 		ss->sysctl_tree = NULL;
1420 	}
1421 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1422 	sc->slice_sysctl_tree = NULL;
1423 }
1424 
1425 static void
1426 mxge_add_sysctls(mxge_softc_t *sc)
1427 {
1428 	struct sysctl_ctx_list *ctx;
1429 	struct sysctl_oid_list *children;
1430 	mcp_irq_data_t *fw;
1431 	struct mxge_slice_state *ss;
1432 	int slice;
1433 	char slice_num[8];
1434 
1435 	ctx = device_get_sysctl_ctx(sc->dev);
1436 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1437 	fw = sc->ss[0].fw_stats;
1438 
1439 	/* random information */
1440 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1441 		       "firmware_version",
1442 		       CTLFLAG_RD, sc->fw_version,
1443 		       0, "firmware version");
1444 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1445 		       "serial_number",
1446 		       CTLFLAG_RD, sc->serial_number_string,
1447 		       0, "serial number");
1448 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1449 		       "product_code",
1450 		       CTLFLAG_RD, sc->product_code_string,
1451 		       0, "product_code");
1452 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1453 		       "pcie_link_width",
1454 		       CTLFLAG_RD, &sc->link_width,
1455 		       0, "PCIe link width");
1456 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1457 		       "tx_boundary",
1458 		       CTLFLAG_RD, &sc->tx_boundary,
1459 		       0, "tx_boundary");
1460 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1461 		       "write_combine",
1462 		       CTLFLAG_RD, &sc->wc,
1463 		       0, "write combining PIO?");
1464 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1465 		       "read_dma_MBs",
1466 		       CTLFLAG_RD, &sc->read_dma,
1467 		       0, "DMA Read speed in MB/s");
1468 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1469 		       "write_dma_MBs",
1470 		       CTLFLAG_RD, &sc->write_dma,
1471 		       0, "DMA Write speed in MB/s");
1472 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1473 		       "read_write_dma_MBs",
1474 		       CTLFLAG_RD, &sc->read_write_dma,
1475 		       0, "DMA concurrent Read/Write speed in MB/s");
1476 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1477 		       "watchdog_resets",
1478 		       CTLFLAG_RD, &sc->watchdog_resets,
1479 		       0, "Number of times NIC was reset");
1480 
1481 	/* performance related tunables */
1482 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1483 	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1484 	    sc, 0, mxge_change_intr_coal, "I",
1485 	    "interrupt coalescing delay in usecs");
1486 
1487 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1488 	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1489 	    mxge_change_throttle, "I", "transmit throttling");
1490 
1491 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1492 	    "flow_control_enabled",
1493 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1494 	    mxge_change_flow_control, "I",
1495 	    "whether flow control (pause frames) is enabled");
1496 
1497 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1498 		       "deassert_wait",
1499 		       CTLFLAG_RW, &mxge_deassert_wait,
1500 		       0, "Wait for IRQ line to go low in ihandler");
1501 
1502 	/* stats block from firmware is in network byte order.
1503 	   Need to swap it */
1504 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1505 	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1506 	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
1507 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1508 	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1509 	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
1510 	    "rdma_tags_available");
1511 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1512 	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1513 	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
1514 	    "dropped_bad_crc32");
1515 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1516 	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1517 	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
1518 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1519 	    "dropped_link_error_or_filtered",
1520 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1521 	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
1522 	    "dropped_link_error_or_filtered");
1523 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1524 	    "dropped_link_overflow",
1525 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1526 	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
1527 	    "dropped_link_overflow");
1528 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1529 	    "dropped_multicast_filtered",
1530 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1531 	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
1532 	    "dropped_multicast_filtered");
1533 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1534 	    "dropped_no_big_buffer",
1535 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1536 	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
1537 	    "dropped_no_big_buffer");
1538 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1539 	    "dropped_no_small_buffer",
1540 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1541 	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
1542 	    "dropped_no_small_buffer");
1543 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544 	    "dropped_overrun",
1545 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1546 	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
1547 	    "dropped_overrun");
1548 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1550 	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
1551 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1552 	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1553 	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");
1554 
1555 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1556 	    "dropped_unicast_filtered",
1557 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1558 	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
1559 	    "dropped_unicast_filtered");
1560 
1561 	/* verbose printing? */
1562 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1563 		       "verbose",
1564 		       CTLFLAG_RW, &mxge_verbose,
1565 		       0, "verbose printing");
1566 
1567 	/* add counters exported for debugging from all slices */
1568 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1569 	sc->slice_sysctl_tree =
1570 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1571 		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1572 
1573 	for (slice = 0; slice < sc->num_slices; slice++) {
1574 		ss = &sc->ss[slice];
1575 		sysctl_ctx_init(&ss->sysctl_ctx);
1576 		ctx = &ss->sysctl_ctx;
1577 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1578 		sprintf(slice_num, "%d", slice);
1579 		ss->sysctl_tree =
1580 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1581 			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1582 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1583 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1584 			       "rx_small_cnt",
1585 			       CTLFLAG_RD, &ss->rx_small.cnt,
1586 			       0, "rx_small_cnt");
1587 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1588 			       "rx_big_cnt",
1589 			       CTLFLAG_RD, &ss->rx_big.cnt,
1590 			       0, "rx_big_cnt");
1591 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1592 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1593 			       0, "number of lro merge queues flushed");
1594 
1595 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1596 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1597 			       0, "number of bad csums preventing LRO");
1598 
1599 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1600 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1601 			       0, "number of frames appended to lro merge "
1602 			       "queues");
1603 
1604 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1605 			       "tx_req",
1606 			       CTLFLAG_RD, &ss->tx.req,
1607 			       0, "tx_req");
1608 
1609 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1610 			       "tx_done",
1611 			       CTLFLAG_RD, &ss->tx.done,
1612 			       0, "tx_done");
1613 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1614 			       "tx_pkt_done",
1615 			       CTLFLAG_RD, &ss->tx.pkt_done,
1616 			       0, "tx_pkt_done");
1617 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1618 			       "tx_stall",
1619 			       CTLFLAG_RD, &ss->tx.stall,
1620 			       0, "tx_stall");
1621 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1622 			       "tx_wake",
1623 			       CTLFLAG_RD, &ss->tx.wake,
1624 			       0, "tx_wake");
1625 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1626 			       "tx_defrag",
1627 			       CTLFLAG_RD, &ss->tx.defrag,
1628 			       0, "tx_defrag");
1629 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1630 			       "tx_queue_active",
1631 			       CTLFLAG_RD, &ss->tx.queue_active,
1632 			       0, "tx_queue_active");
1633 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1634 			       "tx_activate",
1635 			       CTLFLAG_RD, &ss->tx.activate,
1636 			       0, "tx_activate");
1637 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1638 			       "tx_deactivate",
1639 			       CTLFLAG_RD, &ss->tx.deactivate,
1640 			       0, "tx_deactivate");
1641 	}
1642 }
1643 
1644 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1645    backwards one at a time and handle ring wraps */
1646 
1647 static inline void
1648 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1649 			    mcp_kreq_ether_send_t *src, int cnt)
1650 {
1651 	int idx, starting_slot;
1652 	starting_slot = tx->req;
1653 	while (cnt > 1) {
1654 		cnt--;
1655 		idx = (starting_slot + cnt) & tx->mask;
1656 		mxge_pio_copy(&tx->lanai[idx],
1657 			      &src[cnt], sizeof(*src));
1658 		wmb();
1659 	}
1660 }
1661 
1662 /*
1663  * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1664  * at most 32 bytes at a time, so as to avoid involving the software
1665  * pio handler in the nic.  We re-write the first segment's flags
1666  * to mark them valid only after writing the entire chain.
1667  */
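/*
 * Writing the first segment's flags last is the hand-off mechanism:
 * the firmware only treats a request slot as valid once its flags are
 * set, so clearing src->flags up front and restoring it with a single
 * final 32-bit store publishes the whole chain at once.
 */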
1668 
1669 static inline void
1670 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1671 		  int cnt)
1672 {
1673 	int idx, i;
1674 	uint32_t *src_ints;
1675 	volatile uint32_t *dst_ints;
1676 	mcp_kreq_ether_send_t *srcp;
1677 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1678 	uint8_t last_flags;
1679 
1680 	idx = tx->req & tx->mask;
1681 
1682 	last_flags = src->flags;
1683 	src->flags = 0;
1684 	wmb();
1685 	dst = dstp = &tx->lanai[idx];
1686 	srcp = src;
1687 
1688 	if ((idx + cnt) < tx->mask) {
1689 		for (i = 0; i < (cnt - 1); i += 2) {
1690 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1691 			wmb(); /* force write every 32 bytes */
1692 			srcp += 2;
1693 			dstp += 2;
1694 		}
1695 	} else {
1696 		/* submit all but the first request, and ensure
1697 		   that it is submitted below */
1698 		mxge_submit_req_backwards(tx, src, cnt);
1699 		i = 0;
1700 	}
1701 	if (i < cnt) {
1702 		/* submit the first request */
1703 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1704 		wmb(); /* barrier before setting valid flag */
1705 	}
1706 
1707 	/* re-write the last 32-bits with the valid flags */
1708 	src->flags = last_flags;
1709 	src_ints = (uint32_t *)src;
1710 	src_ints+=3;
1711 	dst_ints = (volatile uint32_t *)dst;
1712 	dst_ints+=3;
1713 	*dst_ints =  *src_ints;
1714 	tx->req += cnt;
1715 	wmb();
1716 }
1717 
1718 static int
1719 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1720     struct mxge_pkt_info *pi)
1721 {
1722 	struct ether_vlan_header *eh;
1723 	uint16_t etype;
1724 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1725 #if IFCAP_TSO6 && defined(INET6)
1726 	int nxt;
1727 #endif
1728 
1729 	eh = mtod(m, struct ether_vlan_header *);
1730 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1731 		etype = ntohs(eh->evl_proto);
1732 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1733 	} else {
1734 		etype = ntohs(eh->evl_encap_proto);
1735 		pi->ip_off = ETHER_HDR_LEN;
1736 	}
1737 
1738 	switch (etype) {
1739 	case ETHERTYPE_IP:
1740 		/*
1741 		 * ensure ip header is in first mbuf, copy it to a
1742 		 * scratch buffer if not
1743 		 */
1744 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1745 		pi->ip6 = NULL;
1746 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1747 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1748 			    ss->scratch);
1749 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1750 		}
1751 		pi->ip_hlen = pi->ip->ip_hl << 2;
1752 		if (!tso)
1753 			return 0;
1754 
1755 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1756 		    sizeof(struct tcphdr))) {
1757 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1758 			    sizeof(struct tcphdr), ss->scratch);
1759 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1760 		}
1761 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1762 		break;
1763 #if IFCAP_TSO6 && defined(INET6)
1764 	case ETHERTYPE_IPV6:
1765 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1766 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1767 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1768 			    ss->scratch);
1769 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1770 		}
1771 		nxt = 0;
1772 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1773 		pi->ip_hlen -= pi->ip_off;
1774 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1775 			return EINVAL;
1776 
1777 		if (!tso)
1778 			return 0;
1779 
1780 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1781 			return EINVAL;
1782 
1783 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1784 		    sizeof(struct tcphdr))) {
1785 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1786 			    sizeof(struct tcphdr), ss->scratch);
1787 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1788 		}
1789 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1790 		break;
1791 #endif
1792 	default:
1793 		return EINVAL;
1794 	}
1795 	return 0;
1796 }
1797 
1798 #if IFCAP_TSO4
1799 
1800 static void
1801 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1802 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1803 {
1804 	mxge_tx_ring_t *tx;
1805 	mcp_kreq_ether_send_t *req;
1806 	bus_dma_segment_t *seg;
1807 	uint32_t low, high_swapped;
1808 	int len, seglen, cum_len, cum_len_next;
1809 	int next_is_first, chop, cnt, rdma_count, small;
1810 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1811 	uint8_t flags, flags_next;
1812 	static int once;
1813 
1814 	mss = m->m_pkthdr.tso_segsz;
1815 
1816 	/* negative cum_len signifies to the
1817 	 * send loop that we are still in the
1818 	 * header portion of the TSO packet.
1819 	 */
1820 
1821 	cksum_offset = pi->ip_off + pi->ip_hlen;
1822 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1823 
1824 	/* TSO implies checksum offload on this hardware */
1825 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1826 		/*
1827 		 * If packet has full TCP csum, replace it with pseudo hdr
1828 		 * sum that the NIC expects, otherwise the NIC will emit
1829 		 * packets with bad TCP checksums.
1830 		 */
1831 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1832 		if (pi->ip6) {
1833 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1834 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1835 			sum = in6_cksum_pseudo(pi->ip6,
1836 			    m->m_pkthdr.len - cksum_offset,
1837 			    IPPROTO_TCP, 0);
1838 #endif
1839 		} else {
1840 #ifdef INET
1841 			m->m_pkthdr.csum_flags |= CSUM_TCP;
1842 			sum = in_pseudo(pi->ip->ip_src.s_addr,
1843 			    pi->ip->ip_dst.s_addr,
1844 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1845 				    cksum_offset)));
1846 #endif
1847 		}
1848 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1849 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1850 	}
1851 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1852 
1853 	/* for TSO, pseudo_hdr_offset holds mss.
1854 	 * The firmware figures out where to put
1855 	 * the checksum by parsing the header. */
1856 	pseudo_hdr_offset = htobe16(mss);
1857 
1858 	if (pi->ip6) {
1859 		/*
1860 		 * for IPv6 TSO, the "checksum offset" is re-purposed
1861 		 * to store the TCP header len
1862 		 */
1863 		cksum_offset = (pi->tcp->th_off << 2);
1864 	}
1865 
1866 	tx = &ss->tx;
1867 	req = tx->req_list;
1868 	seg = tx->seg_list;
1869 	cnt = 0;
1870 	rdma_count = 0;
1871 	/* "rdma_count" is the number of RDMAs belonging to the
1872 	 * current packet BEFORE the current send request. For
1873 	 * non-TSO packets, this is equal to "count".
1874 	 * For TSO packets, rdma_count needs to be reset
1875 	 * to 0 after a segment cut.
1876 	 *
1877 	 * The rdma_count field of the send request is
1878 	 * the number of RDMAs of the packet starting at
1879 	 * that request. For TSO send requests with one or more cuts
1880 	 * in the middle, this is the number of RDMAs starting
1881 	 * after the last cut in the request. All previous
1882 	 * segments before the last cut implicitly have 1 RDMA.
1883 	 *
1884 	 * Since the number of RDMAs is not known beforehand,
1885 	 * it must be filled-in retroactively - after each
1886 	 * segmentation cut or at the end of the entire packet.
1887 	 */
1888 
1889 	while (busdma_seg_cnt) {
1890 		/* Break the busdma segment up into pieces */
1891 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1892 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1893 		len = seg->ds_len;
1894 
1895 		while (len) {
1896 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1897 			seglen = len;
1898 			cum_len_next = cum_len + seglen;
1899 			(req-rdma_count)->rdma_count = rdma_count + 1;
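			/* Branchless flag arithmetic: "chop" is 1 when this
			   piece crosses an mss boundary and "next_is_first"
			   is 1 when the next piece starts a fresh TSO
			   segment; either condition resets rdma_count so
			   the retroactive store above can fill in the RDMA
			   count for the segment just ended. */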
1900 			if (__predict_true(cum_len >= 0)) {
1901 				/* payload */
1902 				chop = (cum_len_next > mss);
1903 				cum_len_next = cum_len_next % mss;
1904 				next_is_first = (cum_len_next == 0);
1905 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1906 				flags_next |= next_is_first *
1907 					MXGEFW_FLAGS_FIRST;
1908 				rdma_count |= -(chop | next_is_first);
1909 				rdma_count += chop & !next_is_first;
1910 			} else if (cum_len_next >= 0) {
1911 				/* header ends */
1912 				rdma_count = -1;
1913 				cum_len_next = 0;
1914 				seglen = -cum_len;
1915 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1916 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1917 					MXGEFW_FLAGS_FIRST |
1918 					(small * MXGEFW_FLAGS_SMALL);
1919 			}
1920 
1921 			req->addr_high = high_swapped;
1922 			req->addr_low = htobe32(low);
1923 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1924 			req->pad = 0;
1925 			req->rdma_count = 1;
1926 			req->length = htobe16(seglen);
1927 			req->cksum_offset = cksum_offset;
1928 			req->flags = flags | ((cum_len & 1) *
1929 					      MXGEFW_FLAGS_ALIGN_ODD);
1930 			low += seglen;
1931 			len -= seglen;
1932 			cum_len = cum_len_next;
1933 			flags = flags_next;
1934 			req++;
1935 			cnt++;
1936 			rdma_count++;
1937 			if (cksum_offset != 0 && !pi->ip6) {
1938 				if (__predict_false(cksum_offset > seglen))
1939 					cksum_offset -= seglen;
1940 				else
1941 					cksum_offset = 0;
1942 			}
1943 			if (__predict_false(cnt > tx->max_desc))
1944 				goto drop;
1945 		}
1946 		busdma_seg_cnt--;
1947 		seg++;
1948 	}
1949 	(req-rdma_count)->rdma_count = rdma_count;
1950 
1951 	do {
1952 		req--;
1953 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1954 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1955 
1956 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1957 	mxge_submit_req(tx, tx->req_list, cnt);
1958 
1959 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1960 		/* tell the NIC to start polling this slice */
1961 		*tx->send_go = 1;
1962 		tx->queue_active = 1;
1963 		tx->activate++;
1964 		wmb();
1965 	}
1966 
1967 	return;
1968 
1969 drop:
1970 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1971 	m_freem(m);
1972 	ss->oerrors++;
1973 	if (!once) {
1974 		printf("tx->max_desc exceeded via TSO!\n");
1975 		printf("mss = %d, seg offset = %ld, max_desc = %d\n", mss,
1976 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1977 		once = 1;
1978 	}
1979 	return;
1980 
1981 }
1982 
1983 #endif /* IFCAP_TSO4 */
1984 
1985 #ifdef MXGE_NEW_VLAN_API
1986 /*
1987  * We reproduce the software vlan tag insertion from
1988  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1989  * vlan tag insertion. We need to advertise this in order to have the
1990  * vlan interface respect our csum offload flags.
1991  */
1992 static struct mbuf *
1993 mxge_vlan_tag_insert(struct mbuf *m)
1994 {
1995 	struct ether_vlan_header *evl;
1996 
1997 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
1998 	if (__predict_false(m == NULL))
1999 		return NULL;
2000 	if (m->m_len < sizeof(*evl)) {
2001 		m = m_pullup(m, sizeof(*evl));
2002 		if (__predict_false(m == NULL))
2003 			return NULL;
2004 	}
2005 	/*
2006 	 * Transform the Ethernet header into an Ethernet header
2007 	 * with 802.1Q encapsulation.
2008 	 */
2009 	evl = mtod(m, struct ether_vlan_header *);
2010 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2011 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2012 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2013 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2014 	m->m_flags &= ~M_VLANTAG;
2015 	return m;
2016 }
2017 #endif /* MXGE_NEW_VLAN_API */
2018 
2019 static void
2020 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2021 {
2022 	struct mxge_pkt_info pi = {0,0,0,0};
2023 	mxge_softc_t *sc;
2024 	mcp_kreq_ether_send_t *req;
2025 	bus_dma_segment_t *seg;
2026 	struct mbuf *m_tmp;
2027 	mxge_tx_ring_t *tx;
2028 	int cnt, cum_len, err, i, idx, odd_flag;
2029 	uint16_t pseudo_hdr_offset;
2030 	uint8_t flags, cksum_offset;
2031 
2032 	sc = ss->sc;
2033 	tx = &ss->tx;
2034 
2035 #ifdef MXGE_NEW_VLAN_API
2036 	if (m->m_flags & M_VLANTAG) {
2037 		m = mxge_vlan_tag_insert(m);
2038 		if (__predict_false(m == NULL))
2039 			goto drop_without_m;
2040 	}
2041 #endif
2042 	if (m->m_pkthdr.csum_flags &
2043 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2044 		if (mxge_parse_tx(ss, m, &pi))
2045 			goto drop;
2046 	}
2047 
2048 	/* (try to) map the frame for DMA */
2049 	idx = tx->req & tx->mask;
2050 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2051 				      m, tx->seg_list, &cnt,
2052 				      BUS_DMA_NOWAIT);
2053 	if (__predict_false(err == EFBIG)) {
2054 		/* Too many segments in the chain.  Try
2055 		   to defrag */
2056 		m_tmp = m_defrag(m, M_NOWAIT);
2057 		if (m_tmp == NULL) {
2058 			goto drop;
2059 		}
2060 		ss->tx.defrag++;
2061 		m = m_tmp;
2062 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2063 					      tx->info[idx].map,
2064 					      m, tx->seg_list, &cnt,
2065 					      BUS_DMA_NOWAIT);
2066 	}
2067 	if (__predict_false(err != 0)) {
2068 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2069 			      " packet len = %d\n", err, m->m_pkthdr.len);
2070 		goto drop;
2071 	}
2072 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2073 			BUS_DMASYNC_PREWRITE);
2074 	tx->info[idx].m = m;
2075 
2076 #if IFCAP_TSO4
2077 	/* TSO is different enough, we handle it in another routine */
2078 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2079 		mxge_encap_tso(ss, m, cnt, &pi);
2080 		return;
2081 	}
2082 #endif
2083 
2084 	req = tx->req_list;
2085 	cksum_offset = 0;
2086 	pseudo_hdr_offset = 0;
2087 	flags = MXGEFW_FLAGS_NO_TSO;
2088 
2089 	/* checksum offloading? */
2090 	if (m->m_pkthdr.csum_flags &
2091 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2092 		/* ensure ip header is in first mbuf, copy
2093 		   it to a scratch buffer if not */
2094 		cksum_offset = pi.ip_off + pi.ip_hlen;
2095 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2096 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2097 		req->cksum_offset = cksum_offset;
2098 		flags |= MXGEFW_FLAGS_CKSUM;
2099 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2100 	} else {
2101 		odd_flag = 0;
2102 	}
2103 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2104 		flags |= MXGEFW_FLAGS_SMALL;
2105 
2106 	/* convert segments into a request list */
2107 	cum_len = 0;
2108 	seg = tx->seg_list;
2109 	req->flags = MXGEFW_FLAGS_FIRST;
2110 	for (i = 0; i < cnt; i++) {
2111 		req->addr_low =
2112 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2113 		req->addr_high =
2114 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2115 		req->length = htobe16(seg->ds_len);
2116 		req->cksum_offset = cksum_offset;
2117 		if (cksum_offset > seg->ds_len)
2118 			cksum_offset -= seg->ds_len;
2119 		else
2120 			cksum_offset = 0;
2121 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2122 		req->pad = 0; /* complete solid 16-byte block */
2123 		req->rdma_count = 1;
2124 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2125 		cum_len += seg->ds_len;
2126 		seg++;
2127 		req++;
2128 		req->flags = 0;
2129 	}
2130 	req--;
2131 	/* pad runts to 60 bytes */
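	/* (60 == ETHER_MIN_LEN less the 4-byte FCS; the extra
	   descriptor DMAs zeros from the driver's dedicated pad
	   buffer to fill out the frame) */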
2132 	if (cum_len < 60) {
2133 		req++;
2134 		req->addr_low =
2135 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2136 		req->addr_high =
2137 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2138 		req->length = htobe16(60 - cum_len);
2139 		req->cksum_offset = 0;
2140 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2141 		req->pad = 0; /* complete solid 16-byte block */
2142 		req->rdma_count = 1;
2143 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2144 		cnt++;
2145 	}
2146 
2147 	tx->req_list[0].rdma_count = cnt;
2148 #if 0
2149 	/* print what the firmware will see */
2150 	for (i = 0; i < cnt; i++) {
2151 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2152 		    "cso:%d, flags:0x%x, rdma:%d\n",
2153 		    i, (int)ntohl(tx->req_list[i].addr_high),
2154 		    (int)ntohl(tx->req_list[i].addr_low),
2155 		    (int)ntohs(tx->req_list[i].length),
2156 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2157 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2158 		    tx->req_list[i].rdma_count);
2159 	}
2160 	printf("--------------\n");
2161 #endif
2162 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2163 	mxge_submit_req(tx, tx->req_list, cnt);
2164 
2165 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2166 		/* tell the NIC to start polling this slice */
2167 		*tx->send_go = 1;
2168 		tx->queue_active = 1;
2169 		tx->activate++;
2170 		wmb();
2171 	}
2172 
2173 	return;
2174 
2175 drop:
2176 	m_freem(m);
2177 drop_without_m:
2178 	ss->oerrors++;
2179 	return;
2180 }
2181 
2182 static void
2183 mxge_qflush(struct ifnet *ifp)
2184 {
2185 	mxge_softc_t *sc = ifp->if_softc;
2186 	mxge_tx_ring_t *tx;
2187 	struct mbuf *m;
2188 	int slice;
2189 
2190 	for (slice = 0; slice < sc->num_slices; slice++) {
2191 		tx = &sc->ss[slice].tx;
2192 		mtx_lock(&tx->mtx);
2193 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2194 			m_freem(m);
2195 		mtx_unlock(&tx->mtx);
2196 	}
2197 	if_qflush(ifp);
2198 }
2199 
2200 static inline void
2201 mxge_start_locked(struct mxge_slice_state *ss)
2202 {
2203 	mxge_softc_t *sc;
2204 	struct mbuf *m;
2205 	struct ifnet *ifp;
2206 	mxge_tx_ring_t *tx;
2207 
2208 	sc = ss->sc;
2209 	ifp = sc->ifp;
2210 	tx = &ss->tx;
2211 
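	/* tx->mask - (tx->req - tx->done) is the number of free
	   descriptors; stop dequeuing once fewer than max_desc
	   remain so a maximally fragmented frame still fits */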
2212 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2213 		m = drbr_dequeue(ifp, tx->br);
2214 		if (m == NULL) {
2215 			return;
2216 		}
2217 		/* let BPF see it */
2218 		BPF_MTAP(ifp, m);
2219 
2220 		/* give it to the nic */
2221 		mxge_encap(ss, m);
2222 	}
2223 	/* ran out of transmit slots */
2224 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2225 	    && (!drbr_empty(ifp, tx->br))) {
2226 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2227 		tx->stall++;
2228 	}
2229 }
2230 
2231 static int
2232 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2233 {
2234 	mxge_softc_t *sc;
2235 	struct ifnet *ifp;
2236 	mxge_tx_ring_t *tx;
2237 	int err;
2238 
2239 	sc = ss->sc;
2240 	ifp = sc->ifp;
2241 	tx = &ss->tx;
2242 
2243 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2244 	    IFF_DRV_RUNNING) {
2245 		err = drbr_enqueue(ifp, tx->br, m);
2246 		return (err);
2247 	}
2248 
2249 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2250 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2251 		/* let BPF see it */
2252 		BPF_MTAP(ifp, m);
2253 		/* give it to the nic */
2254 		mxge_encap(ss, m);
2255 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2256 		return (err);
2257 	}
2258 	if (!drbr_empty(ifp, tx->br))
2259 		mxge_start_locked(ss);
2260 	return (0);
2261 }
2262 
2263 static int
2264 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2265 {
2266 	mxge_softc_t *sc = ifp->if_softc;
2267 	struct mxge_slice_state *ss;
2268 	mxge_tx_ring_t *tx;
2269 	int err = 0;
2270 	int slice;
2271 
2272 	slice = m->m_pkthdr.flowid;
2273 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2274 
2275 	ss = &sc->ss[slice];
2276 	tx = &ss->tx;
2277 
2278 	if (mtx_trylock(&tx->mtx)) {
2279 		err = mxge_transmit_locked(ss, m);
2280 		mtx_unlock(&tx->mtx);
2281 	} else {
2282 		err = drbr_enqueue(ifp, tx->br, m);
2283 	}
2284 
2285 	return (err);
2286 }
2287 
2288 static void
2289 mxge_start(struct ifnet *ifp)
2290 {
2291 	mxge_softc_t *sc = ifp->if_softc;
2292 	struct mxge_slice_state *ss;
2293 
2294 	/* only use the first slice for now */
2295 	ss = &sc->ss[0];
2296 	mtx_lock(&ss->tx.mtx);
2297 	mxge_start_locked(ss);
2298 	mtx_unlock(&ss->tx.mtx);
2299 }
2300 
2301 /*
2302  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2303  * at most 32 bytes at a time, so as to avoid involving the software
2304  * pio handler in the nic.   We re-write the first segment's low
2305  * DMA address to mark it valid only after we write the entire chunk
2306  * in a burst
2307  */
2308 static inline void
2309 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2310 		mcp_kreq_ether_recv_t *src)
2311 {
2312 	uint32_t low;
2313 
2314 	low = src->addr_low;
2315 	src->addr_low = 0xffffffff;
2316 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2317 	wmb();
2318 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2319 	wmb();
2320 	src->addr_low = low;
2321 	dst->addr_low = low;
2322 	wmb();
2323 }
2324 
2325 static int
2326 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2327 {
2328 	bus_dma_segment_t seg;
2329 	struct mbuf *m;
2330 	mxge_rx_ring_t *rx = &ss->rx_small;
2331 	int cnt, err;
2332 
2333 	m = m_gethdr(M_NOWAIT, MT_DATA);
2334 	if (m == NULL) {
2335 		rx->alloc_fail++;
2336 		err = ENOBUFS;
2337 		goto done;
2338 	}
2339 	m->m_len = MHLEN;
2340 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2341 				      &seg, &cnt, BUS_DMA_NOWAIT);
2342 	if (err != 0) {
2343 		m_free(m);
2344 		goto done;
2345 	}
2346 	rx->info[idx].m = m;
2347 	rx->shadow[idx].addr_low =
2348 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2349 	rx->shadow[idx].addr_high =
2350 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2351 
2352 done:
2353 	if ((idx & 7) == 7)
2354 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2355 	return err;
2356 }
2357 
2358 static int
2359 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2360 {
2361 	bus_dma_segment_t seg[3];
2362 	struct mbuf *m;
2363 	mxge_rx_ring_t *rx = &ss->rx_big;
2364 	int cnt, err, i;
2365 
2366 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2367 	if (m == NULL) {
2368 		rx->alloc_fail++;
2369 		err = ENOBUFS;
2370 		goto done;
2371 	}
2372 	m->m_len = rx->mlen;
2373 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2374 				      seg, &cnt, BUS_DMA_NOWAIT);
2375 	if (err != 0) {
2376 		m_free(m);
2377 		goto done;
2378 	}
2379 	rx->info[idx].m = m;
2380 	rx->shadow[idx].addr_low =
2381 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2382 	rx->shadow[idx].addr_high =
2383 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2384 
2385 done:
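	/* big receives may span nbufs ring entries, so step idx across
	   all of them, submitting each time a group of 8 completes */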
2386 	for (i = 0; i < rx->nbufs; i++) {
2387 		if ((idx & 7) == 7) {
2388 			mxge_submit_8rx(&rx->lanai[idx - 7],
2389 					&rx->shadow[idx - 7]);
2390 		}
2391 		idx++;
2392 	}
2393 	return err;
2394 }
2395 
2396 #ifdef INET6
2397 
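/*
 * Plain 16-bit one's-complement sum over len bytes.  The two folds
 * at the end propagate any carries out of the low 16 bits back in
 * (the "end-around carry" of one's-complement arithmetic).
 */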
2398 static uint16_t
2399 mxge_csum_generic(uint16_t *raw, int len)
2400 {
2401 	uint32_t csum;
2402 
2403 	csum = 0;
2404 	while (len > 0) {
2405 		csum += *raw;
2406 		raw++;
2407 		len -= 2;
2408 	}
2409 	csum = (csum >> 16) + (csum & 0xffff);
2410 	csum = (csum >> 16) + (csum & 0xffff);
2411 	return (uint16_t)csum;
2412 }
2413 
2414 static inline uint16_t
2415 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2416 {
2417 	uint32_t partial;
2418 	int nxt, cksum_offset;
2419 	struct ip6_hdr *ip6 = p;
2420 	uint16_t c;
2421 
2422 	nxt = ip6->ip6_nxt;
2423 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2424 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2425 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2426 					   IPPROTO_IPV6, &nxt);
2427 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2428 			return (1);
2429 	}
2430 
2431 	/*
2432 	 * IPv6 headers do not contain a checksum, and hence
2433 	 * do not checksum to zero, so they don't "fall out"
2434 	 * of the partial checksum calculation like IPv4
2435 	 * headers do.  We need to fix the partial checksum by
2436 	 * subtracting the checksum of the IPv6 header.
2437 	 */
2438 
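	/* one's-complement subtraction: add the complement of the
	   IPv6 header sum and fold the end-around carry back in */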
2439 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2440 				    ETHER_HDR_LEN);
2441 	csum += ~partial;
2442 	csum += (csum < ~partial);
2443 	csum = (csum >> 16) + (csum & 0xFFFF);
2444 	csum = (csum >> 16) + (csum & 0xFFFF);
2445 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2446 			     csum);
2447 	c ^= 0xffff;
2448 	return (c);
2449 }
2450 #endif /* INET6 */
2451 /*
2452  *  Myri10GE hardware checksums are not valid if the sender
2453  *  padded the frame with non-zero padding.  This is because
2454  *  the firmware just does a simple 16-bit 1s complement
2455  *  checksum across the entire frame, excluding the first 14
2456  *  bytes.  It is best to simply check the checksum and
2457  *  tell the stack about it only if the checksum is good
2458  */
2459 
2460 static inline uint16_t
2461 mxge_rx_csum(struct mbuf *m, int csum)
2462 {
2463 	struct ether_header *eh;
2464 #ifdef INET
2465 	struct ip *ip;
2466 #endif
2467 #if defined(INET) || defined(INET6)
2468 	int cap = m->m_pkthdr.rcvif->if_capenable;
2469 #endif
2470 	uint16_t c, etype;
2471 
2472 	eh = mtod(m, struct ether_header *);
2473 	etype = ntohs(eh->ether_type);
2474 	switch (etype) {
2475 #ifdef INET
2476 	case ETHERTYPE_IP:
2477 		if ((cap & IFCAP_RXCSUM) == 0)
2478 			return (1);
2479 		ip = (struct ip *)(eh + 1);
2480 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2481 			return (1);
2482 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2483 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2484 				    (ip->ip_hl << 2) + ip->ip_p));
2485 		c ^= 0xffff;
2486 		break;
2487 #endif
2488 #ifdef INET6
2489 	case ETHERTYPE_IPV6:
2490 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2491 			return (1);
2492 		c = mxge_rx_csum6((eh + 1), m, csum);
2493 		break;
2494 #endif
2495 	default:
2496 		c = 1;
2497 	}
2498 	return (c);
2499 }
2500 
2501 static void
2502 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2503 {
2504 	struct ether_vlan_header *evl;
2505 	uint32_t partial;
2506 
2507 	evl = mtod(m, struct ether_vlan_header *);
2508 
2509 	/*
2510 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2511 	 * after what the firmware thought was the end of the ethernet
2512 	 * header.
2513 	 */
2514 
2515 	/* put checksum into host byte order */
2516 	*csum = ntohs(*csum);
2517 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2518 	(*csum) += ~partial;
2519 	(*csum) +=  ((*csum) < ~partial);
2520 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2521 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2522 
2523 	/* restore checksum to network byte order;
2524 	   later consumers expect this */
2525 	*csum = htons(*csum);
2526 
2527 	/* save the tag */
2528 #ifdef MXGE_NEW_VLAN_API
2529 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2530 #else
2531 	{
2532 		struct m_tag *mtag;
2533 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2534 				   M_NOWAIT);
2535 		if (mtag == NULL)
2536 			return;
2537 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2538 		m_tag_prepend(m, mtag);
2539 	}
2540 
2541 #endif
2542 	m->m_flags |= M_VLANTAG;
2543 
2544 	/*
2545 	 * Remove the 802.1q header by copying the Ethernet
2546 	 * addresses over it and adjusting the beginning of
2547 	 * the data in the mbuf.  The encapsulated Ethernet
2548 	 * type field is already in place.
2549 	 */
2550 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2551 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2552 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2553 }
2554 
2555 static inline void
2556 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2557 		 uint32_t csum, int lro)
2558 {
2559 	mxge_softc_t *sc;
2560 	struct ifnet *ifp;
2561 	struct mbuf *m;
2562 	struct ether_header *eh;
2563 	mxge_rx_ring_t *rx;
2564 	bus_dmamap_t old_map;
2565 	int idx;
2566 
2567 	sc = ss->sc;
2568 	ifp = sc->ifp;
2569 	rx = &ss->rx_big;
2570 	idx = rx->cnt & rx->mask;
2571 	rx->cnt += rx->nbufs;
2572 	/* save a pointer to the received mbuf */
2573 	m = rx->info[idx].m;
2574 	/* try to replace the received mbuf */
2575 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2576 		/* drop the frame -- the old mbuf is re-cycled */
2577 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2578 		return;
2579 	}
2580 
2581 	/* unmap the received buffer */
2582 	old_map = rx->info[idx].map;
2583 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2584 	bus_dmamap_unload(rx->dmat, old_map);
2585 
2586 	/* swap the bus_dmamap_t's */
2587 	rx->info[idx].map = rx->extra_map;
2588 	rx->extra_map = old_map;
2589 
2590 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2591 	 * aligned */
2592 	m->m_data += MXGEFW_PAD;
2593 
2594 	m->m_pkthdr.rcvif = ifp;
2595 	m->m_len = m->m_pkthdr.len = len;
2596 	ss->ipackets++;
2597 	eh = mtod(m, struct ether_header *);
2598 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2599 		mxge_vlan_tag_remove(m, &csum);
2600 	}
2601 	/* flowid only valid if RSS hashing is enabled */
2602 	if (sc->num_slices > 1) {
2603 		m->m_pkthdr.flowid = (ss - sc->ss);
2604 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2605 	}
2606 	/* if the checksum is valid, mark it in the mbuf header */
2607 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2608 	    (0 == mxge_rx_csum(m, csum))) {
2609 		/* Tell the stack that the checksum is good */
2610 		m->m_pkthdr.csum_data = 0xffff;
2611 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2612 			CSUM_DATA_VALID;
2613 
2614 #if defined(INET) || defined (INET6)
2615 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2616 			return;
2617 #endif
2618 	}
2619 	/* pass the frame up the stack */
2620 	(*ifp->if_input)(ifp, m);
2621 }
2622 
2623 static inline void
2624 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2625 		   uint32_t csum, int lro)
2626 {
2627 	mxge_softc_t *sc;
2628 	struct ifnet *ifp;
2629 	struct ether_header *eh;
2630 	struct mbuf *m;
2631 	mxge_rx_ring_t *rx;
2632 	bus_dmamap_t old_map;
2633 	int idx;
2634 
2635 	sc = ss->sc;
2636 	ifp = sc->ifp;
2637 	rx = &ss->rx_small;
2638 	idx = rx->cnt & rx->mask;
2639 	rx->cnt++;
2640 	/* save a pointer to the received mbuf */
2641 	m = rx->info[idx].m;
2642 	/* try to replace the received mbuf */
2643 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2644 		/* drop the frame -- the old mbuf is re-cycled */
2645 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2646 		return;
2647 	}
2648 
2649 	/* unmap the received buffer */
2650 	old_map = rx->info[idx].map;
2651 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2652 	bus_dmamap_unload(rx->dmat, old_map);
2653 
2654 	/* swap the bus_dmamap_t's */
2655 	rx->info[idx].map = rx->extra_map;
2656 	rx->extra_map = old_map;
2657 
2658 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2659 	 * aligned */
2660 	m->m_data += MXGEFW_PAD;
2661 
2662 	m->m_pkthdr.rcvif = ifp;
2663 	m->m_len = m->m_pkthdr.len = len;
2664 	ss->ipackets++;
2665 	eh = mtod(m, struct ether_header *);
2666 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2667 		mxge_vlan_tag_remove(m, &csum);
2668 	}
2669 	/* flowid only valid if RSS hashing is enabled */
2670 	if (sc->num_slices > 1) {
2671 		m->m_pkthdr.flowid = (ss - sc->ss);
2672 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2673 	}
2674 	/* if the checksum is valid, mark it in the mbuf header */
2675 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2676 	    (0 == mxge_rx_csum(m, csum))) {
2677 		/* Tell the stack that the checksum is good */
2678 		m->m_pkthdr.csum_data = 0xffff;
2679 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2680 			CSUM_DATA_VALID;
2681 
2682 #if defined(INET) || defined (INET6)
2683 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2684 			return;
2685 #endif
2686 	}
2687 	/* pass the frame up the stack */
2688 	(*ifp->if_input)(ifp, m);
2689 }
2690 
2691 static inline void
2692 mxge_clean_rx_done(struct mxge_slice_state *ss)
2693 {
2694 	mxge_rx_done_t *rx_done = &ss->rx_done;
2695 	int limit = 0;
2696 	uint16_t length;
2697 	uint16_t checksum;
2698 	int lro;
2699 
2700 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
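	/* a nonzero length is the "entry ready" flag: the firmware
	   DMAs the length in when a frame completes, and we zero it
	   below once the entry has been consumed */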
2701 	while (rx_done->entry[rx_done->idx].length != 0) {
2702 		length = ntohs(rx_done->entry[rx_done->idx].length);
2703 		rx_done->entry[rx_done->idx].length = 0;
2704 		checksum = rx_done->entry[rx_done->idx].checksum;
2705 		if (length <= (MHLEN - MXGEFW_PAD))
2706 			mxge_rx_done_small(ss, length, checksum, lro);
2707 		else
2708 			mxge_rx_done_big(ss, length, checksum, lro);
2709 		rx_done->cnt++;
2710 		rx_done->idx = rx_done->cnt & rx_done->mask;
2711 
2712 		/* limit potential for livelock */
2713 		if (__predict_false(++limit > rx_done->mask / 2))
2714 			break;
2715 	}
2716 #if defined(INET)  || defined (INET6)
2717 	tcp_lro_flush_all(&ss->lc);
2718 #endif
2719 }
2720 
2721 static inline void
2722 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2723 {
2724 	struct ifnet *ifp __unused;
2725 	mxge_tx_ring_t *tx;
2726 	struct mbuf *m;
2727 	bus_dmamap_t map;
2728 	int idx;
2729 	int *flags;
2730 
2731 	tx = &ss->tx;
2732 	ifp = ss->sc->ifp;
2733 	while (tx->pkt_done != mcp_idx) {
2734 		idx = tx->done & tx->mask;
2735 		tx->done++;
2736 		m = tx->info[idx].m;
2737 		/* mbuf and DMA map only attached to the first
2738 		   segment per-mbuf */
2739 		if (m != NULL) {
2740 			ss->obytes += m->m_pkthdr.len;
2741 			if (m->m_flags & M_MCAST)
2742 				ss->omcasts++;
2743 			ss->opackets++;
2744 			tx->info[idx].m = NULL;
2745 			map = tx->info[idx].map;
2746 			bus_dmamap_unload(tx->dmat, map);
2747 			m_freem(m);
2748 		}
2749 		if (tx->info[idx].flag) {
2750 			tx->info[idx].flag = 0;
2751 			tx->pkt_done++;
2752 		}
2753 	}
2754 
2755 	/* If we have space (under a quarter of the ring in flight), clear
2756 	   IFF_DRV_OACTIVE to tell the stack that it's OK to send packets */
2757 	flags = &ss->if_drv_flags;
2758 
2759 	mtx_lock(&ss->tx.mtx);
2760 	if ((*flags) & IFF_DRV_OACTIVE &&
2761 	    tx->req - tx->done < (tx->mask + 1)/4) {
2762 		*(flags) &= ~IFF_DRV_OACTIVE;
2763 		ss->tx.wake++;
2764 		mxge_start_locked(ss);
2765 	}
2766 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2767 		/* let the NIC stop polling this queue, since there
2768 		 * are no more transmits pending */
2770 		*tx->send_stop = 1;
2771 		tx->queue_active = 0;
2772 		tx->deactivate++;
2773 		wmb();
2775 	}
2776 	mtx_unlock(&ss->tx.mtx);
2777 }
2778 
2779 static struct mxge_media_type mxge_xfp_media_types[] =
2780 {
2781 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2782 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2783 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2784 	{0,		(1 << 5),	"10GBASE-ER"},
2785 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2786 	{0,		(1 << 3),	"10GBASE-SW"},
2787 	{0,		(1 << 2),	"10GBASE-LW"},
2788 	{0,		(1 << 1),	"10GBASE-EW"},
2789 	{0,		(1 << 0),	"Reserved"}
2790 };
2791 static struct mxge_media_type mxge_sfp_media_types[] =
2792 {
2793 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2794 	{0,		(1 << 7),	"Reserved"},
2795 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2796 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2797 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2798 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2799 };
2800 
2801 static void
2802 mxge_media_set(mxge_softc_t *sc, int media_type)
2803 {
2804 
2805 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2806 		    0, NULL);
2807 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2808 	sc->current_media = media_type;
2809 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2810 }
2811 
2812 static void
2813 mxge_media_init(mxge_softc_t *sc)
2814 {
2815 	char *ptr;
2816 	int i;
2817 
2818 	ifmedia_removeall(&sc->media);
2819 	mxge_media_set(sc, IFM_AUTO);
2820 
2821 	/*
2822 	 * parse the product code to determine the interface type
2823 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2824 	 * after the 3rd dash in the driver's cached copy of the
2825 	 * EEPROM's product code string.
2826 	 */
2827 	ptr = sc->product_code_string;
2828 	if (ptr == NULL) {
2829 		device_printf(sc->dev, "Missing product code\n");
2830 		return;
2831 	}
2832 
2833 	for (i = 0; i < 3; i++, ptr++) {
2834 		ptr = strchr(ptr, '-');
2835 		if (ptr == NULL) {
2836 			device_printf(sc->dev,
2837 				      "only %d dashes in PC?!?\n", i);
2838 			return;
2839 		}
2840 	}
2841 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2842 		/* -C is CX4 */
2843 		sc->connector = MXGE_CX4;
2844 		mxge_media_set(sc, IFM_10G_CX4);
2845 	} else if (*ptr == 'Q') {
2846 		/* -Q is Quad Ribbon Fiber */
2847 		sc->connector = MXGE_QRF;
2848 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2849 		/* FreeBSD has no media type for Quad ribbon fiber */
2850 	} else if (*ptr == 'R') {
2851 		/* -R is XFP */
2852 		sc->connector = MXGE_XFP;
2853 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2854 		/* -S or -2S is SFP+ */
2855 		sc->connector = MXGE_SFP;
2856 	} else {
2857 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2858 	}
2859 }
2860 
2861 /*
2862  * Determine the media type for a NIC.  Some XFPs will identify
2863  * themselves only when their link is up, so this is initiated via a
2864  * link up interrupt.  However, this can potentially take up to
2865  * several milliseconds, so it is run via the watchdog routine, rather
2866  * than in the interrupt handler itself.
2867  */
2868 static void
2869 mxge_media_probe(mxge_softc_t *sc)
2870 {
2871 	mxge_cmd_t cmd;
2872 	char *cage_type;
2873 
2874 	struct mxge_media_type *mxge_media_types = NULL;
2875 	int i, err, ms, mxge_media_type_entries;
2876 	uint32_t byte;
2877 
2878 	sc->need_media_probe = 0;
2879 
2880 	if (sc->connector == MXGE_XFP) {
2881 		/* -R is XFP */
2882 		mxge_media_types = mxge_xfp_media_types;
2883 		mxge_media_type_entries =
2884 			nitems(mxge_xfp_media_types);
2885 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2886 		cage_type = "XFP";
2887 	} else if (sc->connector == MXGE_SFP) {
2888 		/* -S or -2S is SFP+ */
2889 		mxge_media_types = mxge_sfp_media_types;
2890 		mxge_media_type_entries =
2891 			nitems(mxge_sfp_media_types);
2892 		cage_type = "SFP+";
2893 		byte = 3;
2894 	} else {
2895 		/* nothing to do; media type cannot change */
2896 		return;
2897 	}
2898 
2899 	/*
2900 	 * At this point we know the NIC has a module cage, so now we
2901 	 * try to determine what is in the cage by using the
2902 	 * firmware's I2C commands to read the module's 10GbE compliance
2903 	 * register.  We read just one byte, which may take over
2904 	 * a millisecond
2905 	 */
2906 
2907 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2908 	cmd.data1 = byte;
2909 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2910 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2911 		device_printf(sc->dev, "failed to read XFP\n");
2912 	}
2913 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2914 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2915 	}
2916 	if (err != MXGEFW_CMD_OK) {
2917 		return;
2918 	}
2919 
2920 	/* now we wait for the data to be cached */
2921 	cmd.data0 = byte;
2922 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2923 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2924 		DELAY(1000);
2925 		cmd.data0 = byte;
2926 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2927 	}
2928 	if (err != MXGEFW_CMD_OK) {
2929 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2930 			      cage_type, err, ms);
2931 		return;
2932 	}
2933 
2934 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2935 		if (mxge_verbose)
2936 			device_printf(sc->dev, "%s:%s\n", cage_type,
2937 				      mxge_media_types[0].name);
2938 		if (sc->current_media != mxge_media_types[0].flag) {
2939 			mxge_media_init(sc);
2940 			mxge_media_set(sc, mxge_media_types[0].flag);
2941 		}
2942 		return;
2943 	}
2944 	for (i = 1; i < mxge_media_type_entries; i++) {
2945 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2946 			if (mxge_verbose)
2947 				device_printf(sc->dev, "%s:%s\n",
2948 					      cage_type,
2949 					      mxge_media_types[i].name);
2950 
2951 			if (sc->current_media != mxge_media_types[i].flag) {
2952 				mxge_media_init(sc);
2953 				mxge_media_set(sc, mxge_media_types[i].flag);
2954 			}
2955 			return;
2956 		}
2957 	}
2958 	if (mxge_verbose)
2959 		device_printf(sc->dev, "%s media 0x%x unknown\n",
2960 			      cage_type, cmd.data0);
2961 
2962 	return;
2963 }
2964 
2965 static void
2966 mxge_intr(void *arg)
2967 {
2968 	struct mxge_slice_state *ss = arg;
2969 	mxge_softc_t *sc = ss->sc;
2970 	mcp_irq_data_t *stats = ss->fw_stats;
2971 	mxge_tx_ring_t *tx = &ss->tx;
2972 	mxge_rx_done_t *rx_done = &ss->rx_done;
2973 	uint32_t send_done_count;
2974 	uint8_t valid;
2975 
2976 	/* make sure the DMA has finished */
2977 	if (!stats->valid) {
2978 		return;
2979 	}
2980 	valid = stats->valid;
2981 
2982 	if (sc->legacy_irq) {
2983 		/* lower legacy IRQ  */
2984 		*sc->irq_deassert = 0;
2985 		if (!mxge_deassert_wait)
2986 			/* don't wait for conf. that irq is low */
2987 			stats->valid = 0;
2988 	} else {
2989 		stats->valid = 0;
2990 	}
2991 
2992 	/* loop while waiting for legacy irq deassertion */
2993 	do {
2994 		/* check for transmit completes and receives */
2995 		send_done_count = be32toh(stats->send_done_count);
2996 		while ((send_done_count != tx->pkt_done) ||
2997 		       (rx_done->entry[rx_done->idx].length != 0)) {
2998 			if (send_done_count != tx->pkt_done)
2999 				mxge_tx_done(ss, (int)send_done_count);
3000 			mxge_clean_rx_done(ss);
3001 			send_done_count = be32toh(stats->send_done_count);
3002 		}
3003 		if (sc->legacy_irq && mxge_deassert_wait)
3004 			wmb();
3005 	} while (*((volatile uint8_t *) &stats->valid));
3006 
3007 	/* fw link & error stats meaningful only on the first slice */
3008 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3009 		if (sc->link_state != stats->link_up) {
3010 			sc->link_state = stats->link_up;
3011 			if (sc->link_state) {
3012 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3013 				if (mxge_verbose)
3014 					device_printf(sc->dev, "link up\n");
3015 			} else {
3016 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3017 				if (mxge_verbose)
3018 					device_printf(sc->dev, "link down\n");
3019 			}
3020 			sc->need_media_probe = 1;
3021 		}
3022 		if (sc->rdma_tags_available !=
3023 		    be32toh(stats->rdma_tags_available)) {
3024 			sc->rdma_tags_available =
3025 				be32toh(stats->rdma_tags_available);
3026 			device_printf(sc->dev, "RDMA timed out! %d tags "
3027 				      "left\n", sc->rdma_tags_available);
3028 		}
3029 
3030 		if (stats->link_down) {
3031 			sc->down_cnt += stats->link_down;
3032 			sc->link_state = 0;
3033 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3034 		}
3035 	}
3036 
3037 	/* check to see if we have rx token to pass back */
3038 	if (valid & 0x1)
3039 	    *ss->irq_claim = be32toh(3);
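	/* the second token is always passed back so the NIC may
	   raise further interrupts */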
3040 	*(ss->irq_claim + 1) = be32toh(3);
3041 }
3042 
3043 static void
3044 mxge_init(void *arg)
3045 {
3046 	mxge_softc_t *sc = arg;
3047 	struct ifnet *ifp = sc->ifp;
3048 
3049 	mtx_lock(&sc->driver_mtx);
3050 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3051 		(void) mxge_open(sc);
3052 	mtx_unlock(&sc->driver_mtx);
3053 }
3054 
3055 static void
3056 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3057 {
3058 	int i;
3059 
3060 #if defined(INET) || defined(INET6)
3061 	tcp_lro_free(&ss->lc);
3062 #endif
3063 	for (i = 0; i <= ss->rx_big.mask; i++) {
3064 		if (ss->rx_big.info[i].m == NULL)
3065 			continue;
3066 		bus_dmamap_unload(ss->rx_big.dmat,
3067 				  ss->rx_big.info[i].map);
3068 		m_freem(ss->rx_big.info[i].m);
3069 		ss->rx_big.info[i].m = NULL;
3070 	}
3071 
3072 	for (i = 0; i <= ss->rx_small.mask; i++) {
3073 		if (ss->rx_small.info[i].m == NULL)
3074 			continue;
3075 		bus_dmamap_unload(ss->rx_small.dmat,
3076 				  ss->rx_small.info[i].map);
3077 		m_freem(ss->rx_small.info[i].m);
3078 		ss->rx_small.info[i].m = NULL;
3079 	}
3080 
3081 	/* transmit ring used only on the first slice */
3082 	if (ss->tx.info == NULL)
3083 		return;
3084 
3085 	for (i = 0; i <= ss->tx.mask; i++) {
3086 		ss->tx.info[i].flag = 0;
3087 		if (ss->tx.info[i].m == NULL)
3088 			continue;
3089 		bus_dmamap_unload(ss->tx.dmat,
3090 				  ss->tx.info[i].map);
3091 		m_freem(ss->tx.info[i].m);
3092 		ss->tx.info[i].m = NULL;
3093 	}
3094 }
3095 
3096 static void
3097 mxge_free_mbufs(mxge_softc_t *sc)
3098 {
3099 	int slice;
3100 
3101 	for (slice = 0; slice < sc->num_slices; slice++)
3102 		mxge_free_slice_mbufs(&sc->ss[slice]);
3103 }
3104 
3105 static void
3106 mxge_free_slice_rings(struct mxge_slice_state *ss)
3107 {
3108 	int i;
3109 
3110 	if (ss->rx_done.entry != NULL)
3111 		mxge_dma_free(&ss->rx_done.dma);
3112 	ss->rx_done.entry = NULL;
3113 
3114 	if (ss->tx.req_bytes != NULL)
3115 		free(ss->tx.req_bytes, M_DEVBUF);
3116 	ss->tx.req_bytes = NULL;
3117 
3118 	if (ss->tx.seg_list != NULL)
3119 		free(ss->tx.seg_list, M_DEVBUF);
3120 	ss->tx.seg_list = NULL;
3121 
3122 	if (ss->rx_small.shadow != NULL)
3123 		free(ss->rx_small.shadow, M_DEVBUF);
3124 	ss->rx_small.shadow = NULL;
3125 
3126 	if (ss->rx_big.shadow != NULL)
3127 		free(ss->rx_big.shadow, M_DEVBUF);
3128 	ss->rx_big.shadow = NULL;
3129 
3130 	if (ss->tx.info != NULL) {
3131 		if (ss->tx.dmat != NULL) {
3132 			for (i = 0; i <= ss->tx.mask; i++) {
3133 				bus_dmamap_destroy(ss->tx.dmat,
3134 						   ss->tx.info[i].map);
3135 			}
3136 			bus_dma_tag_destroy(ss->tx.dmat);
3137 		}
3138 		free(ss->tx.info, M_DEVBUF);
3139 	}
3140 	ss->tx.info = NULL;
3141 
3142 	if (ss->rx_small.info != NULL) {
3143 		if (ss->rx_small.dmat != NULL) {
3144 			for (i = 0; i <= ss->rx_small.mask; i++) {
3145 				bus_dmamap_destroy(ss->rx_small.dmat,
3146 						   ss->rx_small.info[i].map);
3147 			}
3148 			bus_dmamap_destroy(ss->rx_small.dmat,
3149 					   ss->rx_small.extra_map);
3150 			bus_dma_tag_destroy(ss->rx_small.dmat);
3151 		}
3152 		free(ss->rx_small.info, M_DEVBUF);
3153 	}
3154 	ss->rx_small.info = NULL;
3155 
3156 	if (ss->rx_big.info != NULL) {
3157 		if (ss->rx_big.dmat != NULL) {
3158 			for (i = 0; i <= ss->rx_big.mask; i++) {
3159 				bus_dmamap_destroy(ss->rx_big.dmat,
3160 						   ss->rx_big.info[i].map);
3161 			}
3162 			bus_dmamap_destroy(ss->rx_big.dmat,
3163 					   ss->rx_big.extra_map);
3164 			bus_dma_tag_destroy(ss->rx_big.dmat);
3165 		}
3166 		free(ss->rx_big.info, M_DEVBUF);
3167 	}
3168 	ss->rx_big.info = NULL;
3169 }
3170 
3171 static void
3172 mxge_free_rings(mxge_softc_t *sc)
3173 {
3174 	int slice;
3175 
3176 	for (slice = 0; slice < sc->num_slices; slice++)
3177 		mxge_free_slice_rings(&sc->ss[slice]);
3178 }
3179 
3180 static int
3181 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3182 		       int tx_ring_entries)
3183 {
3184 	mxge_softc_t *sc = ss->sc;
3185 	size_t bytes;
3186 	int err, i;
3187 
3188 	/* allocate per-slice receive resources */
3189 
3190 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3191 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3192 
3193 	/* allocate the rx shadow rings */
3194 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3195 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3196 
3197 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3198 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3199 
3200 	/* allocate the rx host info rings */
3201 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3202 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3203 
3204 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3205 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3206 
3207 	/* allocate the rx busdma resources */
3208 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3209 				 1,			/* alignment */
3210 				 4096,			/* boundary */
3211 				 BUS_SPACE_MAXADDR,	/* low */
3212 				 BUS_SPACE_MAXADDR,	/* high */
3213 				 NULL, NULL,		/* filter */
3214 				 MHLEN,			/* maxsize */
3215 				 1,			/* num segs */
3216 				 MHLEN,			/* maxsegsize */
3217 				 BUS_DMA_ALLOCNOW,	/* flags */
3218 				 NULL, NULL,		/* lock */
3219 				 &ss->rx_small.dmat);	/* tag */
3220 	if (err != 0) {
3221 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3222 			      err);
3223 		return err;
3224 	}
3225 
3226 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3227 				 1,			/* alignment */
3228 				 0,			/* boundary */
3229 				 BUS_SPACE_MAXADDR,	/* low */
3230 				 BUS_SPACE_MAXADDR,	/* high */
3231 				 NULL, NULL,		/* filter */
3232 				 3*4096,		/* maxsize */
3233 				 1,			/* num segs */
3234 				 MJUM9BYTES,		/* maxsegsize*/
3235 				 BUS_DMA_ALLOCNOW,	/* flags */
3236 				 NULL, NULL,		/* lock */
3237 				 &ss->rx_big.dmat);	/* tag */
3238 	if (err != 0) {
3239 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3240 			      err);
3241 		return err;
3242 	}
3243 	for (i = 0; i <= ss->rx_small.mask; i++) {
3244 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3245 					&ss->rx_small.info[i].map);
3246 		if (err != 0) {
3247 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3248 				      err);
3249 			return err;
3250 		}
3251 	}
3252 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3253 				&ss->rx_small.extra_map);
3254 	if (err != 0) {
3255 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3256 			      err);
3257 		return err;
3258 	}
3259 
3260 	for (i = 0; i <= ss->rx_big.mask; i++) {
3261 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3262 					&ss->rx_big.info[i].map);
3263 		if (err != 0) {
3264 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3265 				      err);
3266 			return err;
3267 		}
3268 	}
3269 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3270 				&ss->rx_big.extra_map);
3271 	if (err != 0) {
3272 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3273 			      err);
3274 		return err;
3275 	}
3276 
3277 	/* now allocate TX resources */
3278 
3279 	ss->tx.mask = tx_ring_entries - 1;
3280 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3281 
3282 	/* allocate the tx request copy block */
3283 	bytes = 8 +
3284 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3285 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3286 	/* ensure req_list entries are aligned to 8 bytes */
3287 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3288 		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
3289 
3290 	/* allocate the tx busdma segment list */
3291 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3292 	ss->tx.seg_list = (bus_dma_segment_t *)
3293 		malloc(bytes, M_DEVBUF, M_WAITOK);
3294 
3295 	/* allocate the tx host info ring */
3296 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3297 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3298 
3299 	/* allocate the tx busdma resources */
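	/* (maxsize allows a full 64KB TSO payload plus header slop;
	   segments must not cross sc->tx_boundary, which the firmware
	   requires of send descriptors) */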
3300 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3301 				 1,			/* alignment */
3302 				 sc->tx_boundary,	/* boundary */
3303 				 BUS_SPACE_MAXADDR,	/* low */
3304 				 BUS_SPACE_MAXADDR,	/* high */
3305 				 NULL, NULL,		/* filter */
3306 				 65536 + 256,		/* maxsize */
3307 				 ss->tx.max_desc - 2,	/* num segs */
3308 				 sc->tx_boundary,	/* maxsegsz */
3309 				 BUS_DMA_ALLOCNOW,	/* flags */
3310 				 NULL, NULL,		/* lock */
3311 				 &ss->tx.dmat);		/* tag */
3312 
3313 	if (err != 0) {
3314 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3315 			      err);
3316 		return err;
3317 	}
3318 
3319 	/* now use these tags to setup dmamaps for each slot
3320 	   in the ring */
3321 	for (i = 0; i <= ss->tx.mask; i++) {
3322 		err = bus_dmamap_create(ss->tx.dmat, 0,
3323 					&ss->tx.info[i].map);
3324 		if (err != 0) {
3325 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3326 				      err);
3327 			return err;
3328 		}
3329 	}
3330 	return 0;
3331 }
3333 
3334 static int
3335 mxge_alloc_rings(mxge_softc_t *sc)
3336 {
3337 	mxge_cmd_t cmd;
3338 	int tx_ring_size;
3339 	int tx_ring_entries, rx_ring_entries;
3340 	int err, slice;
3341 
3342 	/* get ring sizes */
3343 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3344 	tx_ring_size = cmd.data0;
3345 	if (err != 0) {
3346 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3347 		goto abort;
3348 	}
3349 
3350 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3351 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3352 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3353 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3354 	IFQ_SET_READY(&sc->ifp->if_snd);
3355 
3356 	for (slice = 0; slice < sc->num_slices; slice++) {
3357 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3358 					     rx_ring_entries,
3359 					     tx_ring_entries);
3360 		if (err != 0)
3361 			goto abort;
3362 	}
3363 	return 0;
3364 
3365 abort:
3366 	mxge_free_rings(sc);
3367 	return err;
3368 }
3370 
3371 static void
3372 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3373 {
3374 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3375 
3376 	if (bufsize < MCLBYTES) {
3377 		/* easy, everything fits in a single buffer */
3378 		*big_buf_size = MCLBYTES;
3379 		*cl_size = MCLBYTES;
3380 		*nbufs = 1;
3381 		return;
3382 	}
3383 
3384 	if (bufsize < MJUMPAGESIZE) {
3385 		/* still easy, everything still fits in a single buffer */
3386 		*big_buf_size = MJUMPAGESIZE;
3387 		*cl_size = MJUMPAGESIZE;
3388 		*nbufs = 1;
3389 		return;
3390 	}
3391 	*cl_size = MJUM9BYTES;
3392 	*big_buf_size = MJUM9BYTES;
3393 	*nbufs = 1;
3394 }
3395 
3396 static int
3397 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3398 {
3399 	mxge_softc_t *sc;
3400 	mxge_cmd_t cmd;
3401 	bus_dmamap_t map;
3402 	int err, i, slice;
3403 
3404 	sc = ss->sc;
3405 	slice = ss - sc->ss;
3406 
3407 #if defined(INET) || defined(INET6)
3408 	(void)tcp_lro_init(&ss->lc);
3409 #endif
3410 	ss->lc.ifp = sc->ifp;
3411 
3412 	/* get the lanai pointers to the send and receive rings */
3413 
3414 	err = 0;
3415 
3416 	cmd.data0 = slice;
3417 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3418 	ss->tx.lanai =
3419 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3420 	ss->tx.send_go = (volatile uint32_t *)
3421 		(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3422 	ss->tx.send_stop = (volatile uint32_t *)
3423 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3424 
3425 	cmd.data0 = slice;
3426 	err |= mxge_send_cmd(sc,
3427 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3428 	ss->rx_small.lanai =
3429 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3430 	cmd.data0 = slice;
3431 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3432 	ss->rx_big.lanai =
3433 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3434 
3435 	if (err != 0) {
3436 		device_printf(sc->dev,
3437 			      "failed to get ring sizes or locations\n");
3438 		return EIO;
3439 	}
3440 
3441 	/* stock receive rings */
3442 	for (i = 0; i <= ss->rx_small.mask; i++) {
3443 		map = ss->rx_small.info[i].map;
3444 		err = mxge_get_buf_small(ss, map, i);
3445 		if (err) {
3446 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3447 				      i, ss->rx_small.mask + 1);
3448 			return ENOMEM;
3449 		}
3450 	}
3451 	for (i = 0; i <= ss->rx_big.mask; i++) {
3452 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3453 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3454 	}
3455 	ss->rx_big.nbufs = nbufs;
3456 	ss->rx_big.cl_size = cl_size;
3457 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3458 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3459 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3460 		map = ss->rx_big.info[i].map;
3461 		err = mxge_get_buf_big(ss, map, i);
3462 		if (err) {
3463 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3464 				      i, ss->rx_big.mask + 1);
3465 			return ENOMEM;
3466 		}
3467 	}
3468 	return 0;
3469 }
3470 
3471 static int
3472 mxge_open(mxge_softc_t *sc)
3473 {
3474 	mxge_cmd_t cmd;
3475 	int err, big_bytes, nbufs, slice, cl_size, i;
3476 	bus_addr_t bus;
3477 	volatile uint8_t *itable;
3478 	struct mxge_slice_state *ss;
3479 
3480 	/* Copy the MAC address in case it was overridden */
3481 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3482 
3483 	err = mxge_reset(sc, 1);
3484 	if (err != 0) {
3485 		device_printf(sc->dev, "failed to reset\n");
3486 		return EIO;
3487 	}
3488 
3489 	if (sc->num_slices > 1) {
3490 		/* setup the indirection table */
3491 		cmd.data0 = sc->num_slices;
3492 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3493 				    &cmd);
3494 
3495 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3496 				     &cmd);
3497 		if (err != 0) {
3498 			device_printf(sc->dev,
3499 				      "failed to setup rss tables\n");
3500 			return err;
3501 		}
3502 
3503 		/* just enable an identity mapping */
3504 		itable = sc->sram + cmd.data0;
3505 		for (i = 0; i < sc->num_slices; i++)
3506 			itable[i] = (uint8_t)i;
3507 
3508 		cmd.data0 = 1;
3509 		cmd.data1 = mxge_rss_hash_type;
3510 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3511 		if (err != 0) {
3512 			device_printf(sc->dev, "failed to enable slices\n");
3513 			return err;
3514 		}
3515 	}
3516 
3517 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3518 
3519 	cmd.data0 = nbufs;
3520 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3521 			    &cmd);
3522 	/* error is only meaningful if we're trying to set
3523 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3524 	if (err && nbufs > 1) {
3525 		device_printf(sc->dev,
3526 			      "Failed to set always-use-n to %d\n",
3527 			      nbufs);
3528 		return EIO;
3529 	}
3530 	/* Give the firmware the mtu and the big and small buffer
3531 	   sizes.  The firmware wants the big buf size to be a power
3532 	   of two. Luckily, FreeBSD's clusters are powers of two */
3533 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3534 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3535 	cmd.data0 = MHLEN - MXGEFW_PAD;
3536 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3537 			     &cmd);
3538 	cmd.data0 = big_bytes;
3539 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3540 
3541 	if (err != 0) {
3542 		device_printf(sc->dev, "failed to setup params\n");
3543 		goto abort;
3544 	}
3545 
3546 	/* Now give the firmware the pointer to the stats block */
3547 	for (slice = 0; slice < sc->num_slices; slice++) {
3548 		ss = &sc->ss[slice];
3549 		cmd.data0 =
3550 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3551 		cmd.data1 =
3552 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3553 		cmd.data2 = sizeof(struct mcp_irq_data);
3554 		cmd.data2 |= (slice << 16);
3555 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3556 	}
3557 
3558 	if (err != 0) {
3559 		bus = sc->ss->fw_stats_dma.bus_addr;
3560 		bus += offsetof(struct mcp_irq_data, send_done_count);
3561 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3562 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3563 		err = mxge_send_cmd(sc,
3564 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3565 				    &cmd);
3566 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3567 		sc->fw_multicast_support = 0;
3568 	} else {
3569 		sc->fw_multicast_support = 1;
3570 	}
3571 
3572 	if (err != 0) {
3573 		device_printf(sc->dev, "failed to setup params\n");
3574 		goto abort;
3575 	}
3576 
3577 	for (slice = 0; slice < sc->num_slices; slice++) {
3578 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3579 		if (err != 0) {
3580 			device_printf(sc->dev, "couldn't open slice %d\n",
3581 				      slice);
3582 			goto abort;
3583 		}
3584 	}
3585 
3586 	/* Finally, start the firmware running */
3587 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3588 	if (err) {
3589 		device_printf(sc->dev, "Couldn't bring up link\n");
3590 		goto abort;
3591 	}
3592 	for (slice = 0; slice < sc->num_slices; slice++) {
3593 		ss = &sc->ss[slice];
3594 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3595 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3596 	}
3597 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3598 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3599 
3600 	return 0;
3601 
3602 abort:
3603 	mxge_free_mbufs(sc);
3604 
3605 	return err;
3606 }
3607 
3608 static int
3609 mxge_close(mxge_softc_t *sc, int down)
3610 {
3611 	mxge_cmd_t cmd;
3612 	int err, old_down_cnt;
3613 	struct mxge_slice_state *ss;
3614 	int slice;
3615 
3616 	for (slice = 0; slice < sc->num_slices; slice++) {
3617 		ss = &sc->ss[slice];
3618 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3619 	}
3620 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3621 	if (!down) {
3622 		old_down_cnt = sc->down_cnt;
3623 		wmb();
3624 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3625 		if (err) {
3626 			device_printf(sc->dev,
3627 				      "Couldn't bring down link\n");
3628 		}
3629 		if (old_down_cnt == sc->down_cnt) {
3630 			/* wait for down irq */
3631 			DELAY(10 * sc->intr_coal_delay);
3632 		}
3633 		wmb();
3634 		if (old_down_cnt == sc->down_cnt) {
3635 			device_printf(sc->dev, "never got down irq\n");
3636 		}
3637 	}
3638 	mxge_free_mbufs(sc);
3639 
3640 	return 0;
3641 }
3642 
3643 static void
3644 mxge_setup_cfg_space(mxge_softc_t *sc)
3645 {
3646 	device_t dev = sc->dev;
3647 	int reg;
3648 	uint16_t lnk, pectl;
3649 
3650 	/* find the PCIe link width and set max read request to 4KB */
3651 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3652 		lnk = pci_read_config(dev, reg + 0x12, 2);
3653 		sc->link_width = (lnk >> 4) & 0x3f;
3654 
3655 		if (sc->pectl == 0) {
3656 			pectl = pci_read_config(dev, reg + 0x8, 2);
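			/* bits 14:12 of the PCIe Device Control register
			   encode Max_Read_Request_Size; 5 => 4096 bytes */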
3657 			pectl = (pectl & ~0x7000) | (5 << 12);
3658 			pci_write_config(dev, reg + 0x8, pectl, 2);
3659 			sc->pectl = pectl;
3660 		} else {
3661 			/* restore saved pectl after watchdog reset */
3662 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3663 		}
3664 	}
3665 
3666 	/* Enable DMA and Memory space access */
3667 	pci_enable_busmaster(dev);
3668 }
3669 
3670 static uint32_t
3671 mxge_read_reboot(mxge_softc_t *sc)
3672 {
3673 	device_t dev = sc->dev;
3674 	uint32_t vs;
3675 
3676 	/* find the vendor specific offset */
3677 	if (pci_find_cap(dev, PCIY_VENDOR, (int *)&vs) != 0) {
3678 		device_printf(sc->dev,
3679 			      "could not find vendor specific offset\n");
3680 		return (uint32_t)-1;
3681 	}
3682 	/* enable read32 mode */
3683 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3684 	/* tell NIC which register to read */
3685 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3686 	return (pci_read_config(dev, vs + 0x14, 4));
3687 }
3688 
3689 static void
3690 mxge_watchdog_reset(mxge_softc_t *sc)
3691 {
3692 	struct pci_devinfo *dinfo;
3693 	struct mxge_slice_state *ss;
3694 	int err, running, s, num_tx_slices = 1;
3695 	uint32_t reboot;
3696 	uint16_t cmd;
3697 
3698 	err = ENXIO;
3699 
3700 	device_printf(sc->dev, "Watchdog reset!\n");
3701 
3702 	/*
3703 	 * check to see if the NIC rebooted.  If it did, then all of
3704 	 * PCI config space has been reset, and things like the
3705 	 * busmaster bit will be zero.  If this is the case, then we
3706 	 * must restore PCI config space before the NIC can be used
3707 	 * again
3708 	 */
3709 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3710 	if (cmd == 0xffff) {
3711 		/*
3712 		 * maybe the watchdog caught the NIC rebooting; wait
3713 		 * up to 100ms for it to finish.  If it does not come
3714 		 * back, then give up
3715 		 */
3716 		DELAY(1000*100);
3717 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3718 		if (cmd == 0xffff) {
3719 			device_printf(sc->dev, "NIC disappeared!\n");
3720 		}
3721 	}
3722 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3723 		/* print the reboot status */
3724 		reboot = mxge_read_reboot(sc);
3725 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3726 			      reboot);
3727 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3728 		if (running) {
3729 			/*
3730 			 * quiesce NIC so that TX routines will not try to
3731 			 * xmit after restoration of BAR
3732 			 */
3733 
3734 			/* Mark the link as down */
3735 			if (sc->link_state) {
3736 				sc->link_state = 0;
3737 				if_link_state_change(sc->ifp,
3738 						     LINK_STATE_DOWN);
3739 			}
3740 
3741 			num_tx_slices = sc->num_slices;
3742 
3743 			/* grab all TX locks to ensure no tx is in flight */
3744 			for (s = 0; s < num_tx_slices; s++) {
3745 				ss = &sc->ss[s];
3746 				mtx_lock(&ss->tx.mtx);
3747 			}
3748 			mxge_close(sc, 1);
3749 		}
3750 		/* restore PCI configuration space */
3751 		dinfo = device_get_ivars(sc->dev);
3752 		pci_cfg_restore(sc->dev, dinfo);
3753 
3754 		/* and redo any changes we made to our config space */
3755 		mxge_setup_cfg_space(sc);
3756 
3757 		/* reload f/w */
3758 		err = mxge_load_firmware(sc, 0);
3759 		if (err) {
3760 			device_printf(sc->dev,
3761 				      "Unable to re-load f/w\n");
3762 		}
3763 		if (running) {
3764 			if (!err)
3765 				err = mxge_open(sc);
3766 			/* release all TX locks */
3767 			for (s = 0; s < num_tx_slices; s++) {
3768 				ss = &sc->ss[s];
3769 				mxge_start_locked(ss);
3770 				mtx_unlock(&ss->tx.mtx);
3771 			}
3772 		}
3773 		sc->watchdog_resets++;
3774 	} else {
3775 		device_printf(sc->dev,
3776 			      "NIC did not reboot, not resetting\n");
3777 		err = 0;
3778 	}
3779 	if (err) {
3780 		device_printf(sc->dev, "watchdog reset failed\n");
3781 	} else {
3782 		if (sc->dying == 2)
3783 			sc->dying = 0;
3784 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3785 	}
3786 }
3787 
3788 static void
3789 mxge_watchdog_task(void *arg, int pending)
3790 {
3791 	mxge_softc_t *sc = arg;
3792 
3793 	mtx_lock(&sc->driver_mtx);
3794 	mxge_watchdog_reset(sc);
3795 	mtx_unlock(&sc->driver_mtx);
3796 }
3797 
3798 static void
3799 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3800 {
3801 	tx = &sc->ss[slice].tx;
3802 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3803 	device_printf(sc->dev,
3804 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3805 		      tx->req, tx->done, tx->queue_active);
3806 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3807 		      tx->activate, tx->deactivate);
3808 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3809 		      tx->pkt_done,
3810 		      be32toh(sc->ss->fw_stats->send_done_count));
3811 }
3812 
3813 static int
3814 mxge_watchdog(mxge_softc_t *sc)
3815 {
3816 	mxge_tx_ring_t *tx;
3817 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3818 	int i, err = 0;
3819 
3820 	/* see if we have outstanding transmits, which
3821 	   have been pending for more than mxge_ticks */
3822 	for (i = 0; (i < sc->num_slices) && (err == 0); i++) {
3823 		tx = &sc->ss[i].tx;
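		/*
		 * stuck means: sends are outstanding now (req != done),
		 * sends were also outstanding at the previous tick
		 * (watchdog_req != watchdog_done), and the firmware has
		 * completed none of them since (done == watchdog_done)
		 */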
3824 		if (tx->req != tx->done &&
3825 		    tx->watchdog_req != tx->watchdog_done &&
3826 		    tx->done == tx->watchdog_done) {
3827 			/* check for pause blocking before resetting */
3828 			if (tx->watchdog_rx_pause == rx_pause) {
3829 				mxge_warn_stuck(sc, tx, i);
3830 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3831 				return (ENXIO);
3832 			}
3833 			else
3834 				device_printf(sc->dev, "Flow control blocking "
3835 					      "xmits, check link partner\n");
3836 		}
3837 
3838 		tx->watchdog_req = tx->req;
3839 		tx->watchdog_done = tx->done;
3840 		tx->watchdog_rx_pause = rx_pause;
3841 	}
3842 
3843 	if (sc->need_media_probe)
3844 		mxge_media_probe(sc);
3845 	return (err);
3846 }
3847 
3848 static uint64_t
3849 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
3850 {
3851 	struct mxge_softc *sc;
3852 	uint64_t rv;
3853 
3854 	sc = if_getsoftc(ifp);
3855 	rv = 0;
3856 
3857 	switch (cnt) {
3858 	case IFCOUNTER_IPACKETS:
3859 		for (int s = 0; s < sc->num_slices; s++)
3860 			rv += sc->ss[s].ipackets;
3861 		return (rv);
3862 	case IFCOUNTER_OPACKETS:
3863 		for (int s = 0; s < sc->num_slices; s++)
3864 			rv += sc->ss[s].opackets;
3865 		return (rv);
3866 	case IFCOUNTER_OERRORS:
3867 		for (int s = 0; s < sc->num_slices; s++)
3868 			rv += sc->ss[s].oerrors;
3869 		return (rv);
3870 	case IFCOUNTER_OBYTES:
3871 		for (int s = 0; s < sc->num_slices; s++)
3872 			rv += sc->ss[s].obytes;
3873 		return (rv);
3874 	case IFCOUNTER_OMCASTS:
3875 		for (int s = 0; s < sc->num_slices; s++)
3876 			rv += sc->ss[s].omcasts;
3877 		return (rv);
3878 	case IFCOUNTER_OQDROPS:
3879 		for (int s = 0; s < sc->num_slices; s++)
3880 			rv += sc->ss[s].tx.br->br_drops;
3881 		return (rv);
3882 	default:
3883 		return (if_get_counter_default(ifp, cnt));
3884 	}
3885 }
3886 
3887 static void
3888 mxge_tick(void *arg)
3889 {
3890 	mxge_softc_t *sc = arg;
3891 	u_long pkts = 0;
3892 	int err = 0;
3893 	int running, ticks;
3894 	uint16_t cmd;
3895 
3896 	ticks = mxge_ticks;
3897 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3898 	if (running) {
3899 		if (!sc->watchdog_countdown) {
3900 			err = mxge_watchdog(sc);
3901 			sc->watchdog_countdown = 4;
3902 		}
3903 		sc->watchdog_countdown--;
3904 	}
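	/*
	 * XXX: pkts is never updated here, so the idle check below
	 * always fires and the timer is always stretched 4x
	 */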
3905 	if (pkts == 0) {
3906 		/* ensure NIC did not suffer h/w fault while idle */
3907 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3908 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3909 			sc->dying = 2;
3910 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3911 			err = ENXIO;
3912 		}
3913 		/* look less often if NIC is idle */
3914 		ticks *= 4;
3915 	}
3916 
3917 	if (err == 0)
3918 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3919 
3920 }
3921 
3922 static int
3923 mxge_media_change(struct ifnet *ifp)
3924 {
3925 	return EINVAL;
3926 }
3927 
3928 static int
3929 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3930 {
3931 	struct ifnet *ifp = sc->ifp;
3932 	int real_mtu, old_mtu;
3933 	int err = 0;
3934 
3935 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
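	/* 60 == ETHER_MIN_LEN - ETHER_CRC_LEN, the minimum frame w/o CRC */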
3936 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3937 		return EINVAL;
3938 	mtx_lock(&sc->driver_mtx);
3939 	old_mtu = ifp->if_mtu;
3940 	ifp->if_mtu = mtu;
3941 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3942 		mxge_close(sc, 0);
3943 		err = mxge_open(sc);
3944 		if (err != 0) {
3945 			ifp->if_mtu = old_mtu;
3946 			mxge_close(sc, 0);
3947 			(void) mxge_open(sc);
3948 		}
3949 	}
3950 	mtx_unlock(&sc->driver_mtx);
3951 	return err;
3952 }
3953 
3954 static void
3955 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3956 {
3957 	mxge_softc_t *sc = ifp->if_softc;
3958 
3959 	if (sc == NULL)
3960 		return;
3961 	ifmr->ifm_status = IFM_AVALID;
3962 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3963 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3964 	ifmr->ifm_active |= sc->current_media;
3965 }
3966 
3967 static int
3968 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
3969 {
3970 	mxge_cmd_t cmd;
3971 	uint32_t i2c_args;
3972 	int i, ms, err;
3973 
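	/* 0xA0/0xA2 are the standard module EEPROM and diagnostics
	   i2c addresses (see SFF-8472) */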
3974 	if (i2c->dev_addr != 0xA0 &&
3975 	    i2c->dev_addr != 0xA2)
3976 		return (EINVAL);
3977 	if (i2c->len > sizeof(i2c->data))
3978 		return (EINVAL);
3979 
3980 	for (i = 0; i < i2c->len; i++) {
3981 		i2c_args = i2c->dev_addr << 0x8;
3982 		i2c_args |= i2c->offset + i;
3983 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3984 		cmd.data1 = i2c_args;
3985 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3986 
3987 		if (err != MXGEFW_CMD_OK)
3988 			return (EIO);
3989 		/* now we wait for the data to be cached */
3990 		cmd.data0 = i2c_args & 0xff;
3991 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3992 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3993 			cmd.data0 = i2c_args & 0xff;
3994 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3995 			if (err == EBUSY)
3996 				DELAY(1000);
3997 		}
3998 		if (err != MXGEFW_CMD_OK)
3999 			return (EIO);
4000 		i2c->data[i] = cmd.data0;
4001 	}
4002 	return (0);
4003 }
4004 
4005 static int
4006 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4007 {
4008 	mxge_softc_t *sc = ifp->if_softc;
4009 	struct ifreq *ifr = (struct ifreq *)data;
4010 	struct ifi2creq i2c;
4011 	int err, mask;
4012 
4013 	err = 0;
4014 	switch (command) {
4015 	case SIOCSIFMTU:
4016 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4017 		break;
4018 
4019 	case SIOCSIFFLAGS:
4020 		mtx_lock(&sc->driver_mtx);
4021 		if (sc->dying) {
4022 			mtx_unlock(&sc->driver_mtx);
4023 			return EINVAL;
4024 		}
4025 		if (ifp->if_flags & IFF_UP) {
4026 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4027 				err = mxge_open(sc);
4028 			} else {
4029 				/* take care of promisc and allmulti
4030 				   flag changes */
4031 				mxge_change_promisc(sc,
4032 						    ifp->if_flags & IFF_PROMISC);
4033 				mxge_set_multicast_list(sc);
4034 			}
4035 		} else {
4036 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4037 				mxge_close(sc, 0);
4038 			}
4039 		}
4040 		mtx_unlock(&sc->driver_mtx);
4041 		break;
4042 
4043 	case SIOCADDMULTI:
4044 	case SIOCDELMULTI:
4045 		mtx_lock(&sc->driver_mtx);
4046 		if (sc->dying) {
4047 			mtx_unlock(&sc->driver_mtx);
4048 			return (EINVAL);
4049 		}
4050 		mxge_set_multicast_list(sc);
4051 		mtx_unlock(&sc->driver_mtx);
4052 		break;
4053 
4054 	case SIOCSIFCAP:
4055 		mtx_lock(&sc->driver_mtx);
4056 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4057 		if (mask & IFCAP_TXCSUM) {
4058 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4059 				mask &= ~IFCAP_TSO4;
4060 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4061 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4062 			} else {
4063 				ifp->if_capenable |= IFCAP_TXCSUM;
4064 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4065 			}
4066 		}
4067 		if (mask & IFCAP_RXCSUM) {
4068 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4069 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4070 			} else {
4071 				ifp->if_capenable |= IFCAP_RXCSUM;
4072 			}
4073 		}
4074 		if (mask & IFCAP_TSO4) {
4075 			if (IFCAP_TSO4 & ifp->if_capenable) {
4076 				ifp->if_capenable &= ~IFCAP_TSO4;
4077 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4078 				ifp->if_capenable |= IFCAP_TSO4;
4079 				ifp->if_hwassist |= CSUM_TSO;
4080 			} else {
4081 				printf("mxge requires tx checksum offload"
4082 				       " be enabled to use TSO\n");
4083 				err = EINVAL;
4084 			}
4085 		}
4086 #if IFCAP_TSO6
4087 		if (mask & IFCAP_TXCSUM_IPV6) {
4088 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4089 				mask &= ~IFCAP_TSO6;
4090 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4091 						       | IFCAP_TSO6);
4092 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4093 						      | CSUM_UDP_IPV6);
4094 			} else {
4095 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4096 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4097 						     | CSUM_UDP_IPV6);
4098 			}
4099 		}
4100 		if (mask & IFCAP_RXCSUM_IPV6) {
4101 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4102 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4103 			} else {
4104 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4105 			}
4106 		}
4107 		if (mask & IFCAP_TSO6) {
4108 			if (IFCAP_TSO6 & ifp->if_capenable) {
4109 				ifp->if_capenable &= ~IFCAP_TSO6;
4110 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4111 				ifp->if_capenable |= IFCAP_TSO6;
4112 				ifp->if_hwassist |= CSUM_TSO;
4113 			} else {
4114 				printf("mxge requires tx checksum offload"
4115 				       " be enabled to use TSO\n");
4116 				err = EINVAL;
4117 			}
4118 		}
4119 #endif /* IFCAP_TSO6 */
4120 
4121 		if (mask & IFCAP_LRO)
4122 			ifp->if_capenable ^= IFCAP_LRO;
4123 		if (mask & IFCAP_VLAN_HWTAGGING)
4124 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4125 		if (mask & IFCAP_VLAN_HWTSO)
4126 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4127 
4128 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4129 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4130 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4131 
4132 		mtx_unlock(&sc->driver_mtx);
4133 		VLAN_CAPABILITIES(ifp);
4134 
4135 		break;
4136 
4137 	case SIOCGIFMEDIA:
4138 		mtx_lock(&sc->driver_mtx);
4139 		if (sc->dying) {
4140 			mtx_unlock(&sc->driver_mtx);
4141 			return (EINVAL);
4142 		}
4143 		mxge_media_probe(sc);
4144 		mtx_unlock(&sc->driver_mtx);
4145 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4146 				    &sc->media, command);
4147 		break;
4148 
4149 	case SIOCGI2C:
4150 		if (sc->connector != MXGE_XFP &&
4151 		    sc->connector != MXGE_SFP) {
4152 			err = ENXIO;
4153 			break;
4154 		}
4155 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4156 		if (err != 0)
4157 			break;
4158 		mtx_lock(&sc->driver_mtx);
4159 		if (sc->dying) {
4160 			mtx_unlock(&sc->driver_mtx);
4161 			return (EINVAL);
4162 		}
4163 		err = mxge_fetch_i2c(sc, &i2c);
4164 		mtx_unlock(&sc->driver_mtx);
4165 		if (err == 0)
4166 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4167 			    sizeof(i2c));
4168 		break;
4169 	default:
4170 		err = ether_ioctl(ifp, command, data);
4171 		break;
4172 	}
4173 	return err;
4174 }
4175 
4176 static void
4177 mxge_fetch_tunables(mxge_softc_t *sc)
4178 {
4179 
4180 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4181 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4182 			  &mxge_flow_control);
4183 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4184 			  &mxge_intr_coal_delay);
4185 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4186 			  &mxge_nvidia_ecrc_enable);
4187 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4188 			  &mxge_force_firmware);
4189 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4190 			  &mxge_deassert_wait);
4191 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4192 			  &mxge_verbose);
4193 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4194 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
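	/* accept both spellings of the RSS hash-type tunable */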
4195 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4196 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4197 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4198 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4199 
4200 	if (bootverbose)
4201 		mxge_verbose = 1;
4202 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4203 		mxge_intr_coal_delay = 30;
4204 	if (mxge_ticks == 0)
4205 		mxge_ticks = hz / 2;
4206 	sc->pause = mxge_flow_control;
4207 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4208 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4209 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4210 	}
4211 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4212 	    mxge_initial_mtu < ETHER_MIN_LEN)
4213 		mxge_initial_mtu = ETHERMTU_JUMBO;
4214 
4215 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4216 		mxge_throttle = MXGE_MAX_THROTTLE;
4217 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4218 		mxge_throttle = MXGE_MIN_THROTTLE;
4219 	sc->throttle = mxge_throttle;
4220 }
4221 
4222 static void
4223 mxge_free_slices(mxge_softc_t *sc)
4224 {
4225 	struct mxge_slice_state *ss;
4226 	int i;
4227 
4228 	if (sc->ss == NULL)
4229 		return;
4230 
4231 	for (i = 0; i < sc->num_slices; i++) {
4232 		ss = &sc->ss[i];
4233 		if (ss->fw_stats != NULL) {
4234 			mxge_dma_free(&ss->fw_stats_dma);
4235 			ss->fw_stats = NULL;
4236 			if (ss->tx.br != NULL) {
4237 				drbr_free(ss->tx.br, M_DEVBUF);
4238 				ss->tx.br = NULL;
4239 			}
4240 			mtx_destroy(&ss->tx.mtx);
4241 		}
4242 		if (ss->rx_done.entry != NULL) {
4243 			mxge_dma_free(&ss->rx_done.dma);
4244 			ss->rx_done.entry = NULL;
4245 		}
4246 	}
4247 	free(sc->ss, M_DEVBUF);
4248 	sc->ss = NULL;
4249 }
4250 
4251 static int
4252 mxge_alloc_slices(mxge_softc_t *sc)
4253 {
4254 	mxge_cmd_t cmd;
4255 	struct mxge_slice_state *ss;
4256 	size_t bytes;
4257 	int err, i, max_intr_slots;
4258 
4259 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4260 	if (err != 0) {
4261 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4262 		return err;
4263 	}
4264 	sc->rx_ring_size = cmd.data0;
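	/* the factor of 2 apparently leaves room for events from both
	   rx rings (small and big) on each slice */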
4265 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4266 
4267 	bytes = sizeof (*sc->ss) * sc->num_slices;
4268 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4269 	if (sc->ss == NULL)
4270 		return (ENOMEM);
4271 	for (i = 0; i < sc->num_slices; i++) {
4272 		ss = &sc->ss[i];
4273 
4274 		ss->sc = sc;
4275 
4276 		/* allocate per-slice rx interrupt queues */
4277 
4278 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4279 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4280 		if (err != 0)
4281 			goto abort;
4282 		ss->rx_done.entry = ss->rx_done.dma.addr;
4283 		bzero(ss->rx_done.entry, bytes);
4284 
4285 		/*
4286 		 * allocate the per-slice firmware stats; stats
4287 		 * (including tx) are used only on the first
4288 		 * slice for now
4289 		 */
4290 
4291 		bytes = sizeof (*ss->fw_stats);
4292 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4293 				     sizeof (*ss->fw_stats), 64);
4294 		if (err != 0)
4295 			goto abort;
4296 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4297 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4298 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4299 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4300 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4301 					   &ss->tx.mtx);
4302 	}
4303 
4304 	return (0);
4305 
4306 abort:
4307 	mxge_free_slices(sc);
4308 	return (ENOMEM);
4309 }
4310 
4311 static void
4312 mxge_slice_probe(mxge_softc_t *sc)
4313 {
4314 	mxge_cmd_t cmd;
4315 	char *old_fw;
4316 	int msix_cnt, status, max_intr_slots;
4317 
4318 	sc->num_slices = 1;
4319 	/*
4320 	 *  don't enable multiple slices if they have been disabled via
4321 	 *  the hw.mxge.max_slices tunable, or if this is not an SMP system
4322 	 */
4323 
4324 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4325 		return;
4326 
4327 	/* see how many MSI-X interrupts are available */
4328 	msix_cnt = pci_msix_count(sc->dev);
4329 	if (msix_cnt < 2)
4330 		return;
4331 
4332 	/* now load the slice-aware firmware and see what it supports */
4333 	old_fw = sc->fw_name;
4334 	if (old_fw == mxge_fw_aligned)
4335 		sc->fw_name = mxge_fw_rss_aligned;
4336 	else
4337 		sc->fw_name = mxge_fw_rss_unaligned;
4338 	status = mxge_load_firmware(sc, 0);
4339 	if (status != 0) {
4340 		device_printf(sc->dev, "Falling back to a single slice\n");
4341 		return;
4342 	}
4343 
4344 	/* try to send a reset command to the card to see if it
4345 	   is alive */
4346 	memset(&cmd, 0, sizeof (cmd));
4347 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4348 	if (status != 0) {
4349 		device_printf(sc->dev, "failed reset\n");
4350 		goto abort_with_fw;
4351 	}
4352 
4353 	/* get rx ring size */
4354 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4355 	if (status != 0) {
4356 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4357 		goto abort_with_fw;
4358 	}
4359 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4360 
4361 	/* tell it the size of the interrupt queues */
4362 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4363 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4364 	if (status != 0) {
4365 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4366 		goto abort_with_fw;
4367 	}
4368 
4369 	/* ask the maximum number of slices it supports */
4370 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4371 	if (status != 0) {
4372 		device_printf(sc->dev,
4373 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4374 		goto abort_with_fw;
4375 	}
4376 	sc->num_slices = cmd.data0;
4377 	if (sc->num_slices > msix_cnt)
4378 		sc->num_slices = msix_cnt;
4379 
4380 	if (mxge_max_slices == -1) {
4381 		/* cap to number of CPUs in system */
4382 		if (sc->num_slices > mp_ncpus)
4383 			sc->num_slices = mp_ncpus;
4384 	} else {
4385 		if (sc->num_slices > mxge_max_slices)
4386 			sc->num_slices = mxge_max_slices;
4387 	}
4388 	/* make sure it is a power of two */
4389 	while (sc->num_slices & (sc->num_slices - 1))
4390 		sc->num_slices--;
4391 
4392 	if (mxge_verbose)
4393 		device_printf(sc->dev, "using %d slices\n",
4394 			      sc->num_slices);
4395 
4396 	return;
4397 
4398 abort_with_fw:
4399 	sc->fw_name = old_fw;
4400 	(void) mxge_load_firmware(sc, 0);
4401 }
4402 
4403 static int
4404 mxge_add_msix_irqs(mxge_softc_t *sc)
4405 {
4406 	size_t bytes;
4407 	int count, err, i, rid;
4408 
4409 	rid = PCIR_BAR(2);
4410 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4411 						    &rid, RF_ACTIVE);
4412 
4413 	if (sc->msix_table_res == NULL) {
4414 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4415 		return ENXIO;
4416 	}
4417 
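	/* request one MSI-X vector per slice */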
4418 	count = sc->num_slices;
4419 	err = pci_alloc_msix(sc->dev, &count);
4420 	if (err != 0) {
4421 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4422 			      "err = %d\n", sc->num_slices, err);
4423 		goto abort_with_msix_table;
4424 	}
4425 	if (count < sc->num_slices) {
4426 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4427 			      sc->num_slices, count);
4428 		device_printf(sc->dev,
4429 			      "Try setting hw.mxge.max_slices to %d\n",
4430 			      count);
4431 		err = ENOSPC;
4432 		goto abort_with_msix;
4433 	}
4434 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4435 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4436 	if (sc->msix_irq_res == NULL) {
4437 		err = ENOMEM;
4438 		goto abort_with_msix;
4439 	}
4440 
4441 	for (i = 0; i < sc->num_slices; i++) {
4442 		rid = i + 1;
4443 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4444 							  SYS_RES_IRQ,
4445 							  &rid, RF_ACTIVE);
4446 		if (sc->msix_irq_res[i] == NULL) {
4447 			device_printf(sc->dev, "couldn't allocate IRQ res"
4448 				      " for message %d\n", i);
4449 			err = ENXIO;
4450 			goto abort_with_res;
4451 		}
4452 	}
4453 
4454 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
	/* M_WAITOK: sc->msix_ih is used below without a NULL check */
4455 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_WAITOK|M_ZERO);
4456 
4457 	for (i = 0; i < sc->num_slices; i++) {
4458 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4459 				     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4460 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4461 		if (err != 0) {
4462 			device_printf(sc->dev, "couldn't setup intr for "
4463 				      "message %d\n", i);
4464 			goto abort_with_intr;
4465 		}
4466 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4467 				  sc->msix_ih[i], "s%d", i);
4468 	}
4469 
4470 	if (mxge_verbose) {
4471 		device_printf(sc->dev, "using %d msix IRQs:",
4472 			      sc->num_slices);
4473 		for (i = 0; i < sc->num_slices; i++)
4474 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4475 		printf("\n");
4476 	}
4477 	return (0);
4478 
4479 abort_with_intr:
4480 	for (i = 0; i < sc->num_slices; i++) {
4481 		if (sc->msix_ih[i] != NULL) {
4482 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4483 					  sc->msix_ih[i]);
4484 			sc->msix_ih[i] = NULL;
4485 		}
4486 	}
4487 	free(sc->msix_ih, M_DEVBUF);
4488 
4489 abort_with_res:
4490 	for (i = 0; i < sc->num_slices; i++) {
4491 		rid = i + 1;
4492 		if (sc->msix_irq_res[i] != NULL)
4493 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4494 					     sc->msix_irq_res[i]);
4495 		sc->msix_irq_res[i] = NULL;
4496 	}
4497 	free(sc->msix_irq_res, M_DEVBUF);
4498 
4499 abort_with_msix:
4500 	pci_release_msi(sc->dev);
4501 
4502 abort_with_msix_table:
4503 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4504 			     sc->msix_table_res);
4505 
4506 	return err;
4507 }
4508 
4509 static int
4510 mxge_add_single_irq(mxge_softc_t *sc)
4511 {
4512 	int count, err, rid;
4513 
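
	/* prefer a single MSI message; else fall back to legacy INTx */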
4514 	count = pci_msi_count(sc->dev);
4515 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4516 		rid = 1;
4517 	} else {
4518 		rid = 0;
4519 		sc->legacy_irq = 1;
4520 	}
4521 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4522 					     RF_SHAREABLE | RF_ACTIVE);
4523 	if (sc->irq_res == NULL) {
4524 		device_printf(sc->dev, "could not alloc interrupt\n");
4525 		return ENXIO;
4526 	}
4527 	if (mxge_verbose)
4528 		device_printf(sc->dev, "using %s irq %jd\n",
4529 			      sc->legacy_irq ? "INTx" : "MSI",
4530 			      rman_get_start(sc->irq_res));
4531 	err = bus_setup_intr(sc->dev, sc->irq_res,
4532 			     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4533 			     mxge_intr, &sc->ss[0], &sc->ih);
4534 	if (err != 0) {
4535 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4536 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4537 		if (!sc->legacy_irq)
4538 			pci_release_msi(sc->dev);
4539 	}
4540 	return err;
4541 }
4542 
4543 static void
4544 mxge_rem_msix_irqs(mxge_softc_t *sc)
4545 {
4546 	int i, rid;
4547 
4548 	for (i = 0; i < sc->num_slices; i++) {
4549 		if (sc->msix_ih[i] != NULL) {
4550 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4551 					  sc->msix_ih[i]);
4552 			sc->msix_ih[i] = NULL;
4553 		}
4554 	}
4555 	free(sc->msix_ih, M_DEVBUF);
4556 
4557 	for (i = 0; i < sc->num_slices; i++) {
4558 		rid = i + 1;
4559 		if (sc->msix_irq_res[i] != NULL)
4560 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4561 					     sc->msix_irq_res[i]);
4562 		sc->msix_irq_res[i] = NULL;
4563 	}
4564 	free(sc->msix_irq_res, M_DEVBUF);
4565 
4566 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4567 			     sc->msix_table_res);
4568 
4569 	pci_release_msi(sc->dev);
4570 	return;
4571 }
4572 
4573 static void
4574 mxge_rem_single_irq(mxge_softc_t *sc)
4575 {
4576 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4577 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4578 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4579 	if (!sc->legacy_irq)
4580 		pci_release_msi(sc->dev);
4581 }
4582 
4583 static void
4584 mxge_rem_irq(mxge_softc_t *sc)
4585 {
4586 	if (sc->num_slices > 1)
4587 		mxge_rem_msix_irqs(sc);
4588 	else
4589 		mxge_rem_single_irq(sc);
4590 }
4591 
4592 static int
4593 mxge_add_irq(mxge_softc_t *sc)
4594 {
4595 	int err;
4596 
4597 	if (sc->num_slices > 1)
4598 		err = mxge_add_msix_irqs(sc);
4599 	else
4600 		err = mxge_add_single_irq(sc);
4601 
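	/* note: the "0 &&" keeps this MSI-X teardown/re-add test path disabled */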
4602 	if (0 && err == 0 && sc->num_slices > 1) {
4603 		mxge_rem_msix_irqs(sc);
4604 		err = mxge_add_msix_irqs(sc);
4605 	}
4606 	return err;
4607 }
4608 
4609 static int
4610 mxge_attach(device_t dev)
4611 {
4612 	mxge_cmd_t cmd;
4613 	mxge_softc_t *sc = device_get_softc(dev);
4614 	struct ifnet *ifp;
4615 	int err, rid;
4616 
4617 	sc->dev = dev;
4618 	mxge_fetch_tunables(sc);
4619 
4620 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4621 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4622 				  taskqueue_thread_enqueue, &sc->tq);
4623 	if (sc->tq == NULL) {
4624 		err = ENOMEM;
4625 		goto abort_with_nothing;
4626 	}
4627 
4628 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4629 				 1,			/* alignment */
4630 				 0,			/* boundary */
4631 				 BUS_SPACE_MAXADDR,	/* low */
4632 				 BUS_SPACE_MAXADDR,	/* high */
4633 				 NULL, NULL,		/* filter */
4634 				 65536 + 256,		/* maxsize */
4635 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4636 				 65536,			/* maxsegsize */
4637 				 0,			/* flags */
4638 				 NULL, NULL,		/* lock */
4639 				 &sc->parent_dmat);	/* tag */
4640 
4641 	if (err != 0) {
4642 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4643 			      err);
4644 		goto abort_with_tq;
4645 	}
4646 
4647 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4648 	if (ifp == NULL) {
4649 		device_printf(dev, "can not if_alloc()\n");
4650 		err = ENOSPC;
4651 		goto abort_with_parent_dmat;
4652 	}
4653 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4654 
4655 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4656 		 device_get_nameunit(dev));
4657 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4658 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4659 		 "%s:drv", device_get_nameunit(dev));
4660 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4661 		 MTX_NETWORK_LOCK, MTX_DEF);
4662 
4663 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4664 
4665 	mxge_setup_cfg_space(sc);
4666 
4667 	/* Map the board into the kernel */
4668 	rid = PCIR_BARS;
4669 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4670 					     RF_ACTIVE);
4671 	if (sc->mem_res == NULL) {
4672 		device_printf(dev, "could not map memory\n");
4673 		err = ENXIO;
4674 		goto abort_with_lock;
4675 	}
4676 	sc->sram = rman_get_virtual(sc->mem_res);
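	/* 2MB of SRAM total, less regions at the top that appear to be
	   reserved for firmware use */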
4677 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4678 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4679 		device_printf(dev, "impossible memory region size %jd\n",
4680 			      rman_get_size(sc->mem_res));
4681 		err = ENXIO;
4682 		goto abort_with_mem_res;
4683 	}
4684 
4685 	/* make a NUL-terminated copy of the EEPROM strings section of
4686 	   LANai SRAM */
4687 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4688 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4689 				rman_get_bushandle(sc->mem_res),
4690 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4691 				sc->eeprom_strings,
4692 				MXGE_EEPROM_STRINGS_SIZE - 2);
4693 	err = mxge_parse_strings(sc);
4694 	if (err != 0)
4695 		goto abort_with_mem_res;
4696 
4697 	/* Enable write combining for efficient use of PCIe bus */
4698 	mxge_enable_wc(sc);
4699 
4700 	/* Allocate the out of band dma memory */
4701 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4702 			     sizeof (mxge_cmd_t), 64);
4703 	if (err != 0)
4704 		goto abort_with_mem_res;
4705 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4706 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4707 	if (err != 0)
4708 		goto abort_with_cmd_dma;
4709 
4710 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4711 	if (err != 0)
4712 		goto abort_with_zeropad_dma;
4713 
4714 	/* select & load the firmware */
4715 	err = mxge_select_firmware(sc);
4716 	if (err != 0)
4717 		goto abort_with_dmabench;
4718 	sc->intr_coal_delay = mxge_intr_coal_delay;
4719 
4720 	mxge_slice_probe(sc);
4721 	err = mxge_alloc_slices(sc);
4722 	if (err != 0)
4723 		goto abort_with_dmabench;
4724 
4725 	err = mxge_reset(sc, 0);
4726 	if (err != 0)
4727 		goto abort_with_slices;
4728 
4729 	err = mxge_alloc_rings(sc);
4730 	if (err != 0) {
4731 		device_printf(sc->dev, "failed to allocate rings\n");
4732 		goto abort_with_slices;
4733 	}
4734 
4735 	err = mxge_add_irq(sc);
4736 	if (err != 0) {
4737 		device_printf(sc->dev, "failed to add irq\n");
4738 		goto abort_with_rings;
4739 	}
4740 
4741 	ifp->if_baudrate = IF_Gbps(10);
4742 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4743 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4744 		IFCAP_RXCSUM_IPV6;
4745 #if defined(INET) || defined(INET6)
4746 	ifp->if_capabilities |= IFCAP_LRO;
4747 #endif
4748 
4749 #ifdef MXGE_NEW_VLAN_API
4750 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4751 
4752 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4753 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4754 	    sc->fw_ver_tiny >= 32)
4755 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4756 #endif
4757 	sc->max_mtu = mxge_max_mtu(sc);
4758 	if (sc->max_mtu >= 9000)
4759 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4760 	else
4761 		device_printf(dev, "MTU limited to %d.  Install "
4762 			      "latest firmware for 9000 byte jumbo support\n",
4763 			      sc->max_mtu - ETHER_HDR_LEN);
4764 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4765 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4766 	/* check to see if f/w supports TSO for IPv6 */
4767 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4768 		if (CSUM_TCP_IPV6)
4769 			ifp->if_capabilities |= IFCAP_TSO6;
4770 		sc->max_tso6_hlen = min(cmd.data0,
4771 					sizeof (sc->ss[0].scratch));
4772 	}
4773 	ifp->if_capenable = ifp->if_capabilities;
4774 	if (sc->lro_cnt == 0)
4775 		ifp->if_capenable &= ~IFCAP_LRO;
4776 	ifp->if_init = mxge_init;
4777 	ifp->if_softc = sc;
4778 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4779 	ifp->if_ioctl = mxge_ioctl;
4780 	ifp->if_start = mxge_start;
4781 	ifp->if_get_counter = mxge_get_counter;
4782 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4783 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4784 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4785 	/* Initialise the ifmedia structure */
4786 	ifmedia_init(&sc->media, 0, mxge_media_change,
4787 		     mxge_media_status);
4788 	mxge_media_init(sc);
4789 	mxge_media_probe(sc);
4790 	sc->dying = 0;
4791 	ether_ifattach(ifp, sc->mac_addr);
4792 	/* ether_ifattach sets mtu to ETHERMTU */
4793 	if (mxge_initial_mtu != ETHERMTU)
4794 		mxge_change_mtu(sc, mxge_initial_mtu);
4795 
4796 	mxge_add_sysctls(sc);
4797 	ifp->if_transmit = mxge_transmit;
4798 	ifp->if_qflush = mxge_qflush;
4799 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4800 				device_get_nameunit(sc->dev));
4801 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4802 	return 0;
4803 
4804 abort_with_rings:
4805 	mxge_free_rings(sc);
4806 abort_with_slices:
4807 	mxge_free_slices(sc);
4808 abort_with_dmabench:
4809 	mxge_dma_free(&sc->dmabench_dma);
4810 abort_with_zeropad_dma:
4811 	mxge_dma_free(&sc->zeropad_dma);
4812 abort_with_cmd_dma:
4813 	mxge_dma_free(&sc->cmd_dma);
4814 abort_with_mem_res:
4815 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4816 abort_with_lock:
4817 	pci_disable_busmaster(dev);
4818 	mtx_destroy(&sc->cmd_mtx);
4819 	mtx_destroy(&sc->driver_mtx);
4820 	if_free(ifp);
4821 abort_with_parent_dmat:
4822 	bus_dma_tag_destroy(sc->parent_dmat);
4823 abort_with_tq:
4824 	if (sc->tq != NULL) {
4825 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4826 		taskqueue_free(sc->tq);
4827 		sc->tq = NULL;
4828 	}
4829 abort_with_nothing:
4830 	return err;
4831 }
4832 
4833 static int
4834 mxge_detach(device_t dev)
4835 {
4836 	mxge_softc_t *sc = device_get_softc(dev);
4837 
4838 	if (mxge_vlans_active(sc)) {
4839 		device_printf(sc->dev,
4840 			      "Detach vlans before removing module\n");
4841 		return EBUSY;
4842 	}
4843 	mtx_lock(&sc->driver_mtx);
4844 	sc->dying = 1;
4845 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4846 		mxge_close(sc, 0);
4847 	mtx_unlock(&sc->driver_mtx);
4848 	ether_ifdetach(sc->ifp);
4849 	if (sc->tq != NULL) {
4850 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4851 		taskqueue_free(sc->tq);
4852 		sc->tq = NULL;
4853 	}
4854 	callout_drain(&sc->co_hdl);
4855 	ifmedia_removeall(&sc->media);
4856 	mxge_dummy_rdma(sc, 0);
4857 	mxge_rem_sysctls(sc);
4858 	mxge_rem_irq(sc);
4859 	mxge_free_rings(sc);
4860 	mxge_free_slices(sc);
4861 	mxge_dma_free(&sc->dmabench_dma);
4862 	mxge_dma_free(&sc->zeropad_dma);
4863 	mxge_dma_free(&sc->cmd_dma);
4864 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4865 	pci_disable_busmaster(dev);
4866 	mtx_destroy(&sc->cmd_mtx);
4867 	mtx_destroy(&sc->driver_mtx);
4868 	if_free(sc->ifp);
4869 	bus_dma_tag_destroy(sc->parent_dmat);
4870 	return 0;
4871 }
4872 
4873 static int
4874 mxge_shutdown(device_t dev)
4875 {
4876 	return 0;
4877 }
4878 
4879 /*
4880   This file uses Myri10GE driver indentation.
4881 
4882   Local Variables:
4883   c-file-style:"linux"
4884   tab-width:8
4885   End:
4886 */
4887