xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 78b9f0095b4af3aca6c931b2c7b009ddb8a05125)
1 /******************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 
4 Copyright (c) 2006-2013, Myricom Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/sockio.h>
40 #include <sys/mbuf.h>
41 #include <sys/malloc.h>
42 #include <sys/kdb.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/module.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/sx.h>
49 #include <sys/taskqueue.h>
50 #include <sys/zlib.h>
51 
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <net/if_arp.h>
55 #include <net/ethernet.h>
56 #include <net/if_dl.h>
57 #include <net/if_media.h>
58 
59 #include <net/bpf.h>
60 
61 #include <net/if_types.h>
62 #include <net/if_vlan_var.h>
63 
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip6.h>
68 #include <netinet/tcp.h>
69 #include <netinet/tcp_lro.h>
70 #include <netinet6/ip6_var.h>
71 
72 #include <machine/bus.h>
73 #include <machine/in_cksum.h>
74 #include <machine/resource.h>
75 #include <sys/bus.h>
76 #include <sys/rman.h>
77 #include <sys/smp.h>
78 
79 #include <dev/pci/pcireg.h>
80 #include <dev/pci/pcivar.h>
81 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
82 
83 #include <vm/vm.h>		/* for pmap_mapdev() */
84 #include <vm/pmap.h>
85 
86 #if defined(__i386) || defined(__amd64)
87 #include <machine/specialreg.h>
88 #endif
89 
90 #include <dev/mxge/mxge_mcp.h>
91 #include <dev/mxge/mcp_gen_header.h>
92 /*#define MXGE_FAKE_IFP*/
93 #include <dev/mxge/if_mxge_var.h>
94 #ifdef IFNET_BUF_RING
95 #include <sys/buf_ring.h>
96 #endif
97 
98 #include "opt_inet.h"
99 #include "opt_inet6.h"
100 
/*
 * Tunable parameters (file-scope defaults).  Only behavior that is visible
 * in this part of the file is described; the remaining knobs are consumed
 * elsewhere in the driver.
 */
/* zero makes mxge_enable_nvidia_ecrc() return without doing anything */
static int mxge_nvidia_ecrc_enable = 1;
/*
 * 0: let mxge_select_firmware() pick the image; 1: force the aligned
 * image; any other non-zero value: force the unaligned image.
 */
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
/* non-zero enables additional informational device_printf() output */
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
/* non-zero makes mxge_change_promisc() always enable promiscuous mode */
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
/* firmware image names; the non-RSS pair is assigned to sc->fw_name below */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
118 
/*
 * Forward declarations.  The probe/attach/detach/shutdown routines are
 * referenced by the mxge_methods[] table; mxge_intr takes an opaque
 * argument pointer.
 */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);
124 
/* Device method table: maps the generic device interface onto this driver. */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  /* table terminator */
  DEVMETHOD_END
};
135 
/*
 * Driver description: driver name, its method table, and the size of the
 * per-device softc (mxge_softc_t).
 */
static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};
142 
/* devclass handle passed to DRIVER_MODULE() below */
static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* Module dependencies: firmware(9) image loading and zlib decompression. */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);
149 
/* Forward declarations of internal helpers used before their definitions. */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
155 
156 static int
157 mxge_probe(device_t dev)
158 {
159 	int rev;
160 
161 
162 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
163 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
164 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
165 		rev = pci_get_revid(dev);
166 		switch (rev) {
167 		case MXGE_PCI_REV_Z8E:
168 			device_set_desc(dev, "Myri10G-PCIE-8A");
169 			break;
170 		case MXGE_PCI_REV_Z8ES:
171 			device_set_desc(dev, "Myri10G-PCIE-8B");
172 			break;
173 		default:
174 			device_set_desc(dev, "Myri10G-PCIE-8??");
175 			device_printf(dev, "Unrecognized rev %d NIC\n",
176 				      rev);
177 			break;
178 		}
179 		return 0;
180 	}
181 	return ENXIO;
182 }
183 
184 static void
185 mxge_enable_wc(mxge_softc_t *sc)
186 {
187 #if defined(__i386) || defined(__amd64)
188 	vm_offset_t len;
189 	int err;
190 
191 	sc->wc = 1;
192 	len = rman_get_size(sc->mem_res);
193 	err = pmap_change_attr((vm_offset_t) sc->sram,
194 			       len, PAT_WRITE_COMBINING);
195 	if (err != 0) {
196 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
197 			      err);
198 		sc->wc = 0;
199 	}
200 #endif
201 }
202 
203 
204 /* callback to get our DMA address */
205 static void
206 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
207 			 int error)
208 {
209 	if (error == 0) {
210 		*(bus_addr_t *) arg = segs->ds_addr;
211 	}
212 }
213 
214 static int
215 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
216 		   bus_size_t alignment)
217 {
218 	int err;
219 	device_t dev = sc->dev;
220 	bus_size_t boundary, maxsegsize;
221 
222 	if (bytes > 4096 && alignment == 4096) {
223 		boundary = 0;
224 		maxsegsize = bytes;
225 	} else {
226 		boundary = 4096;
227 		maxsegsize = 4096;
228 	}
229 
230 	/* allocate DMAable memory tags */
231 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
232 				 alignment,		/* alignment */
233 				 boundary,		/* boundary */
234 				 BUS_SPACE_MAXADDR,	/* low */
235 				 BUS_SPACE_MAXADDR,	/* high */
236 				 NULL, NULL,		/* filter */
237 				 bytes,			/* maxsize */
238 				 1,			/* num segs */
239 				 maxsegsize,		/* maxsegsize */
240 				 BUS_DMA_COHERENT,	/* flags */
241 				 NULL, NULL,		/* lock */
242 				 &dma->dmat);		/* tag */
243 	if (err != 0) {
244 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
245 		return err;
246 	}
247 
248 	/* allocate DMAable memory & map */
249 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
250 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
251 				| BUS_DMA_ZERO),  &dma->map);
252 	if (err != 0) {
253 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
254 		goto abort_with_dmat;
255 	}
256 
257 	/* load the memory */
258 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
259 			      mxge_dmamap_callback,
260 			      (void *)&dma->bus_addr, 0);
261 	if (err != 0) {
262 		device_printf(dev, "couldn't load map (err = %d)\n", err);
263 		goto abort_with_mem;
264 	}
265 	return 0;
266 
267 abort_with_mem:
268 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
269 abort_with_dmat:
270 	(void)bus_dma_tag_destroy(dma->dmat);
271 	return err;
272 }
273 
274 
275 static void
276 mxge_dma_free(mxge_dma_t *dma)
277 {
278 	bus_dmamap_unload(dma->dmat, dma->map);
279 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280 	(void)bus_dma_tag_destroy(dma->dmat);
281 }
282 
283 /*
284  * The eeprom strings on the lanaiX have the format
285  * SN=x\0
286  * MAC=x:x:x:x:x:x\0
287  * PC=text\0
288  */
289 
290 static int
291 mxge_parse_strings(mxge_softc_t *sc)
292 {
293 	char *ptr;
294 	int i, found_mac, found_sn2;
295 	char *endptr;
296 
297 	ptr = sc->eeprom_strings;
298 	found_mac = 0;
299 	found_sn2 = 0;
300 	while (*ptr != '\0') {
301 		if (strncmp(ptr, "MAC=", 4) == 0) {
302 			ptr += 4;
303 			for (i = 0;;) {
304 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
305 				if (endptr - ptr != 2)
306 					goto abort;
307 				ptr = endptr;
308 				if (++i == 6)
309 					break;
310 				if (*ptr++ != ':')
311 					goto abort;
312 			}
313 			found_mac = 1;
314 		} else if (strncmp(ptr, "PC=", 3) == 0) {
315 			ptr += 3;
316 			strlcpy(sc->product_code_string, ptr,
317 			    sizeof(sc->product_code_string));
318 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
319 			ptr += 3;
320 			strlcpy(sc->serial_number_string, ptr,
321 			    sizeof(sc->serial_number_string));
322 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
323 			/* SN2 takes precedence over SN */
324 			ptr += 4;
325 			found_sn2 = 1;
326 			strlcpy(sc->serial_number_string, ptr,
327 			    sizeof(sc->serial_number_string));
328 		}
329 		while (*ptr++ != '\0') {}
330 	}
331 
332 	if (found_mac)
333 		return 0;
334 
335  abort:
336 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
337 
338 	return ENXIO;
339 }
340 
341 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
342 static void
343 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
344 {
345 	uint32_t val;
346 	unsigned long base, off;
347 	char *va, *cfgptr;
348 	device_t pdev, mcp55;
349 	uint16_t vendor_id, device_id, word;
350 	uintptr_t bus, slot, func, ivend, idev;
351 	uint32_t *ptr32;
352 
353 
354 	if (!mxge_nvidia_ecrc_enable)
355 		return;
356 
357 	pdev = device_get_parent(device_get_parent(sc->dev));
358 	if (pdev == NULL) {
359 		device_printf(sc->dev, "could not find parent?\n");
360 		return;
361 	}
362 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
363 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
364 
365 	if (vendor_id != 0x10de)
366 		return;
367 
368 	base = 0;
369 
370 	if (device_id == 0x005d) {
371 		/* ck804, base address is magic */
372 		base = 0xe0000000UL;
373 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
374 		/* mcp55, base address stored in chipset */
375 		mcp55 = pci_find_bsf(0, 0, 0);
376 		if (mcp55 &&
377 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
378 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
379 			word = pci_read_config(mcp55, 0x90, 2);
380 			base = ((unsigned long)word & 0x7ffeU) << 25;
381 		}
382 	}
383 	if (!base)
384 		return;
385 
386 	/* XXXX
387 	   Test below is commented because it is believed that doing
388 	   config read/write beyond 0xff will access the config space
389 	   for the next larger function.  Uncomment this and remove
390 	   the hacky pmap_mapdev() way of accessing config space when
391 	   FreeBSD grows support for extended pcie config space access
392 	*/
393 #if 0
394 	/* See if we can, by some miracle, access the extended
395 	   config space */
396 	val = pci_read_config(pdev, 0x178, 4);
397 	if (val != 0xffffffff) {
398 		val |= 0x40;
399 		pci_write_config(pdev, 0x178, val, 4);
400 		return;
401 	}
402 #endif
403 	/* Rather than using normal pci config space writes, we must
404 	 * map the Nvidia config space ourselves.  This is because on
405 	 * opteron/nvidia class machine the 0xe000000 mapping is
406 	 * handled by the nvidia chipset, that means the internal PCI
407 	 * device (the on-chip northbridge), or the amd-8131 bridge
408 	 * and things behind them are not visible by this method.
409 	 */
410 
411 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
412 		      PCI_IVAR_BUS, &bus);
413 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
414 		      PCI_IVAR_SLOT, &slot);
415 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
416 		      PCI_IVAR_FUNCTION, &func);
417 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
418 		      PCI_IVAR_VENDOR, &ivend);
419 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
420 		      PCI_IVAR_DEVICE, &idev);
421 
422 	off =  base
423 		+ 0x00100000UL * (unsigned long)bus
424 		+ 0x00001000UL * (unsigned long)(func
425 						 + 8 * slot);
426 
427 	/* map it into the kernel */
428 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
429 
430 
431 	if (va == NULL) {
432 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
433 		return;
434 	}
435 	/* get a pointer to the config space mapped into the kernel */
436 	cfgptr = va + (off & PAGE_MASK);
437 
438 	/* make sure that we can really access it */
439 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
440 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
441 	if (! (vendor_id == ivend && device_id == idev)) {
442 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
443 			      vendor_id, device_id);
444 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
445 		return;
446 	}
447 
448 	ptr32 = (uint32_t*)(cfgptr + 0x178);
449 	val = *ptr32;
450 
451 	if (val == 0xffffffff) {
452 		device_printf(sc->dev, "extended mapping failed\n");
453 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
454 		return;
455 	}
456 	*ptr32 = val | 0x40;
457 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
458 	if (mxge_verbose)
459 		device_printf(sc->dev,
460 			      "Enabled ECRC on upstream Nvidia bridge "
461 			      "at %d:%d:%d\n",
462 			      (int)bus, (int)slot, (int)func);
463 	return;
464 }
465 #else
466 static void
467 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
468 {
469 	device_printf(sc->dev,
470 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
471 	return;
472 }
473 #endif
474 
475 
476 static int
477 mxge_dma_test(mxge_softc_t *sc, int test_type)
478 {
479 	mxge_cmd_t cmd;
480 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
481 	int status;
482 	uint32_t len;
483 	char *test = " ";
484 
485 
486 	/* Run a small DMA test.
487 	 * The magic multipliers to the length tell the firmware
488 	 * to do DMA read, write, or read+write tests.  The
489 	 * results are returned in cmd.data0.  The upper 16
490 	 * bits of the return is the number of transfers completed.
491 	 * The lower 16 bits is the time in 0.5us ticks that the
492 	 * transfers took to complete.
493 	 */
494 
495 	len = sc->tx_boundary;
496 
497 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
498 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
499 	cmd.data2 = len * 0x10000;
500 	status = mxge_send_cmd(sc, test_type, &cmd);
501 	if (status != 0) {
502 		test = "read";
503 		goto abort;
504 	}
505 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
506 		(cmd.data0 & 0xffff);
507 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
508 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
509 	cmd.data2 = len * 0x1;
510 	status = mxge_send_cmd(sc, test_type, &cmd);
511 	if (status != 0) {
512 		test = "write";
513 		goto abort;
514 	}
515 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
516 		(cmd.data0 & 0xffff);
517 
518 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
519 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
520 	cmd.data2 = len * 0x10001;
521 	status = mxge_send_cmd(sc, test_type, &cmd);
522 	if (status != 0) {
523 		test = "read/write";
524 		goto abort;
525 	}
526 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
527 		(cmd.data0 & 0xffff);
528 
529 abort:
530 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
531 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
532 			      test, status);
533 
534 	return status;
535 }
536 
537 /*
538  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
539  * when the PCI-E Completion packets are aligned on an 8-byte
540  * boundary.  Some PCI-E chip sets always align Completion packets; on
541  * the ones that do not, the alignment can be enforced by enabling
542  * ECRC generation (if supported).
543  *
544  * When PCI-E Completion packets are not aligned, it is actually more
545  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
546  *
547  * If the driver can neither enable ECRC nor verify that it has
548  * already been enabled, then it must use a firmware image which works
549  * around unaligned completion packets (ethp_z8e.dat), and it should
550  * also ensure that it never gives the device a Read-DMA which is
551  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
552  * enabled, then the driver should use the aligned (eth_z8e.dat)
553  * firmware image, and set tx_boundary to 4KB.
554  */
555 
556 static int
557 mxge_firmware_probe(mxge_softc_t *sc)
558 {
559 	device_t dev = sc->dev;
560 	int reg, status;
561 	uint16_t pectl;
562 
563 	sc->tx_boundary = 4096;
564 	/*
565 	 * Verify the max read request size was set to 4KB
566 	 * before trying the test with 4KB.
567 	 */
568 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
569 		pectl = pci_read_config(dev, reg + 0x8, 2);
570 		if ((pectl & (5 << 12)) != (5 << 12)) {
571 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
572 				      pectl);
573 			sc->tx_boundary = 2048;
574 		}
575 	}
576 
577 	/*
578 	 * load the optimized firmware (which assumes aligned PCIe
579 	 * completions) in order to see if it works on this host.
580 	 */
581 	sc->fw_name = mxge_fw_aligned;
582 	status = mxge_load_firmware(sc, 1);
583 	if (status != 0) {
584 		return status;
585 	}
586 
587 	/*
588 	 * Enable ECRC if possible
589 	 */
590 	mxge_enable_nvidia_ecrc(sc);
591 
592 	/*
593 	 * Run a DMA test which watches for unaligned completions and
594 	 * aborts on the first one seen.  Not required on Z8ES or newer.
595 	 */
596 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
597 		return 0;
598 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
599 	if (status == 0)
600 		return 0; /* keep the aligned firmware */
601 
602 	if (status != E2BIG)
603 		device_printf(dev, "DMA test failed: %d\n", status);
604 	if (status == ENOSYS)
605 		device_printf(dev, "Falling back to ethp! "
606 			      "Please install up to date fw\n");
607 	return status;
608 }
609 
610 static int
611 mxge_select_firmware(mxge_softc_t *sc)
612 {
613 	int aligned = 0;
614 	int force_firmware = mxge_force_firmware;
615 
616 	if (sc->throttle)
617 		force_firmware = sc->throttle;
618 
619 	if (force_firmware != 0) {
620 		if (force_firmware == 1)
621 			aligned = 1;
622 		else
623 			aligned = 0;
624 		if (mxge_verbose)
625 			device_printf(sc->dev,
626 				      "Assuming %s completions (forced)\n",
627 				      aligned ? "aligned" : "unaligned");
628 		goto abort;
629 	}
630 
631 	/* if the PCIe link width is 4 or less, we can use the aligned
632 	   firmware and skip any checks */
633 	if (sc->link_width != 0 && sc->link_width <= 4) {
634 		device_printf(sc->dev,
635 			      "PCIe x%d Link, expect reduced performance\n",
636 			      sc->link_width);
637 		aligned = 1;
638 		goto abort;
639 	}
640 
641 	if (0 == mxge_firmware_probe(sc))
642 		return 0;
643 
644 abort:
645 	if (aligned) {
646 		sc->fw_name = mxge_fw_aligned;
647 		sc->tx_boundary = 4096;
648 	} else {
649 		sc->fw_name = mxge_fw_unaligned;
650 		sc->tx_boundary = 2048;
651 	}
652 	return (mxge_load_firmware(sc, 0));
653 }
654 
655 static int
656 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
657 {
658 
659 
660 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
661 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
662 			      be32toh(hdr->mcp_type));
663 		return EIO;
664 	}
665 
666 	/* save firmware version for sysctl */
667 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
668 	if (mxge_verbose)
669 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
670 
671 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
672 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
673 
674 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
675 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
676 		device_printf(sc->dev, "Found firmware version %s\n",
677 			      sc->fw_version);
678 		device_printf(sc->dev, "Driver needs %d.%d\n",
679 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
680 		return EINVAL;
681 	}
682 	return 0;
683 
684 }
685 
686 static void *
687 z_alloc(void *nil, u_int items, u_int size)
688 {
689 	void *ptr;
690 
691 	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
692 	return ptr;
693 }
694 
/*
 * zlib free hook: releases a buffer obtained from z_alloc() back to the
 * M_TEMP malloc type.  The first argument (zlib's opaque pointer) is
 * unused.
 */
static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}
700 
701 
702 static int
703 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
704 {
705 	z_stream zs;
706 	char *inflate_buffer;
707 	const struct firmware *fw;
708 	const mcp_gen_header_t *hdr;
709 	unsigned hdr_offset;
710 	int status;
711 	unsigned int i;
712 	char dummy;
713 	size_t fw_len;
714 
715 	fw = firmware_get(sc->fw_name);
716 	if (fw == NULL) {
717 		device_printf(sc->dev, "Could not find firmware image %s\n",
718 			      sc->fw_name);
719 		return ENOENT;
720 	}
721 
722 
723 
724 	/* setup zlib and decompress f/w */
725 	bzero(&zs, sizeof (zs));
726 	zs.zalloc = z_alloc;
727 	zs.zfree = z_free;
728 	status = inflateInit(&zs);
729 	if (status != Z_OK) {
730 		status = EIO;
731 		goto abort_with_fw;
732 	}
733 
734 	/* the uncompressed size is stored as the firmware version,
735 	   which would otherwise go unused */
736 	fw_len = (size_t) fw->version;
737 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
738 	if (inflate_buffer == NULL)
739 		goto abort_with_zs;
740 	zs.avail_in = fw->datasize;
741 	zs.next_in = __DECONST(char *, fw->data);
742 	zs.avail_out = fw_len;
743 	zs.next_out = inflate_buffer;
744 	status = inflate(&zs, Z_FINISH);
745 	if (status != Z_STREAM_END) {
746 		device_printf(sc->dev, "zlib %d\n", status);
747 		status = EIO;
748 		goto abort_with_buffer;
749 	}
750 
751 	/* check id */
752 	hdr_offset = htobe32(*(const uint32_t *)
753 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
754 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
755 		device_printf(sc->dev, "Bad firmware file");
756 		status = EIO;
757 		goto abort_with_buffer;
758 	}
759 	hdr = (const void*)(inflate_buffer + hdr_offset);
760 
761 	status = mxge_validate_firmware(sc, hdr);
762 	if (status != 0)
763 		goto abort_with_buffer;
764 
765 	/* Copy the inflated firmware to NIC SRAM. */
766 	for (i = 0; i < fw_len; i += 256) {
767 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
768 			      inflate_buffer + i,
769 			      min(256U, (unsigned)(fw_len - i)));
770 		wmb();
771 		dummy = *sc->sram;
772 		wmb();
773 	}
774 
775 	*limit = fw_len;
776 	status = 0;
777 abort_with_buffer:
778 	free(inflate_buffer, M_TEMP);
779 abort_with_zs:
780 	inflateEnd(&zs);
781 abort_with_fw:
782 	firmware_put(fw, FIRMWARE_UNLOAD);
783 	return status;
784 }
785 
786 /*
787  * Enable or disable periodic RDMAs from the host to make certain
788  * chipsets resend dropped PCIe messages
789  */
790 
791 static void
792 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
793 {
794 	char buf_bytes[72];
795 	volatile uint32_t *confirm;
796 	volatile char *submit;
797 	uint32_t *buf, dma_low, dma_high;
798 	int i;
799 
800 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
801 
802 	/* clear confirmation addr */
803 	confirm = (volatile uint32_t *)sc->cmd;
804 	*confirm = 0;
805 	wmb();
806 
807 	/* send an rdma command to the PCIe engine, and wait for the
808 	   response in the confirmation address.  The firmware should
809 	   write a -1 there to indicate it is alive and well
810 	*/
811 
812 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
813 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
814 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
815 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
816 	buf[2] = htobe32(0xffffffff);		/* confirm data */
817 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
818 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
819 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
820 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
821 	buf[5] = htobe32(enable);			/* enable? */
822 
823 
824 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
825 
826 	mxge_pio_copy(submit, buf, 64);
827 	wmb();
828 	DELAY(1000);
829 	wmb();
830 	i = 0;
831 	while (*confirm != 0xffffffff && i < 20) {
832 		DELAY(1000);
833 		i++;
834 	}
835 	if (*confirm != 0xffffffff) {
836 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
837 			      (enable ? "enable" : "disable"), confirm,
838 			      *confirm);
839 	}
840 	return;
841 }
842 
843 static int
844 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
845 {
846 	mcp_cmd_t *buf;
847 	char buf_bytes[sizeof(*buf) + 8];
848 	volatile mcp_cmd_response_t *response = sc->cmd;
849 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
850 	uint32_t dma_low, dma_high;
851 	int err, sleep_total = 0;
852 
853 	/* ensure buf is aligned to 8 bytes */
854 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
855 
856 	buf->data0 = htobe32(data->data0);
857 	buf->data1 = htobe32(data->data1);
858 	buf->data2 = htobe32(data->data2);
859 	buf->cmd = htobe32(cmd);
860 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
861 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
862 
863 	buf->response_addr.low = htobe32(dma_low);
864 	buf->response_addr.high = htobe32(dma_high);
865 	mtx_lock(&sc->cmd_mtx);
866 	response->result = 0xffffffff;
867 	wmb();
868 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
869 
870 	/* wait up to 20ms */
871 	err = EAGAIN;
872 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
873 		bus_dmamap_sync(sc->cmd_dma.dmat,
874 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
875 		wmb();
876 		switch (be32toh(response->result)) {
877 		case 0:
878 			data->data0 = be32toh(response->data);
879 			err = 0;
880 			break;
881 		case 0xffffffff:
882 			DELAY(1000);
883 			break;
884 		case MXGEFW_CMD_UNKNOWN:
885 			err = ENOSYS;
886 			break;
887 		case MXGEFW_CMD_ERROR_UNALIGNED:
888 			err = E2BIG;
889 			break;
890 		case MXGEFW_CMD_ERROR_BUSY:
891 			err = EBUSY;
892 			break;
893 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
894 			err = ENXIO;
895 			break;
896 		default:
897 			device_printf(sc->dev,
898 				      "mxge: command %d "
899 				      "failed, result = %d\n",
900 				      cmd, be32toh(response->result));
901 			err = ENXIO;
902 			break;
903 		}
904 		if (err != EAGAIN)
905 			break;
906 	}
907 	if (err == EAGAIN)
908 		device_printf(sc->dev, "mxge: command %d timed out"
909 			      "result = %d\n",
910 			      cmd, be32toh(response->result));
911 	mtx_unlock(&sc->cmd_mtx);
912 	return err;
913 }
914 
915 static int
916 mxge_adopt_running_firmware(mxge_softc_t *sc)
917 {
918 	struct mcp_gen_header *hdr;
919 	const size_t bytes = sizeof (struct mcp_gen_header);
920 	size_t hdr_offset;
921 	int status;
922 
923 	/* find running firmware header */
924 	hdr_offset = htobe32(*(volatile uint32_t *)
925 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
926 
927 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
928 		device_printf(sc->dev,
929 			      "Running firmware has bad header offset (%d)\n",
930 			      (int)hdr_offset);
931 		return EIO;
932 	}
933 
934 	/* copy header of running firmware from SRAM to host memory to
935 	 * validate firmware */
936 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
937 	if (hdr == NULL) {
938 		device_printf(sc->dev, "could not malloc firmware hdr\n");
939 		return ENOMEM;
940 	}
941 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
942 				rman_get_bushandle(sc->mem_res),
943 				hdr_offset, (char *)hdr, bytes);
944 	status = mxge_validate_firmware(sc, hdr);
945 	free(hdr, M_DEVBUF);
946 
947 	/*
948 	 * check to see if adopted firmware has bug where adopting
949 	 * it will cause broadcasts to be filtered unless the NIC
950 	 * is kept in ALLMULTI mode
951 	 */
952 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
953 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
954 		sc->adopted_rx_filter_bug = 1;
955 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
956 			      "working around rx filter bug\n",
957 			      sc->fw_ver_major, sc->fw_ver_minor,
958 			      sc->fw_ver_tiny);
959 	}
960 
961 	return status;
962 }
963 
964 
965 static int
966 mxge_load_firmware(mxge_softc_t *sc, int adopt)
967 {
968 	volatile uint32_t *confirm;
969 	volatile char *submit;
970 	char buf_bytes[72];
971 	uint32_t *buf, size, dma_low, dma_high;
972 	int status, i;
973 
974 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
975 
976 	size = sc->sram_size;
977 	status = mxge_load_firmware_helper(sc, &size);
978 	if (status) {
979 		if (!adopt)
980 			return status;
981 		/* Try to use the currently running firmware, if
982 		   it is new enough */
983 		status = mxge_adopt_running_firmware(sc);
984 		if (status) {
985 			device_printf(sc->dev,
986 				      "failed to adopt running firmware\n");
987 			return status;
988 		}
989 		device_printf(sc->dev,
990 			      "Successfully adopted running firmware\n");
991 		if (sc->tx_boundary == 4096) {
992 			device_printf(sc->dev,
993 				"Using firmware currently running on NIC"
994 				 ".  For optimal\n");
995 			device_printf(sc->dev,
996 				 "performance consider loading optimized "
997 				 "firmware\n");
998 		}
999 		sc->fw_name = mxge_fw_unaligned;
1000 		sc->tx_boundary = 2048;
1001 		return 0;
1002 	}
1003 	/* clear confirmation addr */
1004 	confirm = (volatile uint32_t *)sc->cmd;
1005 	*confirm = 0;
1006 	wmb();
1007 	/* send a reload command to the bootstrap MCP, and wait for the
1008 	   response in the confirmation address.  The firmware should
1009 	   write a -1 there to indicate it is alive and well
1010 	*/
1011 
1012 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1013 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1014 
1015 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1016 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1017 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1018 
1019 	/* FIX: All newest firmware should un-protect the bottom of
1020 	   the sram before handoff. However, the very first interfaces
1021 	   do not. Therefore the handoff copy must skip the first 8 bytes
1022 	*/
1023 					/* where the code starts*/
1024 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1025 	buf[4] = htobe32(size - 8); 	/* length of code */
1026 	buf[5] = htobe32(8);		/* where to copy to */
1027 	buf[6] = htobe32(0);		/* where to jump to */
1028 
1029 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1030 	mxge_pio_copy(submit, buf, 64);
1031 	wmb();
1032 	DELAY(1000);
1033 	wmb();
1034 	i = 0;
1035 	while (*confirm != 0xffffffff && i < 20) {
1036 		DELAY(1000*10);
1037 		i++;
1038 		bus_dmamap_sync(sc->cmd_dma.dmat,
1039 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1040 	}
1041 	if (*confirm != 0xffffffff) {
1042 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1043 			confirm, *confirm);
1044 
1045 		return ENXIO;
1046 	}
1047 	return 0;
1048 }
1049 
1050 static int
1051 mxge_update_mac_address(mxge_softc_t *sc)
1052 {
1053 	mxge_cmd_t cmd;
1054 	uint8_t *addr = sc->mac_addr;
1055 	int status;
1056 
1057 
1058 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1059 		     | (addr[2] << 8) | addr[3]);
1060 
1061 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1062 
1063 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1064 	return status;
1065 }
1066 
1067 static int
1068 mxge_change_pause(mxge_softc_t *sc, int pause)
1069 {
1070 	mxge_cmd_t cmd;
1071 	int status;
1072 
1073 	if (pause)
1074 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1075 				       &cmd);
1076 	else
1077 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1078 				       &cmd);
1079 
1080 	if (status) {
1081 		device_printf(sc->dev, "Failed to set flow control mode\n");
1082 		return ENXIO;
1083 	}
1084 	sc->pause = pause;
1085 	return 0;
1086 }
1087 
1088 static void
1089 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1090 {
1091 	mxge_cmd_t cmd;
1092 	int status;
1093 
1094 	if (mxge_always_promisc)
1095 		promisc = 1;
1096 
1097 	if (promisc)
1098 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1099 				       &cmd);
1100 	else
1101 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1102 				       &cmd);
1103 
1104 	if (status) {
1105 		device_printf(sc->dev, "Failed to set promisc mode\n");
1106 	}
1107 }
1108 
1109 static void
1110 mxge_set_multicast_list(mxge_softc_t *sc)
1111 {
1112 	mxge_cmd_t cmd;
1113 	struct ifmultiaddr *ifma;
1114 	struct ifnet *ifp = sc->ifp;
1115 	int err;
1116 
1117 	/* This firmware is known to not support multicast */
1118 	if (!sc->fw_multicast_support)
1119 		return;
1120 
1121 	/* Disable multicast filtering while we play with the lists*/
1122 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1123 	if (err != 0) {
1124 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1125 		       " error status: %d\n", err);
1126 		return;
1127 	}
1128 
1129 	if (sc->adopted_rx_filter_bug)
1130 		return;
1131 
1132 	if (ifp->if_flags & IFF_ALLMULTI)
1133 		/* request to disable multicast filtering, so quit here */
1134 		return;
1135 
1136 	/* Flush all the filters */
1137 
1138 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1139 	if (err != 0) {
1140 		device_printf(sc->dev,
1141 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1142 			      ", error status: %d\n", err);
1143 		return;
1144 	}
1145 
1146 	/* Walk the multicast list, and add each address */
1147 
1148 	if_maddr_rlock(ifp);
1149 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1150 		if (ifma->ifma_addr->sa_family != AF_LINK)
1151 			continue;
1152 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1153 		      &cmd.data0, 4);
1154 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1155 		      &cmd.data1, 2);
1156 		cmd.data0 = htonl(cmd.data0);
1157 		cmd.data1 = htonl(cmd.data1);
1158 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1159 		if (err != 0) {
1160 			device_printf(sc->dev, "Failed "
1161 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1162 			       "%d\t", err);
1163 			/* abort, leaving multicast filtering off */
1164 			if_maddr_runlock(ifp);
1165 			return;
1166 		}
1167 	}
1168 	if_maddr_runlock(ifp);
1169 	/* Enable multicast filtering */
1170 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1171 	if (err != 0) {
1172 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1173 		       ", error status: %d\n", err);
1174 	}
1175 }
1176 
1177 static int
1178 mxge_max_mtu(mxge_softc_t *sc)
1179 {
1180 	mxge_cmd_t cmd;
1181 	int status;
1182 
1183 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1184 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1185 
1186 	/* try to set nbufs to see if it we can
1187 	   use virtually contiguous jumbos */
1188 	cmd.data0 = 0;
1189 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1190 			       &cmd);
1191 	if (status == 0)
1192 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1193 
1194 	/* otherwise, we're limited to MJUMPAGESIZE */
1195 	return MJUMPAGESIZE - MXGEFW_PAD;
1196 }
1197 
/*
 * Reset the firmware and rebuild the state shared between driver and NIC.
 *
 * In order, this sends MXGEFW_CMD_RESET, sets the interrupt queue size,
 * enables the RSS queues when more than one slice is in use, optionally
 * (interrupts_setup != 0) gives the firmware the DMA address of every
 * slice's rx_done ring, and fetches the SRAM offsets of the interrupt
 * coalescing delay, IRQ ack and IRQ deassert registers.  It then runs the
 * DMA benchmark, zeroes the per-slice ring counters and statistics, and
 * re-applies MAC address, promiscuous, pause, multicast and throttle
 * settings.
 *
 * Returns ENXIO if the reset command itself fails, the failing status if
 * an RSS or interrupt-parameter command fails, and otherwise the status
 * of mxge_update_mac_address().
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	/*
	 * NOTE(review): this status is only looked at (through the "|="
	 * chain further down) when num_slices <= 1; with several slices
	 * it is overwritten by the GET_MAX_RSS_QUEUES call below.
	 */
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			/* start from an empty completion ring */
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	/*
	 * The three offset queries below accumulate their results into
	 * status with "|=".  The pointers derived from cmd.data0 are
	 * stored before status is checked, but none of them is
	 * dereferenced in this function until after that check.
	 */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	/* the value is written to the NIC in big-endian byte order */
	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		/* consecutive slices are two 32-bit words apart */
		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	/*
	 * Only the MAC address update contributes to the return value;
	 * the promisc, pause, multicast and throttle calls that follow
	 * report their own failures but do not change status.
	 */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}
1341 
1342 static int
1343 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1344 {
1345 	mxge_cmd_t cmd;
1346 	mxge_softc_t *sc;
1347 	int err;
1348 	unsigned int throttle;
1349 
1350 	sc = arg1;
1351 	throttle = sc->throttle;
1352 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1353 	if (err != 0) {
1354 		return err;
1355 	}
1356 
1357 	if (throttle == sc->throttle)
1358 		return 0;
1359 
1360 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1361 		return EINVAL;
1362 
1363 	mtx_lock(&sc->driver_mtx);
1364 	cmd.data0 = throttle;
1365 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1366 	if (err == 0)
1367 		sc->throttle = throttle;
1368 	mtx_unlock(&sc->driver_mtx);
1369 	return err;
1370 }
1371 
1372 static int
1373 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1374 {
1375 	mxge_softc_t *sc;
1376 	unsigned int intr_coal_delay;
1377 	int err;
1378 
1379 	sc = arg1;
1380 	intr_coal_delay = sc->intr_coal_delay;
1381 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1382 	if (err != 0) {
1383 		return err;
1384 	}
1385 	if (intr_coal_delay == sc->intr_coal_delay)
1386 		return 0;
1387 
1388 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1389 		return EINVAL;
1390 
1391 	mtx_lock(&sc->driver_mtx);
1392 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1393 	sc->intr_coal_delay = intr_coal_delay;
1394 
1395 	mtx_unlock(&sc->driver_mtx);
1396 	return err;
1397 }
1398 
1399 static int
1400 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1401 {
1402 	mxge_softc_t *sc;
1403 	unsigned int enabled;
1404 	int err;
1405 
1406 	sc = arg1;
1407 	enabled = sc->pause;
1408 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1409 	if (err != 0) {
1410 		return err;
1411 	}
1412 	if (enabled == sc->pause)
1413 		return 0;
1414 
1415 	mtx_lock(&sc->driver_mtx);
1416 	err = mxge_change_pause(sc, enabled);
1417 	mtx_unlock(&sc->driver_mtx);
1418 	return err;
1419 }
1420 
1421 static int
1422 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1423 {
1424 	int err;
1425 
1426 	if (arg1 == NULL)
1427 		return EFAULT;
1428 	arg2 = be32toh(*(int *)arg1);
1429 	arg1 = NULL;
1430 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1431 
1432 	return err;
1433 }
1434 
1435 static void
1436 mxge_rem_sysctls(mxge_softc_t *sc)
1437 {
1438 	struct mxge_slice_state *ss;
1439 	int slice;
1440 
1441 	if (sc->slice_sysctl_tree == NULL)
1442 		return;
1443 
1444 	for (slice = 0; slice < sc->num_slices; slice++) {
1445 		ss = &sc->ss[slice];
1446 		if (ss == NULL || ss->sysctl_tree == NULL)
1447 			continue;
1448 		sysctl_ctx_free(&ss->sysctl_ctx);
1449 		ss->sysctl_tree = NULL;
1450 	}
1451 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1452 	sc->slice_sysctl_tree = NULL;
1453 }
1454 
1455 static void
1456 mxge_add_sysctls(mxge_softc_t *sc)
1457 {
1458 	struct sysctl_ctx_list *ctx;
1459 	struct sysctl_oid_list *children;
1460 	mcp_irq_data_t *fw;
1461 	struct mxge_slice_state *ss;
1462 	int slice;
1463 	char slice_num[8];
1464 
1465 	ctx = device_get_sysctl_ctx(sc->dev);
1466 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1467 	fw = sc->ss[0].fw_stats;
1468 
1469 	/* random information */
1470 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1471 		       "firmware_version",
1472 		       CTLFLAG_RD, sc->fw_version,
1473 		       0, "firmware version");
1474 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1475 		       "serial_number",
1476 		       CTLFLAG_RD, sc->serial_number_string,
1477 		       0, "serial number");
1478 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1479 		       "product_code",
1480 		       CTLFLAG_RD, sc->product_code_string,
1481 		       0, "product_code");
1482 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483 		       "pcie_link_width",
1484 		       CTLFLAG_RD, &sc->link_width,
1485 		       0, "tx_boundary");
1486 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487 		       "tx_boundary",
1488 		       CTLFLAG_RD, &sc->tx_boundary,
1489 		       0, "tx_boundary");
1490 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491 		       "write_combine",
1492 		       CTLFLAG_RD, &sc->wc,
1493 		       0, "write combining PIO?");
1494 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495 		       "read_dma_MBs",
1496 		       CTLFLAG_RD, &sc->read_dma,
1497 		       0, "DMA Read speed in MB/s");
1498 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499 		       "write_dma_MBs",
1500 		       CTLFLAG_RD, &sc->write_dma,
1501 		       0, "DMA Write speed in MB/s");
1502 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1503 		       "read_write_dma_MBs",
1504 		       CTLFLAG_RD, &sc->read_write_dma,
1505 		       0, "DMA concurrent Read/Write speed in MB/s");
1506 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1507 		       "watchdog_resets",
1508 		       CTLFLAG_RD, &sc->watchdog_resets,
1509 		       0, "Number of times NIC was reset");
1510 
1511 
1512 	/* performance related tunables */
1513 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1514 			"intr_coal_delay",
1515 			CTLTYPE_INT|CTLFLAG_RW, sc,
1516 			0, mxge_change_intr_coal,
1517 			"I", "interrupt coalescing delay in usecs");
1518 
1519 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1520 			"throttle",
1521 			CTLTYPE_INT|CTLFLAG_RW, sc,
1522 			0, mxge_change_throttle,
1523 			"I", "transmit throttling");
1524 
1525 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526 			"flow_control_enabled",
1527 			CTLTYPE_INT|CTLFLAG_RW, sc,
1528 			0, mxge_change_flow_control,
1529 			"I", "interrupt coalescing delay in usecs");
1530 
1531 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532 		       "deassert_wait",
1533 		       CTLFLAG_RW, &mxge_deassert_wait,
1534 		       0, "Wait for IRQ line to go low in ihandler");
1535 
1536 	/* stats block from firmware is in network byte order.
1537 	   Need to swap it */
1538 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1539 			"link_up",
1540 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1541 			0, mxge_handle_be32,
1542 			"I", "link up");
1543 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544 			"rdma_tags_available",
1545 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1546 			0, mxge_handle_be32,
1547 			"I", "rdma_tags_available");
1548 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 			"dropped_bad_crc32",
1550 			CTLTYPE_INT|CTLFLAG_RD,
1551 			&fw->dropped_bad_crc32,
1552 			0, mxge_handle_be32,
1553 			"I", "dropped_bad_crc32");
1554 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555 			"dropped_bad_phy",
1556 			CTLTYPE_INT|CTLFLAG_RD,
1557 			&fw->dropped_bad_phy,
1558 			0, mxge_handle_be32,
1559 			"I", "dropped_bad_phy");
1560 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 			"dropped_link_error_or_filtered",
1562 			CTLTYPE_INT|CTLFLAG_RD,
1563 			&fw->dropped_link_error_or_filtered,
1564 			0, mxge_handle_be32,
1565 			"I", "dropped_link_error_or_filtered");
1566 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567 			"dropped_link_overflow",
1568 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1569 			0, mxge_handle_be32,
1570 			"I", "dropped_link_overflow");
1571 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572 			"dropped_multicast_filtered",
1573 			CTLTYPE_INT|CTLFLAG_RD,
1574 			&fw->dropped_multicast_filtered,
1575 			0, mxge_handle_be32,
1576 			"I", "dropped_multicast_filtered");
1577 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578 			"dropped_no_big_buffer",
1579 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1580 			0, mxge_handle_be32,
1581 			"I", "dropped_no_big_buffer");
1582 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 			"dropped_no_small_buffer",
1584 			CTLTYPE_INT|CTLFLAG_RD,
1585 			&fw->dropped_no_small_buffer,
1586 			0, mxge_handle_be32,
1587 			"I", "dropped_no_small_buffer");
1588 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589 			"dropped_overrun",
1590 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1591 			0, mxge_handle_be32,
1592 			"I", "dropped_overrun");
1593 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594 			"dropped_pause",
1595 			CTLTYPE_INT|CTLFLAG_RD,
1596 			&fw->dropped_pause,
1597 			0, mxge_handle_be32,
1598 			"I", "dropped_pause");
1599 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1600 			"dropped_runt",
1601 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1602 			0, mxge_handle_be32,
1603 			"I", "dropped_runt");
1604 
1605 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1606 			"dropped_unicast_filtered",
1607 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1608 			0, mxge_handle_be32,
1609 			"I", "dropped_unicast_filtered");
1610 
1611 	/* verbose printing? */
1612 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1613 		       "verbose",
1614 		       CTLFLAG_RW, &mxge_verbose,
1615 		       0, "verbose printing");
1616 
1617 	/* add counters exported for debugging from all slices */
1618 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1619 	sc->slice_sysctl_tree =
1620 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1621 				"slice", CTLFLAG_RD, 0, "");
1622 
1623 	for (slice = 0; slice < sc->num_slices; slice++) {
1624 		ss = &sc->ss[slice];
1625 		sysctl_ctx_init(&ss->sysctl_ctx);
1626 		ctx = &ss->sysctl_ctx;
1627 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1628 		sprintf(slice_num, "%d", slice);
1629 		ss->sysctl_tree =
1630 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1631 					CTLFLAG_RD, 0, "");
1632 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1633 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1634 			       "rx_small_cnt",
1635 			       CTLFLAG_RD, &ss->rx_small.cnt,
1636 			       0, "rx_small_cnt");
1637 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1638 			       "rx_big_cnt",
1639 			       CTLFLAG_RD, &ss->rx_big.cnt,
1640 			       0, "rx_small_cnt");
1641 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1642 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1643 			       0, "number of lro merge queues flushed");
1644 
1645 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1646 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1647 			       0, "number of bad csums preventing LRO");
1648 
1649 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1650 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1651 			       0, "number of frames appended to lro merge"
1652 			       "queues");
1653 
1654 #ifndef IFNET_BUF_RING
1655 		/* only transmit from slice 0 for now */
1656 		if (slice > 0)
1657 			continue;
1658 #endif
1659 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1660 			       "tx_req",
1661 			       CTLFLAG_RD, &ss->tx.req,
1662 			       0, "tx_req");
1663 
1664 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1665 			       "tx_done",
1666 			       CTLFLAG_RD, &ss->tx.done,
1667 			       0, "tx_done");
1668 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1669 			       "tx_pkt_done",
1670 			       CTLFLAG_RD, &ss->tx.pkt_done,
1671 			       0, "tx_done");
1672 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1673 			       "tx_stall",
1674 			       CTLFLAG_RD, &ss->tx.stall,
1675 			       0, "tx_stall");
1676 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677 			       "tx_wake",
1678 			       CTLFLAG_RD, &ss->tx.wake,
1679 			       0, "tx_wake");
1680 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1681 			       "tx_defrag",
1682 			       CTLFLAG_RD, &ss->tx.defrag,
1683 			       0, "tx_defrag");
1684 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685 			       "tx_queue_active",
1686 			       CTLFLAG_RD, &ss->tx.queue_active,
1687 			       0, "tx_queue_active");
1688 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1689 			       "tx_activate",
1690 			       CTLFLAG_RD, &ss->tx.activate,
1691 			       0, "tx_activate");
1692 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1693 			       "tx_deactivate",
1694 			       CTLFLAG_RD, &ss->tx.deactivate,
1695 			       0, "tx_deactivate");
1696 	}
1697 }
1698 
1699 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1700    backwards one at a time and handle ring wraps */
1701 
1702 static inline void
1703 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1704 			    mcp_kreq_ether_send_t *src, int cnt)
1705 {
1706 	int idx, starting_slot;
1707 	starting_slot = tx->req;
1708 	while (cnt > 1) {
1709 		cnt--;
1710 		idx = (starting_slot + cnt) & tx->mask;
1711 		mxge_pio_copy(&tx->lanai[idx],
1712 			      &src[cnt], sizeof(*src));
1713 		wmb();
1714 	}
1715 }
1716 
/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 *
 * tx  - transmit ring; tx->req is advanced by cnt on return
 * src - array of cnt requests; src->flags is cleared while the chain
 *       is copied and restored before the function returns
 * cnt - number of requests in src
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	/* hide the first request's flags until the whole chain is written */
	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy forwards, two requests per PIO copy */
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/*
		 * submit the first request (wrap path, where srcp/dstp
		 * still point at the start), or the odd request left
		 * over by the pairwise loop above
		 */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	/* index 3 selects the fourth 32-bit word of the first request */
	src_ints = (uint32_t *)src;
	src_ints+=3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints+=3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	wmb();
}
1772 
/*
 * Locate the IP header (and, for TSO, the TCP header) of an outgoing
 * frame and record the result in *pi.
 *
 * ss - slice state; ss->scratch is used as a bounce buffer whenever the
 *      needed headers are not fully contained in the first mbuf
 * m  - frame to parse, starting at the Ethernet header
 * pi - filled in with ip_off, ip_hlen, ip or ip6, and tcp (tcp only
 *      when the mbuf has CSUM_TSO set)
 *
 * Returns 0 on success.  EINVAL is returned for an ethertype with no
 * case below and, for IPv6, when the last header is neither TCP nor
 * UDP or when a TSO frame's headers exceed sc->max_tso6_hlen.
 */
static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	/* the IP header follows either a plain or a VLAN Ethernet header */
	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		/* ip_hl counts 32-bit words */
		pi->ip_hlen = pi->ip->ip_hl << 2;
		if (!tso)
			return 0;

		/* TSO also needs the TCP header to be contiguous */
		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		nxt = 0;
		/*
		 * ip6_lasthdr() returns an offset from the start of the
		 * mbuf; subtracting ip_off turns it into a header length
		 */
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}
1852 
1853 #if IFCAP_TSO4
1854 
/*
 * Build and submit the send requests for a TSO frame.
 *
 * ss             - slice whose tx ring is used
 * m              - the frame; freed here on the drop path
 * busdma_seg_cnt - number of valid entries in tx->seg_list
 * pi             - header layout as recorded by mxge_parse_tx()
 *
 * Every busdma segment is cut into pieces at the end of the headers and
 * at each mss boundary of the payload; one request is written to
 * tx->req_list per piece.  If more than tx->max_desc requests would be
 * needed the DMA map is unloaded, the mbuf is freed and ss->oerrors is
 * bumped; a diagnostic is printed the first time only.
 */
static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	/* th_off counts 32-bit words */
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));

	/* TSO implies checksum offload on this hardware */
	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
		/*
		 * If packet has full TCP csum, replace it with pseudo hdr
		 * sum that the NIC expects, otherwise the NIC will emit
		 * packets with bad TCP checksums.
		 */
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		/*
		 * NOTE(review): sum is only assigned inside the
		 * conditionally compiled branches below; if the matching
		 * branch is compiled out, m_copyback() writes an
		 * uninitialized value.
		 */
		if (pi->ip6) {
#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
			sum = in6_cksum_pseudo(pi->ip6,
			    m->m_pkthdr.len - cksum_offset,
			    IPPROTO_TCP, 0);
#endif
		} else {
#ifdef INET
			m->m_pkthdr.csum_flags |= CSUM_TCP;
			sum = in_pseudo(pi->ip->ip_src.s_addr,
			    pi->ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
				    cksum_offset)));
#endif
		}
		m_copyback(m, offsetof(struct tcphdr, th_sum) +
		    cksum_offset, sizeof(sum), (caddr_t)&sum);
	}
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	if (pi->ip6) {
		/*
		 * for IPv6 TSO, the "checksum offset" is re-purposed
		 * to store the TCP header len
		 */
		cksum_offset = (pi->tcp->th_off << 2);
	}

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* back-patch the request that started this run */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/*
				 * branch-free update: rdma_count becomes
				 * -1 when this piece is chopped or ends a
				 * segment, then 0 if it was chopped
				 * without ending exactly on a boundary
				 */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				/* this piece covers only the rest of the headers */
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/*
			 * for IPv4 the remaining checksum offset shrinks
			 * by each piece already emitted; for IPv6 the
			 * field holds the TCP header length and is kept
			 */
			if (cksum_offset != 0 && !pi->ip6) {
				if (__predict_false(cksum_offset > seglen))
					cksum_offset -= seglen;
				else
					cksum_offset = 0;
			}
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req-rdma_count)->rdma_count = rdma_count;

	/*
	 * walk backwards flagging requests as TSO_LAST, stopping after
	 * the first one that carries TSO_CHOP or FIRST
	 */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	/* mark the ring slot of the final request of this frame */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	/* too many descriptors: release the DMA mapping and the frame */
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	ss->oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}
2038 
2039 #endif /* IFCAP_TSO4 */
2040 
2041 #ifdef MXGE_NEW_VLAN_API
2042 /*
2043  * We reproduce the software vlan tag insertion from
2044  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2045  * vlan tag insertion. We need to advertise this in order to have the
2046  * vlan interface respect our csum offload flags.
2047  */
2048 static struct mbuf *
2049 mxge_vlan_tag_insert(struct mbuf *m)
2050 {
2051 	struct ether_vlan_header *evl;
2052 
2053 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2054 	if (__predict_false(m == NULL))
2055 		return NULL;
2056 	if (m->m_len < sizeof(*evl)) {
2057 		m = m_pullup(m, sizeof(*evl));
2058 		if (__predict_false(m == NULL))
2059 			return NULL;
2060 	}
2061 	/*
2062 	 * Transform the Ethernet header into an Ethernet header
2063 	 * with 802.1Q encapsulation.
2064 	 */
2065 	evl = mtod(m, struct ether_vlan_header *);
2066 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2067 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2068 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2069 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2070 	m->m_flags &= ~M_VLANTAG;
2071 	return m;
2072 }
2073 #endif /* MXGE_NEW_VLAN_API */
2074 
/*
 * Map one outbound frame for DMA, translate its segments into firmware
 * send descriptors in tx->req_list and submit them to the NIC.
 *
 * ss: slice whose transmit ring (ss->tx) is used.
 * m:  frame to send.  The mbuf is consumed on every path: on success it
 *     is recorded in tx->info[idx].m, on failure it is freed here (or
 *     was already lost inside mxge_vlan_tag_insert()) and ss->oerrors
 *     is incremented.
 *
 * Frames flagged CSUM_TSO are handed to mxge_encap_tso() once they have
 * been DMA-mapped.
 */
static void
mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
{
	struct mxge_pkt_info pi = {0,0,0,0};
	mxge_softc_t *sc;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
	uint8_t flags, cksum_offset;


	sc = ss->sc;
	/* NOTE(review): ifp is assigned but not used again in this function */
	ifp = sc->ifp;
	tx = &ss->tx;

#ifdef MXGE_NEW_VLAN_API
	/* insert the vlan tag in software; m is gone if this fails */
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop_without_m;
	}
#endif
	/* header offsets are only needed when some offload was requested */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		if (mxge_parse_tx(ss, m, &pi))
			goto drop;
	}

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			/* m is still the original chain here and is freed at drop */
			goto drop;
		}
		ss->tx.defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	/* the mbuf hangs off the first descriptor slot; mxge_tx_done()
	   frees it from there */
	tx->info[idx].m = m;

#if IFCAP_TSO4
	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(ss, m, cnt, &pi);
		return;
	}
#endif

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		/* checksumming starts right after the IP header that
		   mxge_parse_tx() located; csum_data gives the offset of
		   the checksum field from that point */
		cksum_offset = pi.ip_off + pi.ip_hlen;
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		/* NOTE(review): redundant -- the loop below stores
		   cksum_offset into every descriptor, including this one */
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* the remaining offset shrinks by each segment consumed and
		   is clamped at zero once the checksum start is passed */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		/* ALIGN_ODD is set when this segment starts at an odd
		   byte offset within the frame (checksum offload only) */
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		/* pre-clear the next slot so the |= above (or in the runt
		   padding below) starts from zero */
		req->flags = 0;
	}
	/* step back to the last descriptor that was filled in */
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		/* append one descriptor pointing at the zero pad buffer */
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	/* the first descriptor carries the total descriptor count */
	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the slot of the frame's last descriptor; mxge_tx_done()
	   counts a completed packet each time it meets a flagged slot */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	m_freem(m);
drop_without_m:
	/* reached directly when m has already been lost */
	ss->oerrors++;
	return;
}
2240 
2241 #ifdef IFNET_BUF_RING
2242 static void
2243 mxge_qflush(struct ifnet *ifp)
2244 {
2245 	mxge_softc_t *sc = ifp->if_softc;
2246 	mxge_tx_ring_t *tx;
2247 	struct mbuf *m;
2248 	int slice;
2249 
2250 	for (slice = 0; slice < sc->num_slices; slice++) {
2251 		tx = &sc->ss[slice].tx;
2252 		mtx_lock(&tx->mtx);
2253 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2254 			m_freem(m);
2255 		mtx_unlock(&tx->mtx);
2256 	}
2257 	if_qflush(ifp);
2258 }
2259 
2260 static inline void
2261 mxge_start_locked(struct mxge_slice_state *ss)
2262 {
2263 	mxge_softc_t *sc;
2264 	struct mbuf *m;
2265 	struct ifnet *ifp;
2266 	mxge_tx_ring_t *tx;
2267 
2268 	sc = ss->sc;
2269 	ifp = sc->ifp;
2270 	tx = &ss->tx;
2271 
2272 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2273 		m = drbr_dequeue(ifp, tx->br);
2274 		if (m == NULL) {
2275 			return;
2276 		}
2277 		/* let BPF see it */
2278 		BPF_MTAP(ifp, m);
2279 
2280 		/* give it to the nic */
2281 		mxge_encap(ss, m);
2282 	}
2283 	/* ran out of transmit slots */
2284 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2285 	    && (!drbr_empty(ifp, tx->br))) {
2286 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2287 		tx->stall++;
2288 	}
2289 }
2290 
2291 static int
2292 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2293 {
2294 	mxge_softc_t *sc;
2295 	struct ifnet *ifp;
2296 	mxge_tx_ring_t *tx;
2297 	int err;
2298 
2299 	sc = ss->sc;
2300 	ifp = sc->ifp;
2301 	tx = &ss->tx;
2302 
2303 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2304 	    IFF_DRV_RUNNING) {
2305 		err = drbr_enqueue(ifp, tx->br, m);
2306 		return (err);
2307 	}
2308 
2309 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2310 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2311 		/* let BPF see it */
2312 		BPF_MTAP(ifp, m);
2313 		/* give it to the nic */
2314 		mxge_encap(ss, m);
2315 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2316 		return (err);
2317 	}
2318 	if (!drbr_empty(ifp, tx->br))
2319 		mxge_start_locked(ss);
2320 	return (0);
2321 }
2322 
2323 static int
2324 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2325 {
2326 	mxge_softc_t *sc = ifp->if_softc;
2327 	struct mxge_slice_state *ss;
2328 	mxge_tx_ring_t *tx;
2329 	int err = 0;
2330 	int slice;
2331 
2332 	slice = m->m_pkthdr.flowid;
2333 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2334 
2335 	ss = &sc->ss[slice];
2336 	tx = &ss->tx;
2337 
2338 	if (mtx_trylock(&tx->mtx)) {
2339 		err = mxge_transmit_locked(ss, m);
2340 		mtx_unlock(&tx->mtx);
2341 	} else {
2342 		err = drbr_enqueue(ifp, tx->br, m);
2343 	}
2344 
2345 	return (err);
2346 }
2347 
2348 #else
2349 
2350 static inline void
2351 mxge_start_locked(struct mxge_slice_state *ss)
2352 {
2353 	mxge_softc_t *sc;
2354 	struct mbuf *m;
2355 	struct ifnet *ifp;
2356 	mxge_tx_ring_t *tx;
2357 
2358 	sc = ss->sc;
2359 	ifp = sc->ifp;
2360 	tx = &ss->tx;
2361 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2362 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2363 		if (m == NULL) {
2364 			return;
2365 		}
2366 		/* let BPF see it */
2367 		BPF_MTAP(ifp, m);
2368 
2369 		/* give it to the nic */
2370 		mxge_encap(ss, m);
2371 	}
2372 	/* ran out of transmit slots */
2373 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2374 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2375 		tx->stall++;
2376 	}
2377 }
2378 #endif
2379 static void
2380 mxge_start(struct ifnet *ifp)
2381 {
2382 	mxge_softc_t *sc = ifp->if_softc;
2383 	struct mxge_slice_state *ss;
2384 
2385 	/* only use the first slice for now */
2386 	ss = &sc->ss[0];
2387 	mtx_lock(&ss->tx.mtx);
2388 	mxge_start_locked(ss);
2389 	mtx_unlock(&ss->tx.mtx);
2390 }
2391 
/*
 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 * in a burst
 *
 * dst: first of eight receive descriptors in NIC memory.
 * src: first of the eight matching host shadow descriptors.  src[0]'s
 *      addr_low is overwritten with 0xffffffff for the duration of the
 *      copy and restored before returning.
 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* stash the real address and substitute the placeholder so the
	   first descriptor is not yet valid when it lands on the NIC */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* descriptors 0-3, then 4-7, each followed by a write barrier */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the shadow copy, then publish the real address to the
	   NIC as the very last write */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2415 
2416 static int
2417 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2418 {
2419 	bus_dma_segment_t seg;
2420 	struct mbuf *m;
2421 	mxge_rx_ring_t *rx = &ss->rx_small;
2422 	int cnt, err;
2423 
2424 	m = m_gethdr(M_NOWAIT, MT_DATA);
2425 	if (m == NULL) {
2426 		rx->alloc_fail++;
2427 		err = ENOBUFS;
2428 		goto done;
2429 	}
2430 	m->m_len = MHLEN;
2431 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2432 				      &seg, &cnt, BUS_DMA_NOWAIT);
2433 	if (err != 0) {
2434 		m_free(m);
2435 		goto done;
2436 	}
2437 	rx->info[idx].m = m;
2438 	rx->shadow[idx].addr_low =
2439 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2440 	rx->shadow[idx].addr_high =
2441 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2442 
2443 done:
2444 	if ((idx & 7) == 7)
2445 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2446 	return err;
2447 }
2448 
2449 static int
2450 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2451 {
2452 	bus_dma_segment_t seg[3];
2453 	struct mbuf *m;
2454 	mxge_rx_ring_t *rx = &ss->rx_big;
2455 	int cnt, err, i;
2456 
2457 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2458 	if (m == NULL) {
2459 		rx->alloc_fail++;
2460 		err = ENOBUFS;
2461 		goto done;
2462 	}
2463 	m->m_len = rx->mlen;
2464 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2465 				      seg, &cnt, BUS_DMA_NOWAIT);
2466 	if (err != 0) {
2467 		m_free(m);
2468 		goto done;
2469 	}
2470 	rx->info[idx].m = m;
2471 	rx->shadow[idx].addr_low =
2472 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2473 	rx->shadow[idx].addr_high =
2474 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2475 
2476 #if MXGE_VIRT_JUMBOS
2477 	for (i = 1; i < cnt; i++) {
2478 		rx->shadow[idx + i].addr_low =
2479 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2480 		rx->shadow[idx + i].addr_high =
2481 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2482        }
2483 #endif
2484 
2485 done:
2486        for (i = 0; i < rx->nbufs; i++) {
2487 		if ((idx & 7) == 7) {
2488 			mxge_submit_8rx(&rx->lanai[idx - 7],
2489 					&rx->shadow[idx - 7]);
2490 		}
2491 		idx++;
2492 	}
2493 	return err;
2494 }
2495 
2496 #ifdef INET6
2497 
/*
 * 16-bit one's-complement sum of the buffer at raw.
 *
 * raw: buffer, read as 16-bit words exactly as they sit in memory.
 * len: length in bytes; one word is consumed per two bytes, so an odd
 *      length still reads a whole trailing word.  A length <= 0 sums
 *      nothing.
 *
 * Returns the folded (not inverted) sum.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t acc = 0;
	int pass;

	for (; len > 0; len -= 2)
		acc += *raw++;

	/* fold the carries back in; the second pass absorbs a carry
	   produced by the first */
	for (pass = 0; pass < 2; pass++)
		acc = (acc >> 16) + (acc & 0xffff);

	return (uint16_t)acc;
}
2514 
2515 static inline uint16_t
2516 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2517 {
2518 	uint32_t partial;
2519 	int nxt, cksum_offset;
2520 	struct ip6_hdr *ip6 = p;
2521 	uint16_t c;
2522 
2523 	nxt = ip6->ip6_nxt;
2524 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2525 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2526 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2527 					   IPPROTO_IPV6, &nxt);
2528 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2529 			return (1);
2530 	}
2531 
2532 	/*
2533 	 * IPv6 headers do not contain a checksum, and hence
2534 	 * do not checksum to zero, so they don't "fall out"
2535 	 * of the partial checksum calculation like IPv4
2536 	 * headers do.  We need to fix the partial checksum by
2537 	 * subtracting the checksum of the IPv6 header.
2538 	 */
2539 
2540 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2541 				    ETHER_HDR_LEN);
2542 	csum += ~partial;
2543 	csum +=	 (csum < ~partial);
2544 	csum = (csum >> 16) + (csum & 0xFFFF);
2545 	csum = (csum >> 16) + (csum & 0xFFFF);
2546 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2547 			     csum);
2548 	c ^= 0xffff;
2549 	return (c);
2550 }
2551 #endif /* INET6 */
2552 /*
2553  *  Myri10GE hardware checksums are not valid if the sender
2554  *  padded the frame with non-zero padding.  This is because
2555  *  the firmware just does a simple 16-bit 1s complement
2556  *  checksum across the entire frame, excluding the first 14
2557  *  bytes.  It is best to simply to check the checksum and
2558  *  tell the stack about it only if the checksum is good
2559  */
2560 
2561 static inline uint16_t
2562 mxge_rx_csum(struct mbuf *m, int csum)
2563 {
2564 	struct ether_header *eh;
2565 #ifdef INET
2566 	struct ip *ip;
2567 #endif
2568 #if defined(INET) || defined(INET6)
2569 	int cap = m->m_pkthdr.rcvif->if_capenable;
2570 #endif
2571 	uint16_t c, etype;
2572 
2573 
2574 	eh = mtod(m, struct ether_header *);
2575 	etype = ntohs(eh->ether_type);
2576 	switch (etype) {
2577 #ifdef INET
2578 	case ETHERTYPE_IP:
2579 		if ((cap & IFCAP_RXCSUM) == 0)
2580 			return (1);
2581 		ip = (struct ip *)(eh + 1);
2582 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2583 			return (1);
2584 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2585 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2586 				    (ip->ip_hl << 2) + ip->ip_p));
2587 		c ^= 0xffff;
2588 		break;
2589 #endif
2590 #ifdef INET6
2591 	case ETHERTYPE_IPV6:
2592 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2593 			return (1);
2594 		c = mxge_rx_csum6((eh + 1), m, csum);
2595 		break;
2596 #endif
2597 	default:
2598 		c = 1;
2599 	}
2600 	return (c);
2601 }
2602 
/*
 * Strip the 802.1Q header from a received frame, record the tag on the
 * mbuf and correct the firmware's partial checksum accordingly.
 *
 * m:    received frame whose data starts at the Ethernet header.
 * csum: in/out partial checksum, network byte order on entry and exit.
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	/* NOTE(review): eh is assigned but never read in this function */
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	/* the 32-bit word right after the plain Ethernet header, which must
	   be read before the header is shifted below */
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* one's-complement subtraction: add the inverse, wrap the carry,
	   then fold twice down to 16 bits */
	(*csum) += ~partial;
	(*csum) +=  ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
#ifdef MXGE_NEW_VLAN_API
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
#else
	{
		struct m_tag *mtag;
		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
				   M_NOWAIT);
		/* NOTE(review): returning here leaves the 802.1Q header in
		   the frame although *csum has already been adjusted */
		if (mtag == NULL)
			return;
		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
		m_tag_prepend(m, mtag);
	}

#endif
	m->m_flags |= M_VLANTAG;

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf.  The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2658 
2659 
/*
 * Complete reception of one frame from the big-buffer ring: replace the
 * filled mbuf with a fresh one, finish the DMA on the old one, fix up
 * the mbuf header and pass the frame to LRO or the stack.
 *
 * len:  frame length reported by the NIC.
 * csum: partial checksum reported by the NIC.
 * lro:  non-zero when LRO is enabled on the interface.
 *
 * If no replacement buffer can be obtained the frame is dropped and
 * counted as an input error.
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
		 uint32_t csum, int lro)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_big;
	idx = rx->cnt & rx->mask;
	/* a big buffer occupies rx->nbufs ring slots */
	rx->cnt += rx->nbufs;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	/* (the replacement was loaded into extra_map, so that map now
	   belongs to the slot and the old one becomes the spare) */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	/* strip the vlan tag in software; this also corrects csum */
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		/* the flowid is simply the index of this slice */
		m->m_pkthdr.flowid = (ss - sc->ss);
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
	    (0 == mxge_rx_csum(m, csum))) {
		/* Tell the stack that the  checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
			CSUM_DATA_VALID;

#if defined(INET) || defined (INET6)
		/* LRO is only attempted on checksum-verified frames.
		   NOTE(review): 0 is passed as the third argument here
		   while mxge_rx_done_small() passes csum -- confirm which
		   is intended */
		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
			return;
#endif
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2727 
/*
 * Complete reception of one frame from the small-buffer ring: replace
 * the filled mbuf with a fresh one, finish the DMA on the old one, fix
 * up the mbuf header and pass the frame to LRO or the stack.
 *
 * len:  frame length reported by the NIC.
 * csum: partial checksum reported by the NIC.
 * lro:  non-zero when LRO is enabled on the interface.
 *
 * If no replacement buffer can be obtained the frame is dropped and
 * counted as an input error.
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
		   uint32_t csum, int lro)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct ether_header *eh;
	struct mbuf *m;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_small;
	idx = rx->cnt & rx->mask;
	/* a small buffer occupies exactly one ring slot */
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	/* (the replacement was loaded into extra_map, so that map now
	   belongs to the slot and the old one becomes the spare) */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	/* strip the vlan tag in software; this also corrects csum */
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		/* the flowid is simply the index of this slice */
		m->m_pkthdr.flowid = (ss - sc->ss);
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
	    (0 == mxge_rx_csum(m, csum))) {
		/* Tell the stack that the  checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
			CSUM_DATA_VALID;

#if defined(INET) || defined (INET6)
		/* LRO is only attempted on checksum-verified frames.
		   NOTE(review): csum is passed as the third argument here
		   while mxge_rx_done_big() passes 0 -- confirm which is
		   intended */
		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
			return;
#endif
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2795 
2796 static inline void
2797 mxge_clean_rx_done(struct mxge_slice_state *ss)
2798 {
2799 	mxge_rx_done_t *rx_done = &ss->rx_done;
2800 	int limit = 0;
2801 	uint16_t length;
2802 	uint16_t checksum;
2803 	int lro;
2804 
2805 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806 	while (rx_done->entry[rx_done->idx].length != 0) {
2807 		length = ntohs(rx_done->entry[rx_done->idx].length);
2808 		rx_done->entry[rx_done->idx].length = 0;
2809 		checksum = rx_done->entry[rx_done->idx].checksum;
2810 		if (length <= (MHLEN - MXGEFW_PAD))
2811 			mxge_rx_done_small(ss, length, checksum, lro);
2812 		else
2813 			mxge_rx_done_big(ss, length, checksum, lro);
2814 		rx_done->cnt++;
2815 		rx_done->idx = rx_done->cnt & rx_done->mask;
2816 
2817 		/* limit potential for livelock */
2818 		if (__predict_false(++limit > rx_done->mask / 2))
2819 			break;
2820 	}
2821 #if defined(INET)  || defined (INET6)
2822 	tcp_lro_flush_all(&ss->lc);
2823 #endif
2824 }
2825 
2826 
/*
 * Reclaim transmit descriptors up to the firmware's completed-packet
 * count, freeing the mbufs and DMA mappings attached to them, then
 * restart transmission if the ring had stalled.
 *
 * mcp_idx: packet completion count reported by the firmware; slots are
 *          reclaimed until tx->pkt_done equals it.
 */
static inline void
mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
{
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx;
	int *flags;

	tx = &ss->tx;
	ifp = ss->sc->ifp;
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			/* per-slice output statistics */
			ss->obytes += m->m_pkthdr.len;
			if (m->m_flags & M_MCAST)
				ss->omcasts++;
			ss->opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		/* the flag marks the last descriptor of a frame (set at
		   encap time), so each one seen is one completed packet */
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
	   its OK to send packets */
#ifdef IFNET_BUF_RING
	/* with buf_ring the stall flag is tracked per slice */
	flags = &ss->if_drv_flags;
#else
	flags = &ifp->if_drv_flags;
#endif
	mtx_lock(&ss->tx.mtx);
	/* wake up only once fewer than a quarter of the ring is in use */
	if ((*flags) & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		*(flags) &= ~IFF_DRV_OACTIVE;
		ss->tx.wake++;
		mxge_start_locked(ss);
	}
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
		/* let the NIC stop polling this queue, since there
		 * are no more transmits pending */
		/* NOTE(review): this inner test repeats the condition
		   already checked by the enclosing if */
		if (tx->req == tx->done) {
			*tx->send_stop = 1;
			tx->queue_active = 0;
			tx->deactivate++;
			wmb();
		}
	}
#endif
	mtx_unlock(&ss->tx.mtx);

}
2890 
/*
 * Media tables consulted by mxge_media_probe().  Each entry is
 * { flag, bitmask, name }: the ifmedia type to select, the value looked
 * for in the byte read from the module, and a printable name.
 *
 * Entry 0 of each table is compared for equality against the whole
 * byte; the remaining entries are tested in order with a bitwise AND
 * and the first match wins.  A flag of 0 means no ifmedia type is
 * given for that entry.
 */
static struct mxge_media_type mxge_xfp_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
/* SFP+ variant; entry 0 matches a byte that is entirely zero */
static struct mxge_media_type mxge_sfp_media_types[] =
{
	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
	{0,		(1 << 7),	"Reserved"},
	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
};
2912 
2913 static void
2914 mxge_media_set(mxge_softc_t *sc, int media_type)
2915 {
2916 
2917 
2918 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2919 		    0, NULL);
2920 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2921 	sc->current_media = media_type;
2922 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2923 }
2924 
2925 static void
2926 mxge_media_init(mxge_softc_t *sc)
2927 {
2928 	char *ptr;
2929 	int i;
2930 
2931 	ifmedia_removeall(&sc->media);
2932 	mxge_media_set(sc, IFM_AUTO);
2933 
2934 	/*
2935 	 * parse the product code to deterimine the interface type
2936 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2937 	 * after the 3rd dash in the driver's cached copy of the
2938 	 * EEPROM's product code string.
2939 	 */
2940 	ptr = sc->product_code_string;
2941 	if (ptr == NULL) {
2942 		device_printf(sc->dev, "Missing product code\n");
2943 		return;
2944 	}
2945 
2946 	for (i = 0; i < 3; i++, ptr++) {
2947 		ptr = strchr(ptr, '-');
2948 		if (ptr == NULL) {
2949 			device_printf(sc->dev,
2950 				      "only %d dashes in PC?!?\n", i);
2951 			return;
2952 		}
2953 	}
2954 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2955 		/* -C is CX4 */
2956 		sc->connector = MXGE_CX4;
2957 		mxge_media_set(sc, IFM_10G_CX4);
2958 	} else if (*ptr == 'Q') {
2959 		/* -Q is Quad Ribbon Fiber */
2960 		sc->connector = MXGE_QRF;
2961 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2962 		/* FreeBSD has no media type for Quad ribbon fiber */
2963 	} else if (*ptr == 'R') {
2964 		/* -R is XFP */
2965 		sc->connector = MXGE_XFP;
2966 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2967 		/* -S or -2S is SFP+ */
2968 		sc->connector = MXGE_SFP;
2969 	} else {
2970 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2971 	}
2972 }
2973 
/*
 * Determine the media type for a NIC.  Some XFPs will identify
 * themselves only when their link is up, so this is initiated via a
 * link up interrupt.  However, this can potentially take up to
 * several milliseconds, so it is run via the watchdog routine, rather
 * than in the interrupt handler itself.
 *
 * Only XFP and SFP+ connectors are probed; for any other connector the
 * function returns without doing anything beyond clearing
 * sc->need_media_probe.
 */
static void
mxge_media_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *cage_type;

	struct mxge_media_type *mxge_media_types = NULL;
	int i, err, ms, mxge_media_type_entries;
	uint32_t byte;

	sc->need_media_probe = 0;

	/* pick the lookup table and the module byte to read */
	if (sc->connector == MXGE_XFP) {
		/* -R is XFP */
		mxge_media_types = mxge_xfp_media_types;
		mxge_media_type_entries =
			nitems(mxge_xfp_media_types);
		byte = MXGE_XFP_COMPLIANCE_BYTE;
		cage_type = "XFP";
	} else 	if (sc->connector == MXGE_SFP) {
		/* -S or -2S is SFP+ */
		mxge_media_types = mxge_sfp_media_types;
		mxge_media_type_entries =
			nitems(mxge_sfp_media_types);
		cage_type = "SFP+";
		byte = 3;
	} else {
		/* nothing to do; media type cannot change */
		return;
	}

	/*
	 * At this point we know the NIC has an XFP or SFP+ cage, so now
	 * we try to determine what is in the cage by using the
	 * firmware's I2C commands to read the module's 10GbE compliance
	 * register.  We read just one byte, which may take over
	 * a millisecond
	 */

	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
	cmd.data1 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
		device_printf(sc->dev, "failed to read XFP\n");
	}
	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
	}
	if (err != MXGEFW_CMD_OK) {
		return;
	}

	/* now we wait for the data to be cached */
	/* (poll once per millisecond, giving up after 50 retries) */
	cmd.data0 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
		DELAY(1000);
		cmd.data0 = byte;
		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	}
	if (err != MXGEFW_CMD_OK) {
		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
			      cage_type, err, ms);
		return;
	}

	/* the first table entry must match the whole byte exactly */
	if (cmd.data0 == mxge_media_types[0].bitmask) {
		if (mxge_verbose)
			device_printf(sc->dev, "%s:%s\n", cage_type,
				      mxge_media_types[0].name);
		/* rebuild the media list only when the type changed */
		if (sc->current_media != mxge_media_types[0].flag) {
			mxge_media_init(sc);
			mxge_media_set(sc, mxge_media_types[0].flag);
		}
		return;
	}
	/* the remaining entries are single-bit tests; first hit wins */
	for (i = 1; i < mxge_media_type_entries; i++) {
		if (cmd.data0 & mxge_media_types[i].bitmask) {
			if (mxge_verbose)
				device_printf(sc->dev, "%s:%s\n",
					      cage_type,
					      mxge_media_types[i].name);

			if (sc->current_media != mxge_media_types[i].flag) {
				mxge_media_init(sc);
				mxge_media_set(sc, mxge_media_types[i].flag);
			}
			return;
		}
	}
	if (mxge_verbose)
		device_printf(sc->dev, "%s media 0x%x unknown\n",
			      cage_type, cmd.data0);

	return;
}
3077 
/*
 * Interrupt handler for one slice.
 *
 * arg: the struct mxge_slice_state the interrupt belongs to.
 *
 * Processes transmit completions and received frames, updates link
 * state from the firmware statistics block (first slice only) and
 * finally writes to the slice's irq_claim registers.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		/* without buf_ring only receive work is done here */
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	/* keep a copy; stats->valid itself is cleared below */
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			/* re-read the count for the next pass */
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	/* volatile read: valid is re-read from memory on every pass */
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* any link transition requests a fresh media probe */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	/* only the first write below is conditional on valid & 0x1; the
	   write to irq_claim + 1 always happens */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
}
3166 
3167 static void
3168 mxge_init(void *arg)
3169 {
3170 	mxge_softc_t *sc = arg;
3171 	struct ifnet *ifp = sc->ifp;
3172 
3173 
3174 	mtx_lock(&sc->driver_mtx);
3175 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3176 		(void) mxge_open(sc);
3177 	mtx_unlock(&sc->driver_mtx);
3178 }
3179 
3180 
3181 
3182 static void
3183 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3184 {
3185 	int i;
3186 
3187 #if defined(INET) || defined(INET6)
3188 	tcp_lro_free(&ss->lc);
3189 #endif
3190 	for (i = 0; i <= ss->rx_big.mask; i++) {
3191 		if (ss->rx_big.info[i].m == NULL)
3192 			continue;
3193 		bus_dmamap_unload(ss->rx_big.dmat,
3194 				  ss->rx_big.info[i].map);
3195 		m_freem(ss->rx_big.info[i].m);
3196 		ss->rx_big.info[i].m = NULL;
3197 	}
3198 
3199 	for (i = 0; i <= ss->rx_small.mask; i++) {
3200 		if (ss->rx_small.info[i].m == NULL)
3201 			continue;
3202 		bus_dmamap_unload(ss->rx_small.dmat,
3203 				  ss->rx_small.info[i].map);
3204 		m_freem(ss->rx_small.info[i].m);
3205 		ss->rx_small.info[i].m = NULL;
3206 	}
3207 
3208 	/* transmit ring used only on the first slice */
3209 	if (ss->tx.info == NULL)
3210 		return;
3211 
3212 	for (i = 0; i <= ss->tx.mask; i++) {
3213 		ss->tx.info[i].flag = 0;
3214 		if (ss->tx.info[i].m == NULL)
3215 			continue;
3216 		bus_dmamap_unload(ss->tx.dmat,
3217 				  ss->tx.info[i].map);
3218 		m_freem(ss->tx.info[i].m);
3219 		ss->tx.info[i].m = NULL;
3220 	}
3221 }
3222 
3223 static void
3224 mxge_free_mbufs(mxge_softc_t *sc)
3225 {
3226 	int slice;
3227 
3228 	for (slice = 0; slice < sc->num_slices; slice++)
3229 		mxge_free_slice_mbufs(&sc->ss[slice]);
3230 }
3231 
3232 static void
3233 mxge_free_slice_rings(struct mxge_slice_state *ss)
3234 {
3235 	int i;
3236 
3237 
3238 	if (ss->rx_done.entry != NULL)
3239 		mxge_dma_free(&ss->rx_done.dma);
3240 	ss->rx_done.entry = NULL;
3241 
3242 	if (ss->tx.req_bytes != NULL)
3243 		free(ss->tx.req_bytes, M_DEVBUF);
3244 	ss->tx.req_bytes = NULL;
3245 
3246 	if (ss->tx.seg_list != NULL)
3247 		free(ss->tx.seg_list, M_DEVBUF);
3248 	ss->tx.seg_list = NULL;
3249 
3250 	if (ss->rx_small.shadow != NULL)
3251 		free(ss->rx_small.shadow, M_DEVBUF);
3252 	ss->rx_small.shadow = NULL;
3253 
3254 	if (ss->rx_big.shadow != NULL)
3255 		free(ss->rx_big.shadow, M_DEVBUF);
3256 	ss->rx_big.shadow = NULL;
3257 
3258 	if (ss->tx.info != NULL) {
3259 		if (ss->tx.dmat != NULL) {
3260 			for (i = 0; i <= ss->tx.mask; i++) {
3261 				bus_dmamap_destroy(ss->tx.dmat,
3262 						   ss->tx.info[i].map);
3263 			}
3264 			bus_dma_tag_destroy(ss->tx.dmat);
3265 		}
3266 		free(ss->tx.info, M_DEVBUF);
3267 	}
3268 	ss->tx.info = NULL;
3269 
3270 	if (ss->rx_small.info != NULL) {
3271 		if (ss->rx_small.dmat != NULL) {
3272 			for (i = 0; i <= ss->rx_small.mask; i++) {
3273 				bus_dmamap_destroy(ss->rx_small.dmat,
3274 						   ss->rx_small.info[i].map);
3275 			}
3276 			bus_dmamap_destroy(ss->rx_small.dmat,
3277 					   ss->rx_small.extra_map);
3278 			bus_dma_tag_destroy(ss->rx_small.dmat);
3279 		}
3280 		free(ss->rx_small.info, M_DEVBUF);
3281 	}
3282 	ss->rx_small.info = NULL;
3283 
3284 	if (ss->rx_big.info != NULL) {
3285 		if (ss->rx_big.dmat != NULL) {
3286 			for (i = 0; i <= ss->rx_big.mask; i++) {
3287 				bus_dmamap_destroy(ss->rx_big.dmat,
3288 						   ss->rx_big.info[i].map);
3289 			}
3290 			bus_dmamap_destroy(ss->rx_big.dmat,
3291 					   ss->rx_big.extra_map);
3292 			bus_dma_tag_destroy(ss->rx_big.dmat);
3293 		}
3294 		free(ss->rx_big.info, M_DEVBUF);
3295 	}
3296 	ss->rx_big.info = NULL;
3297 }
3298 
3299 static void
3300 mxge_free_rings(mxge_softc_t *sc)
3301 {
3302 	int slice;
3303 
3304 	for (slice = 0; slice < sc->num_slices; slice++)
3305 		mxge_free_slice_rings(&sc->ss[slice]);
3306 }
3307 
3308 static int
3309 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3310 		       int tx_ring_entries)
3311 {
3312 	mxge_softc_t *sc = ss->sc;
3313 	size_t bytes;
3314 	int err, i;
3315 
3316 	/* allocate per-slice receive resources */
3317 
3318 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3319 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3320 
3321 	/* allocate the rx shadow rings */
3322 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3323 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3324 
3325 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3326 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3327 
3328 	/* allocate the rx host info rings */
3329 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3330 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3331 
3332 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3333 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3334 
3335 	/* allocate the rx busdma resources */
3336 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3337 				 1,			/* alignment */
3338 				 4096,			/* boundary */
3339 				 BUS_SPACE_MAXADDR,	/* low */
3340 				 BUS_SPACE_MAXADDR,	/* high */
3341 				 NULL, NULL,		/* filter */
3342 				 MHLEN,			/* maxsize */
3343 				 1,			/* num segs */
3344 				 MHLEN,			/* maxsegsize */
3345 				 BUS_DMA_ALLOCNOW,	/* flags */
3346 				 NULL, NULL,		/* lock */
3347 				 &ss->rx_small.dmat);	/* tag */
3348 	if (err != 0) {
3349 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3350 			      err);
3351 		return err;
3352 	}
3353 
3354 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3355 				 1,			/* alignment */
3356 #if MXGE_VIRT_JUMBOS
3357 				 4096,			/* boundary */
3358 #else
3359 				 0,			/* boundary */
3360 #endif
3361 				 BUS_SPACE_MAXADDR,	/* low */
3362 				 BUS_SPACE_MAXADDR,	/* high */
3363 				 NULL, NULL,		/* filter */
3364 				 3*4096,		/* maxsize */
3365 #if MXGE_VIRT_JUMBOS
3366 				 3,			/* num segs */
3367 				 4096,			/* maxsegsize*/
3368 #else
3369 				 1,			/* num segs */
3370 				 MJUM9BYTES,		/* maxsegsize*/
3371 #endif
3372 				 BUS_DMA_ALLOCNOW,	/* flags */
3373 				 NULL, NULL,		/* lock */
3374 				 &ss->rx_big.dmat);	/* tag */
3375 	if (err != 0) {
3376 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3377 			      err);
3378 		return err;
3379 	}
3380 	for (i = 0; i <= ss->rx_small.mask; i++) {
3381 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3382 					&ss->rx_small.info[i].map);
3383 		if (err != 0) {
3384 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3385 				      err);
3386 			return err;
3387 		}
3388 	}
3389 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3390 				&ss->rx_small.extra_map);
3391 	if (err != 0) {
3392 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3393 			      err);
3394 		return err;
3395 	}
3396 
3397 	for (i = 0; i <= ss->rx_big.mask; i++) {
3398 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3399 					&ss->rx_big.info[i].map);
3400 		if (err != 0) {
3401 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3402 				      err);
3403 			return err;
3404 		}
3405 	}
3406 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3407 				&ss->rx_big.extra_map);
3408 	if (err != 0) {
3409 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3410 			      err);
3411 		return err;
3412 	}
3413 
3414 	/* now allocate TX resources */
3415 
3416 #ifndef IFNET_BUF_RING
3417 	/* only use a single TX ring for now */
3418 	if (ss != ss->sc->ss)
3419 		return 0;
3420 #endif
3421 
3422 	ss->tx.mask = tx_ring_entries - 1;
3423 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3424 
3425 
3426 	/* allocate the tx request copy block */
3427 	bytes = 8 +
3428 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3429 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3430 	/* ensure req_list entries are aligned to 8 bytes */
3431 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3432 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3433 
3434 	/* allocate the tx busdma segment list */
3435 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3436 	ss->tx.seg_list = (bus_dma_segment_t *)
3437 		malloc(bytes, M_DEVBUF, M_WAITOK);
3438 
3439 	/* allocate the tx host info ring */
3440 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3441 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3442 
3443 	/* allocate the tx busdma resources */
3444 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3445 				 1,			/* alignment */
3446 				 sc->tx_boundary,	/* boundary */
3447 				 BUS_SPACE_MAXADDR,	/* low */
3448 				 BUS_SPACE_MAXADDR,	/* high */
3449 				 NULL, NULL,		/* filter */
3450 				 65536 + 256,		/* maxsize */
3451 				 ss->tx.max_desc - 2,	/* num segs */
3452 				 sc->tx_boundary,	/* maxsegsz */
3453 				 BUS_DMA_ALLOCNOW,	/* flags */
3454 				 NULL, NULL,		/* lock */
3455 				 &ss->tx.dmat);		/* tag */
3456 
3457 	if (err != 0) {
3458 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3459 			      err);
3460 		return err;
3461 	}
3462 
3463 	/* now use these tags to setup dmamaps for each slot
3464 	   in the ring */
3465 	for (i = 0; i <= ss->tx.mask; i++) {
3466 		err = bus_dmamap_create(ss->tx.dmat, 0,
3467 					&ss->tx.info[i].map);
3468 		if (err != 0) {
3469 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3470 				      err);
3471 			return err;
3472 		}
3473 	}
3474 	return 0;
3475 
3476 }
3477 
3478 static int
3479 mxge_alloc_rings(mxge_softc_t *sc)
3480 {
3481 	mxge_cmd_t cmd;
3482 	int tx_ring_size;
3483 	int tx_ring_entries, rx_ring_entries;
3484 	int err, slice;
3485 
3486 	/* get ring sizes */
3487 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3488 	tx_ring_size = cmd.data0;
3489 	if (err != 0) {
3490 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3491 		goto abort;
3492 	}
3493 
3494 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3495 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3496 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3497 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3498 	IFQ_SET_READY(&sc->ifp->if_snd);
3499 
3500 	for (slice = 0; slice < sc->num_slices; slice++) {
3501 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3502 					     rx_ring_entries,
3503 					     tx_ring_entries);
3504 		if (err != 0)
3505 			goto abort;
3506 	}
3507 	return 0;
3508 
3509 abort:
3510 	mxge_free_rings(sc);
3511 	return err;
3512 
3513 }
3514 
3515 
3516 static void
3517 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3518 {
3519 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3520 
3521 	if (bufsize < MCLBYTES) {
3522 		/* easy, everything fits in a single buffer */
3523 		*big_buf_size = MCLBYTES;
3524 		*cl_size = MCLBYTES;
3525 		*nbufs = 1;
3526 		return;
3527 	}
3528 
3529 	if (bufsize < MJUMPAGESIZE) {
3530 		/* still easy, everything still fits in a single buffer */
3531 		*big_buf_size = MJUMPAGESIZE;
3532 		*cl_size = MJUMPAGESIZE;
3533 		*nbufs = 1;
3534 		return;
3535 	}
3536 #if MXGE_VIRT_JUMBOS
3537 	/* now we need to use virtually contiguous buffers */
3538 	*cl_size = MJUM9BYTES;
3539 	*big_buf_size = 4096;
3540 	*nbufs = mtu / 4096 + 1;
3541 	/* needs to be a power of two, so round up */
3542 	if (*nbufs == 3)
3543 		*nbufs = 4;
3544 #else
3545 	*cl_size = MJUM9BYTES;
3546 	*big_buf_size = MJUM9BYTES;
3547 	*nbufs = 1;
3548 #endif
3549 }
3550 
3551 static int
3552 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3553 {
3554 	mxge_softc_t *sc;
3555 	mxge_cmd_t cmd;
3556 	bus_dmamap_t map;
3557 	int err, i, slice;
3558 
3559 
3560 	sc = ss->sc;
3561 	slice = ss - sc->ss;
3562 
3563 #if defined(INET) || defined(INET6)
3564 	(void)tcp_lro_init(&ss->lc);
3565 #endif
3566 	ss->lc.ifp = sc->ifp;
3567 
3568 	/* get the lanai pointers to the send and receive rings */
3569 
3570 	err = 0;
3571 #ifndef IFNET_BUF_RING
3572 	/* We currently only send from the first slice */
3573 	if (slice == 0) {
3574 #endif
3575 		cmd.data0 = slice;
3576 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3577 		ss->tx.lanai =
3578 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3579 		ss->tx.send_go = (volatile uint32_t *)
3580 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3581 		ss->tx.send_stop = (volatile uint32_t *)
3582 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3583 #ifndef IFNET_BUF_RING
3584 	}
3585 #endif
3586 	cmd.data0 = slice;
3587 	err |= mxge_send_cmd(sc,
3588 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3589 	ss->rx_small.lanai =
3590 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3591 	cmd.data0 = slice;
3592 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3593 	ss->rx_big.lanai =
3594 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3595 
3596 	if (err != 0) {
3597 		device_printf(sc->dev,
3598 			      "failed to get ring sizes or locations\n");
3599 		return EIO;
3600 	}
3601 
3602 	/* stock receive rings */
3603 	for (i = 0; i <= ss->rx_small.mask; i++) {
3604 		map = ss->rx_small.info[i].map;
3605 		err = mxge_get_buf_small(ss, map, i);
3606 		if (err) {
3607 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3608 				      i, ss->rx_small.mask + 1);
3609 			return ENOMEM;
3610 		}
3611 	}
3612 	for (i = 0; i <= ss->rx_big.mask; i++) {
3613 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3614 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3615 	}
3616 	ss->rx_big.nbufs = nbufs;
3617 	ss->rx_big.cl_size = cl_size;
3618 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3619 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3620 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3621 		map = ss->rx_big.info[i].map;
3622 		err = mxge_get_buf_big(ss, map, i);
3623 		if (err) {
3624 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3625 				      i, ss->rx_big.mask + 1);
3626 			return ENOMEM;
3627 		}
3628 	}
3629 	return 0;
3630 }
3631 
3632 static int
3633 mxge_open(mxge_softc_t *sc)
3634 {
3635 	mxge_cmd_t cmd;
3636 	int err, big_bytes, nbufs, slice, cl_size, i;
3637 	bus_addr_t bus;
3638 	volatile uint8_t *itable;
3639 	struct mxge_slice_state *ss;
3640 
3641 	/* Copy the MAC address in case it was overridden */
3642 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3643 
3644 	err = mxge_reset(sc, 1);
3645 	if (err != 0) {
3646 		device_printf(sc->dev, "failed to reset\n");
3647 		return EIO;
3648 	}
3649 
3650 	if (sc->num_slices > 1) {
3651 		/* setup the indirection table */
3652 		cmd.data0 = sc->num_slices;
3653 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3654 				    &cmd);
3655 
3656 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3657 				     &cmd);
3658 		if (err != 0) {
3659 			device_printf(sc->dev,
3660 				      "failed to setup rss tables\n");
3661 			return err;
3662 		}
3663 
3664 		/* just enable an identity mapping */
3665 		itable = sc->sram + cmd.data0;
3666 		for (i = 0; i < sc->num_slices; i++)
3667 			itable[i] = (uint8_t)i;
3668 
3669 		cmd.data0 = 1;
3670 		cmd.data1 = mxge_rss_hash_type;
3671 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3672 		if (err != 0) {
3673 			device_printf(sc->dev, "failed to enable slices\n");
3674 			return err;
3675 		}
3676 	}
3677 
3678 
3679 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3680 
3681 	cmd.data0 = nbufs;
3682 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3683 			    &cmd);
3684 	/* error is only meaningful if we're trying to set
3685 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3686 	if (err && nbufs > 1) {
3687 		device_printf(sc->dev,
3688 			      "Failed to set alway-use-n to %d\n",
3689 			      nbufs);
3690 		return EIO;
3691 	}
3692 	/* Give the firmware the mtu and the big and small buffer
3693 	   sizes.  The firmware wants the big buf size to be a power
3694 	   of two. Luckily, FreeBSD's clusters are powers of two */
3695 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3696 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3697 	cmd.data0 = MHLEN - MXGEFW_PAD;
3698 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3699 			     &cmd);
3700 	cmd.data0 = big_bytes;
3701 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3702 
3703 	if (err != 0) {
3704 		device_printf(sc->dev, "failed to setup params\n");
3705 		goto abort;
3706 	}
3707 
3708 	/* Now give him the pointer to the stats block */
3709 	for (slice = 0;
3710 #ifdef IFNET_BUF_RING
3711 	     slice < sc->num_slices;
3712 #else
3713 	     slice < 1;
3714 #endif
3715 	     slice++) {
3716 		ss = &sc->ss[slice];
3717 		cmd.data0 =
3718 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3719 		cmd.data1 =
3720 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3721 		cmd.data2 = sizeof(struct mcp_irq_data);
3722 		cmd.data2 |= (slice << 16);
3723 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3724 	}
3725 
3726 	if (err != 0) {
3727 		bus = sc->ss->fw_stats_dma.bus_addr;
3728 		bus += offsetof(struct mcp_irq_data, send_done_count);
3729 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3730 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3731 		err = mxge_send_cmd(sc,
3732 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3733 				    &cmd);
3734 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3735 		sc->fw_multicast_support = 0;
3736 	} else {
3737 		sc->fw_multicast_support = 1;
3738 	}
3739 
3740 	if (err != 0) {
3741 		device_printf(sc->dev, "failed to setup params\n");
3742 		goto abort;
3743 	}
3744 
3745 	for (slice = 0; slice < sc->num_slices; slice++) {
3746 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3747 		if (err != 0) {
3748 			device_printf(sc->dev, "couldn't open slice %d\n",
3749 				      slice);
3750 			goto abort;
3751 		}
3752 	}
3753 
3754 	/* Finally, start the firmware running */
3755 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3756 	if (err) {
3757 		device_printf(sc->dev, "Couldn't bring up link\n");
3758 		goto abort;
3759 	}
3760 #ifdef IFNET_BUF_RING
3761 	for (slice = 0; slice < sc->num_slices; slice++) {
3762 		ss = &sc->ss[slice];
3763 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3764 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3765 	}
3766 #endif
3767 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3768 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3769 
3770 	return 0;
3771 
3772 
3773 abort:
3774 	mxge_free_mbufs(sc);
3775 
3776 	return err;
3777 }
3778 
3779 static int
3780 mxge_close(mxge_softc_t *sc, int down)
3781 {
3782 	mxge_cmd_t cmd;
3783 	int err, old_down_cnt;
3784 #ifdef IFNET_BUF_RING
3785 	struct mxge_slice_state *ss;
3786 	int slice;
3787 #endif
3788 
3789 #ifdef IFNET_BUF_RING
3790 	for (slice = 0; slice < sc->num_slices; slice++) {
3791 		ss = &sc->ss[slice];
3792 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3793 	}
3794 #endif
3795 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3796 	if (!down) {
3797 		old_down_cnt = sc->down_cnt;
3798 		wmb();
3799 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3800 		if (err) {
3801 			device_printf(sc->dev,
3802 				      "Couldn't bring down link\n");
3803 		}
3804 		if (old_down_cnt == sc->down_cnt) {
3805 			/* wait for down irq */
3806 			DELAY(10 * sc->intr_coal_delay);
3807 		}
3808 		wmb();
3809 		if (old_down_cnt == sc->down_cnt) {
3810 			device_printf(sc->dev, "never got down irq\n");
3811 		}
3812 	}
3813 	mxge_free_mbufs(sc);
3814 
3815 	return 0;
3816 }
3817 
3818 static void
3819 mxge_setup_cfg_space(mxge_softc_t *sc)
3820 {
3821 	device_t dev = sc->dev;
3822 	int reg;
3823 	uint16_t lnk, pectl;
3824 
3825 	/* find the PCIe link width and set max read request to 4KB*/
3826 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3827 		lnk = pci_read_config(dev, reg + 0x12, 2);
3828 		sc->link_width = (lnk >> 4) & 0x3f;
3829 
3830 		if (sc->pectl == 0) {
3831 			pectl = pci_read_config(dev, reg + 0x8, 2);
3832 			pectl = (pectl & ~0x7000) | (5 << 12);
3833 			pci_write_config(dev, reg + 0x8, pectl, 2);
3834 			sc->pectl = pectl;
3835 		} else {
3836 			/* restore saved pectl after watchdog reset */
3837 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3838 		}
3839 	}
3840 
3841 	/* Enable DMA and Memory space access */
3842 	pci_enable_busmaster(dev);
3843 }
3844 
3845 static uint32_t
3846 mxge_read_reboot(mxge_softc_t *sc)
3847 {
3848 	device_t dev = sc->dev;
3849 	uint32_t vs;
3850 
3851 	/* find the vendor specific offset */
3852 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3853 		device_printf(sc->dev,
3854 			      "could not find vendor specific offset\n");
3855 		return (uint32_t)-1;
3856 	}
3857 	/* enable read32 mode */
3858 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3859 	/* tell NIC which register to read */
3860 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3861 	return (pci_read_config(dev, vs + 0x14, 4));
3862 }
3863 
3864 static void
3865 mxge_watchdog_reset(mxge_softc_t *sc)
3866 {
3867 	struct pci_devinfo *dinfo;
3868 	struct mxge_slice_state *ss;
3869 	int err, running, s, num_tx_slices = 1;
3870 	uint32_t reboot;
3871 	uint16_t cmd;
3872 
3873 	err = ENXIO;
3874 
3875 	device_printf(sc->dev, "Watchdog reset!\n");
3876 
3877 	/*
3878 	 * check to see if the NIC rebooted.  If it did, then all of
3879 	 * PCI config space has been reset, and things like the
3880 	 * busmaster bit will be zero.  If this is the case, then we
3881 	 * must restore PCI config space before the NIC can be used
3882 	 * again
3883 	 */
3884 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3885 	if (cmd == 0xffff) {
3886 		/*
3887 		 * maybe the watchdog caught the NIC rebooting; wait
3888 		 * up to 100ms for it to finish.  If it does not come
3889 		 * back, then give up
3890 		 */
3891 		DELAY(1000*100);
3892 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3893 		if (cmd == 0xffff) {
3894 			device_printf(sc->dev, "NIC disappeared!\n");
3895 		}
3896 	}
3897 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3898 		/* print the reboot status */
3899 		reboot = mxge_read_reboot(sc);
3900 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3901 			      reboot);
3902 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3903 		if (running) {
3904 
3905 			/*
3906 			 * quiesce NIC so that TX routines will not try to
3907 			 * xmit after restoration of BAR
3908 			 */
3909 
3910 			/* Mark the link as down */
3911 			if (sc->link_state) {
3912 				sc->link_state = 0;
3913 				if_link_state_change(sc->ifp,
3914 						     LINK_STATE_DOWN);
3915 			}
3916 #ifdef IFNET_BUF_RING
3917 			num_tx_slices = sc->num_slices;
3918 #endif
3919 			/* grab all TX locks to ensure no tx  */
3920 			for (s = 0; s < num_tx_slices; s++) {
3921 				ss = &sc->ss[s];
3922 				mtx_lock(&ss->tx.mtx);
3923 			}
3924 			mxge_close(sc, 1);
3925 		}
3926 		/* restore PCI configuration space */
3927 		dinfo = device_get_ivars(sc->dev);
3928 		pci_cfg_restore(sc->dev, dinfo);
3929 
3930 		/* and redo any changes we made to our config space */
3931 		mxge_setup_cfg_space(sc);
3932 
3933 		/* reload f/w */
3934 		err = mxge_load_firmware(sc, 0);
3935 		if (err) {
3936 			device_printf(sc->dev,
3937 				      "Unable to re-load f/w\n");
3938 		}
3939 		if (running) {
3940 			if (!err)
3941 				err = mxge_open(sc);
3942 			/* release all TX locks */
3943 			for (s = 0; s < num_tx_slices; s++) {
3944 				ss = &sc->ss[s];
3945 #ifdef IFNET_BUF_RING
3946 				mxge_start_locked(ss);
3947 #endif
3948 				mtx_unlock(&ss->tx.mtx);
3949 			}
3950 		}
3951 		sc->watchdog_resets++;
3952 	} else {
3953 		device_printf(sc->dev,
3954 			      "NIC did not reboot, not resetting\n");
3955 		err = 0;
3956 	}
3957 	if (err) {
3958 		device_printf(sc->dev, "watchdog reset failed\n");
3959 	} else {
3960 		if (sc->dying == 2)
3961 			sc->dying = 0;
3962 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3963 	}
3964 }
3965 
3966 static void
3967 mxge_watchdog_task(void *arg, int pending)
3968 {
3969 	mxge_softc_t *sc = arg;
3970 
3971 
3972 	mtx_lock(&sc->driver_mtx);
3973 	mxge_watchdog_reset(sc);
3974 	mtx_unlock(&sc->driver_mtx);
3975 }
3976 
3977 static void
3978 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3979 {
3980 	tx = &sc->ss[slice].tx;
3981 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3982 	device_printf(sc->dev,
3983 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3984 		      tx->req, tx->done, tx->queue_active);
3985 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3986 			      tx->activate, tx->deactivate);
3987 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3988 		      tx->pkt_done,
3989 		      be32toh(sc->ss->fw_stats->send_done_count));
3990 }
3991 
3992 static int
3993 mxge_watchdog(mxge_softc_t *sc)
3994 {
3995 	mxge_tx_ring_t *tx;
3996 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3997 	int i, err = 0;
3998 
3999 	/* see if we have outstanding transmits, which
4000 	   have been pending for more than mxge_ticks */
4001 	for (i = 0;
4002 #ifdef IFNET_BUF_RING
4003 	     (i < sc->num_slices) && (err == 0);
4004 #else
4005 	     (i < 1) && (err == 0);
4006 #endif
4007 	     i++) {
4008 		tx = &sc->ss[i].tx;
4009 		if (tx->req != tx->done &&
4010 		    tx->watchdog_req != tx->watchdog_done &&
4011 		    tx->done == tx->watchdog_done) {
4012 			/* check for pause blocking before resetting */
4013 			if (tx->watchdog_rx_pause == rx_pause) {
4014 				mxge_warn_stuck(sc, tx, i);
4015 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4016 				return (ENXIO);
4017 			}
4018 			else
4019 				device_printf(sc->dev, "Flow control blocking "
4020 					      "xmits, check link partner\n");
4021 		}
4022 
4023 		tx->watchdog_req = tx->req;
4024 		tx->watchdog_done = tx->done;
4025 		tx->watchdog_rx_pause = rx_pause;
4026 	}
4027 
4028 	if (sc->need_media_probe)
4029 		mxge_media_probe(sc);
4030 	return (err);
4031 }
4032 
4033 static uint64_t
4034 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4035 {
4036 	struct mxge_softc *sc;
4037 	uint64_t rv;
4038 
4039 	sc = if_getsoftc(ifp);
4040 	rv = 0;
4041 
4042 	switch (cnt) {
4043 	case IFCOUNTER_IPACKETS:
4044 		for (int s = 0; s < sc->num_slices; s++)
4045 			rv += sc->ss[s].ipackets;
4046 		return (rv);
4047 	case IFCOUNTER_OPACKETS:
4048 		for (int s = 0; s < sc->num_slices; s++)
4049 			rv += sc->ss[s].opackets;
4050 		return (rv);
4051 	case IFCOUNTER_OERRORS:
4052 		for (int s = 0; s < sc->num_slices; s++)
4053 			rv += sc->ss[s].oerrors;
4054 		return (rv);
4055 #ifdef IFNET_BUF_RING
4056 	case IFCOUNTER_OBYTES:
4057 		for (int s = 0; s < sc->num_slices; s++)
4058 			rv += sc->ss[s].obytes;
4059 		return (rv);
4060 	case IFCOUNTER_OMCASTS:
4061 		for (int s = 0; s < sc->num_slices; s++)
4062 			rv += sc->ss[s].omcasts;
4063 		return (rv);
4064 	case IFCOUNTER_OQDROPS:
4065 		for (int s = 0; s < sc->num_slices; s++)
4066 			rv += sc->ss[s].tx.br->br_drops;
4067 		return (rv);
4068 #endif
4069 	default:
4070 		return (if_get_counter_default(ifp, cnt));
4071 	}
4072 }
4073 
4074 static void
4075 mxge_tick(void *arg)
4076 {
4077 	mxge_softc_t *sc = arg;
4078 	u_long pkts = 0;
4079 	int err = 0;
4080 	int running, ticks;
4081 	uint16_t cmd;
4082 
4083 	ticks = mxge_ticks;
4084 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4085 	if (running) {
4086 		if (!sc->watchdog_countdown) {
4087 			err = mxge_watchdog(sc);
4088 			sc->watchdog_countdown = 4;
4089 		}
4090 		sc->watchdog_countdown--;
4091 	}
4092 	if (pkts == 0) {
4093 		/* ensure NIC did not suffer h/w fault while idle */
4094 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4095 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4096 			sc->dying = 2;
4097 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4098 			err = ENXIO;
4099 		}
4100 		/* look less often if NIC is idle */
4101 		ticks *= 4;
4102 	}
4103 
4104 	if (err == 0)
4105 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4106 
4107 }
4108 
/*
 * Media change request handler: changing the media is not supported,
 * so every request is rejected with EINVAL.  ifp is unused.
 */
static int
mxge_media_change(struct ifnet *ifp)
{
	int rc = EINVAL;

	return rc;
}
4114 
4115 static int
4116 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4117 {
4118 	struct ifnet *ifp = sc->ifp;
4119 	int real_mtu, old_mtu;
4120 	int err = 0;
4121 
4122 
4123 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4124 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4125 		return EINVAL;
4126 	mtx_lock(&sc->driver_mtx);
4127 	old_mtu = ifp->if_mtu;
4128 	ifp->if_mtu = mtu;
4129 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4130 		mxge_close(sc, 0);
4131 		err = mxge_open(sc);
4132 		if (err != 0) {
4133 			ifp->if_mtu = old_mtu;
4134 			mxge_close(sc, 0);
4135 			(void) mxge_open(sc);
4136 		}
4137 	}
4138 	mtx_unlock(&sc->driver_mtx);
4139 	return err;
4140 }
4141 
4142 static void
4143 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4144 {
4145 	mxge_softc_t *sc = ifp->if_softc;
4146 
4147 
4148 	if (sc == NULL)
4149 		return;
4150 	ifmr->ifm_status = IFM_AVALID;
4151 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4152 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4153 	ifmr->ifm_active |= sc->current_media;
4154 }
4155 
4156 static int
4157 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4158 {
4159 	mxge_cmd_t cmd;
4160 	uint32_t i2c_args;
4161 	int i, ms, err;
4162 
4163 
4164 	if (i2c->dev_addr != 0xA0 &&
4165 	    i2c->dev_addr != 0xA2)
4166 		return (EINVAL);
4167 	if (i2c->len > sizeof(i2c->data))
4168 		return (EINVAL);
4169 
4170 	for (i = 0; i < i2c->len; i++) {
4171 		i2c_args = i2c->dev_addr << 0x8;
4172 		i2c_args |= i2c->offset + i;
4173 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4174 		cmd.data1 = i2c_args;
4175 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4176 
4177 		if (err != MXGEFW_CMD_OK)
4178 			return (EIO);
4179 		/* now we wait for the data to be cached */
4180 		cmd.data0 = i2c_args & 0xff;
4181 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4182 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4183 			cmd.data0 = i2c_args & 0xff;
4184 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4185 			if (err == EBUSY)
4186 				DELAY(1000);
4187 		}
4188 		if (err != MXGEFW_CMD_OK)
4189 			return (EIO);
4190 		i2c->data[i] = cmd.data0;
4191 	}
4192 	return (0);
4193 }
4194 
4195 static int
4196 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4197 {
4198 	mxge_softc_t *sc = ifp->if_softc;
4199 	struct ifreq *ifr = (struct ifreq *)data;
4200 	struct ifi2creq i2c;
4201 	int err, mask;
4202 
4203 	err = 0;
4204 	switch (command) {
4205 	case SIOCSIFMTU:
4206 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4207 		break;
4208 
4209 	case SIOCSIFFLAGS:
4210 		mtx_lock(&sc->driver_mtx);
4211 		if (sc->dying) {
4212 			mtx_unlock(&sc->driver_mtx);
4213 			return EINVAL;
4214 		}
4215 		if (ifp->if_flags & IFF_UP) {
4216 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4217 				err = mxge_open(sc);
4218 			} else {
4219 				/* take care of promis can allmulti
4220 				   flag chages */
4221 				mxge_change_promisc(sc,
4222 						    ifp->if_flags & IFF_PROMISC);
4223 				mxge_set_multicast_list(sc);
4224 			}
4225 		} else {
4226 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4227 				mxge_close(sc, 0);
4228 			}
4229 		}
4230 		mtx_unlock(&sc->driver_mtx);
4231 		break;
4232 
4233 	case SIOCADDMULTI:
4234 	case SIOCDELMULTI:
4235 		mtx_lock(&sc->driver_mtx);
4236 		if (sc->dying) {
4237 			mtx_unlock(&sc->driver_mtx);
4238 			return (EINVAL);
4239 		}
4240 		mxge_set_multicast_list(sc);
4241 		mtx_unlock(&sc->driver_mtx);
4242 		break;
4243 
4244 	case SIOCSIFCAP:
4245 		mtx_lock(&sc->driver_mtx);
4246 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4247 		if (mask & IFCAP_TXCSUM) {
4248 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4249 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4250 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4251 			} else {
4252 				ifp->if_capenable |= IFCAP_TXCSUM;
4253 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4254 			}
4255 		} else if (mask & IFCAP_RXCSUM) {
4256 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4257 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4258 			} else {
4259 				ifp->if_capenable |= IFCAP_RXCSUM;
4260 			}
4261 		}
4262 		if (mask & IFCAP_TSO4) {
4263 			if (IFCAP_TSO4 & ifp->if_capenable) {
4264 				ifp->if_capenable &= ~IFCAP_TSO4;
4265 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4266 				ifp->if_capenable |= IFCAP_TSO4;
4267 				ifp->if_hwassist |= CSUM_TSO;
4268 			} else {
4269 				printf("mxge requires tx checksum offload"
4270 				       " be enabled to use TSO\n");
4271 				err = EINVAL;
4272 			}
4273 		}
4274 #if IFCAP_TSO6
4275 		if (mask & IFCAP_TXCSUM_IPV6) {
4276 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4277 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4278 						       | IFCAP_TSO6);
4279 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4280 						      | CSUM_UDP);
4281 			} else {
4282 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4283 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4284 						     | CSUM_UDP_IPV6);
4285 			}
4286 		} else if (mask & IFCAP_RXCSUM_IPV6) {
4287 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4288 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4289 			} else {
4290 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4291 			}
4292 		}
4293 		if (mask & IFCAP_TSO6) {
4294 			if (IFCAP_TSO6 & ifp->if_capenable) {
4295 				ifp->if_capenable &= ~IFCAP_TSO6;
4296 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4297 				ifp->if_capenable |= IFCAP_TSO6;
4298 				ifp->if_hwassist |= CSUM_TSO;
4299 			} else {
4300 				printf("mxge requires tx checksum offload"
4301 				       " be enabled to use TSO\n");
4302 				err = EINVAL;
4303 			}
4304 		}
4305 #endif /*IFCAP_TSO6 */
4306 
4307 		if (mask & IFCAP_LRO)
4308 			ifp->if_capenable ^= IFCAP_LRO;
4309 		if (mask & IFCAP_VLAN_HWTAGGING)
4310 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4311 		if (mask & IFCAP_VLAN_HWTSO)
4312 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4313 
4314 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4315 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4316 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4317 
4318 		mtx_unlock(&sc->driver_mtx);
4319 		VLAN_CAPABILITIES(ifp);
4320 
4321 		break;
4322 
4323 	case SIOCGIFMEDIA:
4324 		mtx_lock(&sc->driver_mtx);
4325 		if (sc->dying) {
4326 			mtx_unlock(&sc->driver_mtx);
4327 			return (EINVAL);
4328 		}
4329 		mxge_media_probe(sc);
4330 		mtx_unlock(&sc->driver_mtx);
4331 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4332 				    &sc->media, command);
4333 		break;
4334 
4335 	case SIOCGI2C:
4336 		if (sc->connector != MXGE_XFP &&
4337 		    sc->connector != MXGE_SFP) {
4338 			err = ENXIO;
4339 			break;
4340 		}
4341 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4342 		if (err != 0)
4343 			break;
4344 		mtx_lock(&sc->driver_mtx);
4345 		if (sc->dying) {
4346 			mtx_unlock(&sc->driver_mtx);
4347 			return (EINVAL);
4348 		}
4349 		err = mxge_fetch_i2c(sc, &i2c);
4350 		mtx_unlock(&sc->driver_mtx);
4351 		if (err == 0)
4352 			err = copyout(&i2c, ifr->ifr_ifru.ifru_data,
4353 			    sizeof(i2c));
4354 		break;
4355 	default:
4356 		err = ether_ioctl(ifp, command, data);
4357 		break;
4358 	}
4359 	return err;
4360 }
4361 
4362 static void
4363 mxge_fetch_tunables(mxge_softc_t *sc)
4364 {
4365 
4366 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4367 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4368 			  &mxge_flow_control);
4369 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4370 			  &mxge_intr_coal_delay);
4371 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4372 			  &mxge_nvidia_ecrc_enable);
4373 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4374 			  &mxge_force_firmware);
4375 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4376 			  &mxge_deassert_wait);
4377 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4378 			  &mxge_verbose);
4379 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4380 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4381 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4382 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4383 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4384 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4385 
4386 	if (bootverbose)
4387 		mxge_verbose = 1;
4388 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4389 		mxge_intr_coal_delay = 30;
4390 	if (mxge_ticks == 0)
4391 		mxge_ticks = hz / 2;
4392 	sc->pause = mxge_flow_control;
4393 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4394 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4395 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4396 	}
4397 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4398 	    mxge_initial_mtu < ETHER_MIN_LEN)
4399 		mxge_initial_mtu = ETHERMTU_JUMBO;
4400 
4401 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4402 		mxge_throttle = MXGE_MAX_THROTTLE;
4403 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4404 		mxge_throttle = MXGE_MIN_THROTTLE;
4405 	sc->throttle = mxge_throttle;
4406 }
4407 
4408 
4409 static void
4410 mxge_free_slices(mxge_softc_t *sc)
4411 {
4412 	struct mxge_slice_state *ss;
4413 	int i;
4414 
4415 
4416 	if (sc->ss == NULL)
4417 		return;
4418 
4419 	for (i = 0; i < sc->num_slices; i++) {
4420 		ss = &sc->ss[i];
4421 		if (ss->fw_stats != NULL) {
4422 			mxge_dma_free(&ss->fw_stats_dma);
4423 			ss->fw_stats = NULL;
4424 #ifdef IFNET_BUF_RING
4425 			if (ss->tx.br != NULL) {
4426 				drbr_free(ss->tx.br, M_DEVBUF);
4427 				ss->tx.br = NULL;
4428 			}
4429 #endif
4430 			mtx_destroy(&ss->tx.mtx);
4431 		}
4432 		if (ss->rx_done.entry != NULL) {
4433 			mxge_dma_free(&ss->rx_done.dma);
4434 			ss->rx_done.entry = NULL;
4435 		}
4436 	}
4437 	free(sc->ss, M_DEVBUF);
4438 	sc->ss = NULL;
4439 }
4440 
4441 static int
4442 mxge_alloc_slices(mxge_softc_t *sc)
4443 {
4444 	mxge_cmd_t cmd;
4445 	struct mxge_slice_state *ss;
4446 	size_t bytes;
4447 	int err, i, max_intr_slots;
4448 
4449 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4450 	if (err != 0) {
4451 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4452 		return err;
4453 	}
4454 	sc->rx_ring_size = cmd.data0;
4455 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4456 
4457 	bytes = sizeof (*sc->ss) * sc->num_slices;
4458 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4459 	if (sc->ss == NULL)
4460 		return (ENOMEM);
4461 	for (i = 0; i < sc->num_slices; i++) {
4462 		ss = &sc->ss[i];
4463 
4464 		ss->sc = sc;
4465 
4466 		/* allocate per-slice rx interrupt queues */
4467 
4468 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4469 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4470 		if (err != 0)
4471 			goto abort;
4472 		ss->rx_done.entry = ss->rx_done.dma.addr;
4473 		bzero(ss->rx_done.entry, bytes);
4474 
4475 		/*
4476 		 * allocate the per-slice firmware stats; stats
4477 		 * (including tx) are used used only on the first
4478 		 * slice for now
4479 		 */
4480 #ifndef IFNET_BUF_RING
4481 		if (i > 0)
4482 			continue;
4483 #endif
4484 
4485 		bytes = sizeof (*ss->fw_stats);
4486 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4487 				     sizeof (*ss->fw_stats), 64);
4488 		if (err != 0)
4489 			goto abort;
4490 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4491 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4492 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4493 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4494 #ifdef IFNET_BUF_RING
4495 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4496 					   &ss->tx.mtx);
4497 #endif
4498 	}
4499 
4500 	return (0);
4501 
4502 abort:
4503 	mxge_free_slices(sc);
4504 	return (ENOMEM);
4505 }
4506 
4507 static void
4508 mxge_slice_probe(mxge_softc_t *sc)
4509 {
4510 	mxge_cmd_t cmd;
4511 	char *old_fw;
4512 	int msix_cnt, status, max_intr_slots;
4513 
4514 	sc->num_slices = 1;
4515 	/*
4516 	 *  don't enable multiple slices if they are not enabled,
4517 	 *  or if this is not an SMP system
4518 	 */
4519 
4520 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4521 		return;
4522 
4523 	/* see how many MSI-X interrupts are available */
4524 	msix_cnt = pci_msix_count(sc->dev);
4525 	if (msix_cnt < 2)
4526 		return;
4527 
4528 	/* now load the slice aware firmware see what it supports */
4529 	old_fw = sc->fw_name;
4530 	if (old_fw == mxge_fw_aligned)
4531 		sc->fw_name = mxge_fw_rss_aligned;
4532 	else
4533 		sc->fw_name = mxge_fw_rss_unaligned;
4534 	status = mxge_load_firmware(sc, 0);
4535 	if (status != 0) {
4536 		device_printf(sc->dev, "Falling back to a single slice\n");
4537 		return;
4538 	}
4539 
4540 	/* try to send a reset command to the card to see if it
4541 	   is alive */
4542 	memset(&cmd, 0, sizeof (cmd));
4543 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4544 	if (status != 0) {
4545 		device_printf(sc->dev, "failed reset\n");
4546 		goto abort_with_fw;
4547 	}
4548 
4549 	/* get rx ring size */
4550 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4551 	if (status != 0) {
4552 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4553 		goto abort_with_fw;
4554 	}
4555 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4556 
4557 	/* tell it the size of the interrupt queues */
4558 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4559 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4560 	if (status != 0) {
4561 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4562 		goto abort_with_fw;
4563 	}
4564 
4565 	/* ask the maximum number of slices it supports */
4566 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4567 	if (status != 0) {
4568 		device_printf(sc->dev,
4569 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4570 		goto abort_with_fw;
4571 	}
4572 	sc->num_slices = cmd.data0;
4573 	if (sc->num_slices > msix_cnt)
4574 		sc->num_slices = msix_cnt;
4575 
4576 	if (mxge_max_slices == -1) {
4577 		/* cap to number of CPUs in system */
4578 		if (sc->num_slices > mp_ncpus)
4579 			sc->num_slices = mp_ncpus;
4580 	} else {
4581 		if (sc->num_slices > mxge_max_slices)
4582 			sc->num_slices = mxge_max_slices;
4583 	}
4584 	/* make sure it is a power of two */
4585 	while (sc->num_slices & (sc->num_slices - 1))
4586 		sc->num_slices--;
4587 
4588 	if (mxge_verbose)
4589 		device_printf(sc->dev, "using %d slices\n",
4590 			      sc->num_slices);
4591 
4592 	return;
4593 
4594 abort_with_fw:
4595 	sc->fw_name = old_fw;
4596 	(void) mxge_load_firmware(sc, 0);
4597 }
4598 
4599 static int
4600 mxge_add_msix_irqs(mxge_softc_t *sc)
4601 {
4602 	size_t bytes;
4603 	int count, err, i, rid;
4604 
4605 	rid = PCIR_BAR(2);
4606 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4607 						    &rid, RF_ACTIVE);
4608 
4609 	if (sc->msix_table_res == NULL) {
4610 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4611 		return ENXIO;
4612 	}
4613 
4614 	count = sc->num_slices;
4615 	err = pci_alloc_msix(sc->dev, &count);
4616 	if (err != 0) {
4617 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4618 			      "err = %d \n", sc->num_slices, err);
4619 		goto abort_with_msix_table;
4620 	}
4621 	if (count < sc->num_slices) {
4622 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4623 			      count, sc->num_slices);
4624 		device_printf(sc->dev,
4625 			      "Try setting hw.mxge.max_slices to %d\n",
4626 			      count);
4627 		err = ENOSPC;
4628 		goto abort_with_msix;
4629 	}
4630 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4631 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4632 	if (sc->msix_irq_res == NULL) {
4633 		err = ENOMEM;
4634 		goto abort_with_msix;
4635 	}
4636 
4637 	for (i = 0; i < sc->num_slices; i++) {
4638 		rid = i + 1;
4639 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4640 							  SYS_RES_IRQ,
4641 							  &rid, RF_ACTIVE);
4642 		if (sc->msix_irq_res[i] == NULL) {
4643 			device_printf(sc->dev, "couldn't allocate IRQ res"
4644 				      " for message %d\n", i);
4645 			err = ENXIO;
4646 			goto abort_with_res;
4647 		}
4648 	}
4649 
4650 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4651 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4652 
4653 	for (i = 0; i < sc->num_slices; i++) {
4654 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4655 				     INTR_TYPE_NET | INTR_MPSAFE,
4656 #if __FreeBSD_version > 700030
4657 				     NULL,
4658 #endif
4659 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4660 		if (err != 0) {
4661 			device_printf(sc->dev, "couldn't setup intr for "
4662 				      "message %d\n", i);
4663 			goto abort_with_intr;
4664 		}
4665 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4666 				  sc->msix_ih[i], "s%d", i);
4667 	}
4668 
4669 	if (mxge_verbose) {
4670 		device_printf(sc->dev, "using %d msix IRQs:",
4671 			      sc->num_slices);
4672 		for (i = 0; i < sc->num_slices; i++)
4673 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4674 		printf("\n");
4675 	}
4676 	return (0);
4677 
4678 abort_with_intr:
4679 	for (i = 0; i < sc->num_slices; i++) {
4680 		if (sc->msix_ih[i] != NULL) {
4681 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4682 					  sc->msix_ih[i]);
4683 			sc->msix_ih[i] = NULL;
4684 		}
4685 	}
4686 	free(sc->msix_ih, M_DEVBUF);
4687 
4688 
4689 abort_with_res:
4690 	for (i = 0; i < sc->num_slices; i++) {
4691 		rid = i + 1;
4692 		if (sc->msix_irq_res[i] != NULL)
4693 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4694 					     sc->msix_irq_res[i]);
4695 		sc->msix_irq_res[i] = NULL;
4696 	}
4697 	free(sc->msix_irq_res, M_DEVBUF);
4698 
4699 
4700 abort_with_msix:
4701 	pci_release_msi(sc->dev);
4702 
4703 abort_with_msix_table:
4704 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4705 			     sc->msix_table_res);
4706 
4707 	return err;
4708 }
4709 
4710 static int
4711 mxge_add_single_irq(mxge_softc_t *sc)
4712 {
4713 	int count, err, rid;
4714 
4715 	count = pci_msi_count(sc->dev);
4716 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4717 		rid = 1;
4718 	} else {
4719 		rid = 0;
4720 		sc->legacy_irq = 1;
4721 	}
4722 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4723 					     RF_SHAREABLE | RF_ACTIVE);
4724 	if (sc->irq_res == NULL) {
4725 		device_printf(sc->dev, "could not alloc interrupt\n");
4726 		return ENXIO;
4727 	}
4728 	if (mxge_verbose)
4729 		device_printf(sc->dev, "using %s irq %jd\n",
4730 			      sc->legacy_irq ? "INTx" : "MSI",
4731 			      rman_get_start(sc->irq_res));
4732 	err = bus_setup_intr(sc->dev, sc->irq_res,
4733 			     INTR_TYPE_NET | INTR_MPSAFE,
4734 #if __FreeBSD_version > 700030
4735 			     NULL,
4736 #endif
4737 			     mxge_intr, &sc->ss[0], &sc->ih);
4738 	if (err != 0) {
4739 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4740 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4741 		if (!sc->legacy_irq)
4742 			pci_release_msi(sc->dev);
4743 	}
4744 	return err;
4745 }
4746 
4747 static void
4748 mxge_rem_msix_irqs(mxge_softc_t *sc)
4749 {
4750 	int i, rid;
4751 
4752 	for (i = 0; i < sc->num_slices; i++) {
4753 		if (sc->msix_ih[i] != NULL) {
4754 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4755 					  sc->msix_ih[i]);
4756 			sc->msix_ih[i] = NULL;
4757 		}
4758 	}
4759 	free(sc->msix_ih, M_DEVBUF);
4760 
4761 	for (i = 0; i < sc->num_slices; i++) {
4762 		rid = i + 1;
4763 		if (sc->msix_irq_res[i] != NULL)
4764 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4765 					     sc->msix_irq_res[i]);
4766 		sc->msix_irq_res[i] = NULL;
4767 	}
4768 	free(sc->msix_irq_res, M_DEVBUF);
4769 
4770 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4771 			     sc->msix_table_res);
4772 
4773 	pci_release_msi(sc->dev);
4774 	return;
4775 }
4776 
4777 static void
4778 mxge_rem_single_irq(mxge_softc_t *sc)
4779 {
4780 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4781 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4782 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4783 	if (!sc->legacy_irq)
4784 		pci_release_msi(sc->dev);
4785 }
4786 
4787 static void
4788 mxge_rem_irq(mxge_softc_t *sc)
4789 {
4790 	if (sc->num_slices > 1)
4791 		mxge_rem_msix_irqs(sc);
4792 	else
4793 		mxge_rem_single_irq(sc);
4794 }
4795 
4796 static int
4797 mxge_add_irq(mxge_softc_t *sc)
4798 {
4799 	int err;
4800 
4801 	if (sc->num_slices > 1)
4802 		err = mxge_add_msix_irqs(sc);
4803 	else
4804 		err = mxge_add_single_irq(sc);
4805 
4806 	if (0 && err == 0 && sc->num_slices > 1) {
4807 		mxge_rem_msix_irqs(sc);
4808 		err = mxge_add_msix_irqs(sc);
4809 	}
4810 	return err;
4811 }
4812 
4813 
4814 static int
4815 mxge_attach(device_t dev)
4816 {
4817 	mxge_cmd_t cmd;
4818 	mxge_softc_t *sc = device_get_softc(dev);
4819 	struct ifnet *ifp;
4820 	int err, rid;
4821 
4822 	sc->dev = dev;
4823 	mxge_fetch_tunables(sc);
4824 
4825 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4826 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4827 				  taskqueue_thread_enqueue, &sc->tq);
4828 	if (sc->tq == NULL) {
4829 		err = ENOMEM;
4830 		goto abort_with_nothing;
4831 	}
4832 
4833 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4834 				 1,			/* alignment */
4835 				 0,			/* boundary */
4836 				 BUS_SPACE_MAXADDR,	/* low */
4837 				 BUS_SPACE_MAXADDR,	/* high */
4838 				 NULL, NULL,		/* filter */
4839 				 65536 + 256,		/* maxsize */
4840 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4841 				 65536,			/* maxsegsize */
4842 				 0,			/* flags */
4843 				 NULL, NULL,		/* lock */
4844 				 &sc->parent_dmat);	/* tag */
4845 
4846 	if (err != 0) {
4847 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4848 			      err);
4849 		goto abort_with_tq;
4850 	}
4851 
4852 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4853 	if (ifp == NULL) {
4854 		device_printf(dev, "can not if_alloc()\n");
4855 		err = ENOSPC;
4856 		goto abort_with_parent_dmat;
4857 	}
4858 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4859 
4860 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4861 		 device_get_nameunit(dev));
4862 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4863 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4864 		 "%s:drv", device_get_nameunit(dev));
4865 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4866 		 MTX_NETWORK_LOCK, MTX_DEF);
4867 
4868 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4869 
4870 	mxge_setup_cfg_space(sc);
4871 
4872 	/* Map the board into the kernel */
4873 	rid = PCIR_BARS;
4874 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4875 					     RF_ACTIVE);
4876 	if (sc->mem_res == NULL) {
4877 		device_printf(dev, "could not map memory\n");
4878 		err = ENXIO;
4879 		goto abort_with_lock;
4880 	}
4881 	sc->sram = rman_get_virtual(sc->mem_res);
4882 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4883 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4884 		device_printf(dev, "impossible memory region size %jd\n",
4885 			      rman_get_size(sc->mem_res));
4886 		err = ENXIO;
4887 		goto abort_with_mem_res;
4888 	}
4889 
4890 	/* make NULL terminated copy of the EEPROM strings section of
4891 	   lanai SRAM */
4892 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4893 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4894 				rman_get_bushandle(sc->mem_res),
4895 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4896 				sc->eeprom_strings,
4897 				MXGE_EEPROM_STRINGS_SIZE - 2);
4898 	err = mxge_parse_strings(sc);
4899 	if (err != 0)
4900 		goto abort_with_mem_res;
4901 
4902 	/* Enable write combining for efficient use of PCIe bus */
4903 	mxge_enable_wc(sc);
4904 
4905 	/* Allocate the out of band dma memory */
4906 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4907 			     sizeof (mxge_cmd_t), 64);
4908 	if (err != 0)
4909 		goto abort_with_mem_res;
4910 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4911 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4912 	if (err != 0)
4913 		goto abort_with_cmd_dma;
4914 
4915 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4916 	if (err != 0)
4917 		goto abort_with_zeropad_dma;
4918 
4919 	/* select & load the firmware */
4920 	err = mxge_select_firmware(sc);
4921 	if (err != 0)
4922 		goto abort_with_dmabench;
4923 	sc->intr_coal_delay = mxge_intr_coal_delay;
4924 
4925 	mxge_slice_probe(sc);
4926 	err = mxge_alloc_slices(sc);
4927 	if (err != 0)
4928 		goto abort_with_dmabench;
4929 
4930 	err = mxge_reset(sc, 0);
4931 	if (err != 0)
4932 		goto abort_with_slices;
4933 
4934 	err = mxge_alloc_rings(sc);
4935 	if (err != 0) {
4936 		device_printf(sc->dev, "failed to allocate rings\n");
4937 		goto abort_with_slices;
4938 	}
4939 
4940 	err = mxge_add_irq(sc);
4941 	if (err != 0) {
4942 		device_printf(sc->dev, "failed to add irq\n");
4943 		goto abort_with_rings;
4944 	}
4945 
4946 	ifp->if_baudrate = IF_Gbps(10);
4947 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4948 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4949 		IFCAP_RXCSUM_IPV6;
4950 #if defined(INET) || defined(INET6)
4951 	ifp->if_capabilities |= IFCAP_LRO;
4952 #endif
4953 
4954 #ifdef MXGE_NEW_VLAN_API
4955 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4956 
4957 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4958 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4959 	    sc->fw_ver_tiny >= 32)
4960 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4961 #endif
4962 	sc->max_mtu = mxge_max_mtu(sc);
4963 	if (sc->max_mtu >= 9000)
4964 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4965 	else
4966 		device_printf(dev, "MTU limited to %d.  Install "
4967 			      "latest firmware for 9000 byte jumbo support\n",
4968 			      sc->max_mtu - ETHER_HDR_LEN);
4969 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4970 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4971 	/* check to see if f/w supports TSO for IPv6 */
4972 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4973 		if (CSUM_TCP_IPV6)
4974 			ifp->if_capabilities |= IFCAP_TSO6;
4975 		sc->max_tso6_hlen = min(cmd.data0,
4976 					sizeof (sc->ss[0].scratch));
4977 	}
4978 	ifp->if_capenable = ifp->if_capabilities;
4979 	if (sc->lro_cnt == 0)
4980 		ifp->if_capenable &= ~IFCAP_LRO;
4981 	ifp->if_init = mxge_init;
4982 	ifp->if_softc = sc;
4983 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4984 	ifp->if_ioctl = mxge_ioctl;
4985 	ifp->if_start = mxge_start;
4986 	ifp->if_get_counter = mxge_get_counter;
4987 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4988 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4989 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4990 	/* Initialise the ifmedia structure */
4991 	ifmedia_init(&sc->media, 0, mxge_media_change,
4992 		     mxge_media_status);
4993 	mxge_media_init(sc);
4994 	mxge_media_probe(sc);
4995 	sc->dying = 0;
4996 	ether_ifattach(ifp, sc->mac_addr);
4997 	/* ether_ifattach sets mtu to ETHERMTU */
4998 	if (mxge_initial_mtu != ETHERMTU)
4999 		mxge_change_mtu(sc, mxge_initial_mtu);
5000 
5001 	mxge_add_sysctls(sc);
5002 #ifdef IFNET_BUF_RING
5003 	ifp->if_transmit = mxge_transmit;
5004 	ifp->if_qflush = mxge_qflush;
5005 #endif
5006 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
5007 				device_get_nameunit(sc->dev));
5008 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
5009 	return 0;
5010 
5011 abort_with_rings:
5012 	mxge_free_rings(sc);
5013 abort_with_slices:
5014 	mxge_free_slices(sc);
5015 abort_with_dmabench:
5016 	mxge_dma_free(&sc->dmabench_dma);
5017 abort_with_zeropad_dma:
5018 	mxge_dma_free(&sc->zeropad_dma);
5019 abort_with_cmd_dma:
5020 	mxge_dma_free(&sc->cmd_dma);
5021 abort_with_mem_res:
5022 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5023 abort_with_lock:
5024 	pci_disable_busmaster(dev);
5025 	mtx_destroy(&sc->cmd_mtx);
5026 	mtx_destroy(&sc->driver_mtx);
5027 	if_free(ifp);
5028 abort_with_parent_dmat:
5029 	bus_dma_tag_destroy(sc->parent_dmat);
5030 abort_with_tq:
5031 	if (sc->tq != NULL) {
5032 		taskqueue_drain(sc->tq, &sc->watchdog_task);
5033 		taskqueue_free(sc->tq);
5034 		sc->tq = NULL;
5035 	}
5036 abort_with_nothing:
5037 	return err;
5038 }
5039 
5040 static int
5041 mxge_detach(device_t dev)
5042 {
5043 	mxge_softc_t *sc = device_get_softc(dev);
5044 
5045 	if (mxge_vlans_active(sc)) {
5046 		device_printf(sc->dev,
5047 			      "Detach vlans before removing module\n");
5048 		return EBUSY;
5049 	}
5050 	mtx_lock(&sc->driver_mtx);
5051 	sc->dying = 1;
5052 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
5053 		mxge_close(sc, 0);
5054 	mtx_unlock(&sc->driver_mtx);
5055 	ether_ifdetach(sc->ifp);
5056 	if (sc->tq != NULL) {
5057 		taskqueue_drain(sc->tq, &sc->watchdog_task);
5058 		taskqueue_free(sc->tq);
5059 		sc->tq = NULL;
5060 	}
5061 	callout_drain(&sc->co_hdl);
5062 	ifmedia_removeall(&sc->media);
5063 	mxge_dummy_rdma(sc, 0);
5064 	mxge_rem_sysctls(sc);
5065 	mxge_rem_irq(sc);
5066 	mxge_free_rings(sc);
5067 	mxge_free_slices(sc);
5068 	mxge_dma_free(&sc->dmabench_dma);
5069 	mxge_dma_free(&sc->zeropad_dma);
5070 	mxge_dma_free(&sc->cmd_dma);
5071 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5072 	pci_disable_busmaster(dev);
5073 	mtx_destroy(&sc->cmd_mtx);
5074 	mtx_destroy(&sc->driver_mtx);
5075 	if_free(sc->ifp);
5076 	bus_dma_tag_destroy(sc->parent_dmat);
5077 	return 0;
5078 }
5079 
5080 static int
5081 mxge_shutdown(device_t dev)
5082 {
5083 	return 0;
5084 }
5085 
5086 /*
5087   This file uses Myri10GE driver indentation.
5088 
5089   Local Variables:
5090   c-file-style:"linux"
5091   tab-width:8
5092   End:
5093 */
5094