/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

/* count xmits ourselves, rather than via drbr */
#define NO_SLOW_STATS
#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;


	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

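/*
 * Map the NIC's SRAM aperture write-combining: the driver pushes
 * command blocks, firmware images and send requests to the NIC with
 * PIO copies, and write-combining lets those small writes be merged
 * into larger bursts on the PCIe bus.
 */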
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

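	/*
	 * A page-aligned multi-page block only crosses 4KB boundaries
	 * at page-aligned offsets, so it gets one large segment with
	 * no boundary constraint; smaller or differently aligned
	 * allocations are confined to a single 4KB page, presumably
	 * so the firmware's small shared blocks never straddle one.
	 */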
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
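			/*
			 * Note: the ptr += 1 above plus the ptr += 3
			 * on the first pass of the loop below add up
			 * to skipping the 4-byte "MAC=" prefix; each
			 * later ptr += 3 steps over one "xx:" group.
			 */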
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   The test below is commented out because it is believed that
	   doing config reads/writes beyond 0xff will access the config
	   space of the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access.
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them, are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

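	/*
	 * Compute the device's offset within the chipset's extended
	 * config aperture using the standard ECAM-style layout: 1MB
	 * of config space per bus and one 4KB page per function, with
	 * function pages ordered as slot * 8 + func.
	 */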
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

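	/*
	 * Concretely, cmd.data2 = len * 0x10000 requests the read
	 * test, len * 0x1 the write test, and len * 0x10001 both at
	 * once.  The MB/s math below follows from the units: each
	 * transfer moves len bytes and a tick is 0.5us, so
	 * MB/s = bytes/us = (transfers * len * 2) / ticks, with an
	 * extra factor of two for the combined test since it moves
	 * data in both directions.
	 */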
	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;


	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
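	/*
	 * The dummy read back from SRAM after each 256-byte chunk
	 * below forces the preceding (possibly write-combined) PIO
	 * writes to post to the NIC before the next chunk is issued.
	 */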
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
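	/*
	 * buf_bytes is over-sized and rounded up to an 8-byte
	 * boundary above because mxge_pio_copy() moves data in
	 * word-sized chunks; the same alignment idiom is used for
	 * every command/handoff block in this driver.
	 */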

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

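	/*
	 * The firmware DMAs its mcp_cmd_response_t back into host
	 * memory (sc->cmd points at the cmd_dma block), so poll that
	 * memory for the result, syncing it for the CPU before each
	 * read.
	 */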
	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

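		/*
		 * The claim block appears to hold two consecutive
		 * 32-bit words per slice, hence the stride of 2.
		 */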
		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			ss->fw_stats->valid = 0;
			ss->fw_stats->send_done_count = 0;
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

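/*
 * Sysctl handler that exports a big-endian 32-bit firmware counter as
 * a read-only host-order integer: the swapped value is passed by
 * value via arg2 and arg1 is cleared so sysctl_handle_int() reports
 * arg2 instead of dereferencing a pointer.
 */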
static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable or disable flow control (pause frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

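/*
 * Note that only slots starting_slot+cnt-1 down to starting_slot+1
 * are written here; the first slot of the chain is left for
 * mxge_submit_req(), so that its valid flags can be written last.
 */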
static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                wmb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        wmb();
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	}
1800 
1801 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1802 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1803 
1804 	/* TSO implies checksum offload on this hardware */
1805 	cksum_offset = ip_off + (ip->ip_hl << 2);
1806 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1807 
1808 
1809 	/* for TSO, pseudo_hdr_offset holds mss.
1810 	 * The firmware figures out where to put
1811 	 * the checksum by parsing the header. */
1812 	pseudo_hdr_offset = htobe16(mss);
1813 
1814 	tx = &ss->tx;
1815 	req = tx->req_list;
1816 	seg = tx->seg_list;
1817 	cnt = 0;
1818 	rdma_count = 0;
1819 	/* "rdma_count" is the number of RDMAs belonging to the
1820 	 * current packet BEFORE the current send request. For
1821 	 * non-TSO packets, this is equal to "count".
1822 	 * For TSO packets, rdma_count needs to be reset
1823 	 * to 0 after a segment cut.
1824 	 *
1825 	 * The rdma_count field of the send request is
1826 	 * the number of RDMAs of the packet starting at
1827 	 * that request. For TSO send requests with one ore more cuts
1828 	 * in the middle, this is the number of RDMAs starting
1829 	 * after the last cut in the request. All previous
1830 	 * segments before the last cut implicitly have 1 RDMA.
1831 	 *
1832 	 * Since the number of RDMAs is not known beforehand,
1833 	 * it must be filled-in retroactively - after each
1834 	 * segmentation cut or at the end of the entire packet.
1835 	 */
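
/*
 * A worked sketch of the retroactive fill-in described above, with
 * the branchless arithmetic spelled out.  On a segment cut,
 *
 *	rdma_count |= -(chop | next_is_first);	forces the count to -1
 *	rdma_count += chop & !next_is_first;	-1 -> 0 for a cut that
 *						lands mid-request
 *
 * so the rdma_count++ at the bottom of the loop restarts the run.
 * The "(req - rdma_count)->rdma_count = ..." stores reach back to
 * the request that began the current run: once per iteration, and
 * once more after the loop for the packet's final run.
 */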
1836 
1837 	while (busdma_seg_cnt) {
1838 		/* Break the busdma segment up into pieces*/
1839 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1840 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1841 		len = seg->ds_len;
1842 
1843 		while (len) {
1844 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1845 			seglen = len;
1846 			cum_len_next = cum_len + seglen;
1847 			(req-rdma_count)->rdma_count = rdma_count + 1;
1848 			if (__predict_true(cum_len >= 0)) {
1849 				/* payload */
1850 				chop = (cum_len_next > mss);
1851 				cum_len_next = cum_len_next % mss;
1852 				next_is_first = (cum_len_next == 0);
1853 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1854 				flags_next |= next_is_first *
1855 					MXGEFW_FLAGS_FIRST;
1856 				rdma_count |= -(chop | next_is_first);
1857 				rdma_count += chop & !next_is_first;
1858 			} else if (cum_len_next >= 0) {
1859 				/* header ends */
1860 				rdma_count = -1;
1861 				cum_len_next = 0;
1862 				seglen = -cum_len;
1863 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1864 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1865 					MXGEFW_FLAGS_FIRST |
1866 					(small * MXGEFW_FLAGS_SMALL);
1867 			}
1868 
1869 			req->addr_high = high_swapped;
1870 			req->addr_low = htobe32(low);
1871 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1872 			req->pad = 0;
1873 			req->rdma_count = 1;
1874 			req->length = htobe16(seglen);
1875 			req->cksum_offset = cksum_offset;
1876 			req->flags = flags | ((cum_len & 1) *
1877 					      MXGEFW_FLAGS_ALIGN_ODD);
1878 			low += seglen;
1879 			len -= seglen;
1880 			cum_len = cum_len_next;
1881 			flags = flags_next;
1882 			req++;
1883 			cnt++;
1884 			rdma_count++;
1885 			if (__predict_false(cksum_offset > seglen))
1886 				cksum_offset -= seglen;
1887 			else
1888 				cksum_offset = 0;
1889 			if (__predict_false(cnt > tx->max_desc))
1890 				goto drop;
1891 		}
1892 		busdma_seg_cnt--;
1893 		seg++;
1894 	}
1895 	(req-rdma_count)->rdma_count = rdma_count;
1896 
1897 	do {
1898 		req--;
1899 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1900 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1901 
1902 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1903 	mxge_submit_req(tx, tx->req_list, cnt);
1904 #ifdef IFNET_BUF_RING
1905 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1906 		/* tell the NIC to start polling this slice */
1907 		*tx->send_go = 1;
1908 		tx->queue_active = 1;
1909 		tx->activate++;
1910 		wmb();
1911 	}
1912 #endif
1913 	return;
1914 
1915 drop:
1916 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1917 	m_freem(m);
1918 	ss->oerrors++;
1919 	if (!once) {
1920 		printf("tx->max_desc exceeded via TSO!\n");
1921 		printf("mss = %d, %ld, %d!\n", mss,
1922 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1923 		once = 1;
1924 	}
1925 	return;
1926 
1927 }
1928 
1929 #endif /* IFCAP_TSO4 */
1930 
1931 #ifdef MXGE_NEW_VLAN_API
1932 /*
1933  * We reproduce the software vlan tag insertion from
1934  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1935  * vlan tag insertion. We need to advertise this in order to have the
1936  * vlan interface respect our csum offload flags.
1937  */
1938 static struct mbuf *
1939 mxge_vlan_tag_insert(struct mbuf *m)
1940 {
1941 	struct ether_vlan_header *evl;
1942 
1943 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1944 	if (__predict_false(m == NULL))
1945 		return NULL;
1946 	if (m->m_len < sizeof(*evl)) {
1947 		m = m_pullup(m, sizeof(*evl));
1948 		if (__predict_false(m == NULL))
1949 			return NULL;
1950 	}
1951 	/*
1952 	 * Transform the Ethernet header into an Ethernet header
1953 	 * with 802.1Q encapsulation.
1954 	 */
1955 	evl = mtod(m, struct ether_vlan_header *);
1956 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1957 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1958 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1959 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1960 	m->m_flags &= ~M_VLANTAG;
1961 	return m;
1962 }
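
/*
 * The transformation above, sketched byte-by-byte (illustrative
 * layout only):
 *
 *   after M_PREPEND:  [4 junk][dst 6][src 6][type 2][payload]
 *   after bcopy:      [dst 6][src 6][.. 4 ..][type 2][payload]
 *   after tag stores: [dst 6][src 6][0x8100 2][tag 2][type 2][payload]
 *
 * i.e. the 12 address bytes slide back over the prepended room, and
 * the 4-byte 802.1Q header is written into the gap this opens just
 * before the original ethertype.
 */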
1963 #endif /* MXGE_NEW_VLAN_API */
1964 
1965 static void
1966 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1967 {
1968 	mxge_softc_t *sc;
1969 	mcp_kreq_ether_send_t *req;
1970 	bus_dma_segment_t *seg;
1971 	struct mbuf *m_tmp;
1972 	struct ifnet *ifp;
1973 	mxge_tx_ring_t *tx;
1974 	struct ip *ip;
1975 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1976 	uint16_t pseudo_hdr_offset;
1977         uint8_t flags, cksum_offset;
1978 
1979 
1980 	sc = ss->sc;
1981 	ifp = sc->ifp;
1982 	tx = &ss->tx;
1983 
1984 	ip_off = sizeof (struct ether_header);
1985 #ifdef MXGE_NEW_VLAN_API
1986 	if (m->m_flags & M_VLANTAG) {
1987 		m = mxge_vlan_tag_insert(m);
1988 		if (__predict_false(m == NULL))
1989 			goto drop;
1990 		ip_off += ETHER_VLAN_ENCAP_LEN;
1991 	}
1992 #endif
1993 	/* (try to) map the frame for DMA */
1994 	idx = tx->req & tx->mask;
1995 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1996 				      m, tx->seg_list, &cnt,
1997 				      BUS_DMA_NOWAIT);
1998 	if (__predict_false(err == EFBIG)) {
1999 		/* Too many segments in the chain.  Try
2000 		   to defrag */
2001 		m_tmp = m_defrag(m, M_NOWAIT);
2002 		if (m_tmp == NULL) {
2003 			goto drop;
2004 		}
2005 		ss->tx.defrag++;
2006 		m = m_tmp;
2007 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2008 					      tx->info[idx].map,
2009 					      m, tx->seg_list, &cnt,
2010 					      BUS_DMA_NOWAIT);
2011 	}
2012 	if (__predict_false(err != 0)) {
2013 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2014 			      " packet len = %d\n", err, m->m_pkthdr.len);
2015 		goto drop;
2016 	}
2017 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2018 			BUS_DMASYNC_PREWRITE);
2019 	tx->info[idx].m = m;
2020 
2021 #if IFCAP_TSO4
2022 	/* TSO is different enough, we handle it in another routine */
2023 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2024 		mxge_encap_tso(ss, m, cnt, ip_off);
2025 		return;
2026 	}
2027 #endif
2028 
2029 	req = tx->req_list;
2030 	cksum_offset = 0;
2031 	pseudo_hdr_offset = 0;
2032 	flags = MXGEFW_FLAGS_NO_TSO;
2033 
2034 	/* checksum offloading? */
2035 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2036 		/* ensure ip header is in first mbuf, copy
2037 		   it to a scratch buffer if not */
2038 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2039 			m_copydata(m, 0, ip_off + sizeof (*ip),
2040 				   ss->scratch);
2041 			ip = (struct ip *)(ss->scratch + ip_off);
2042 		} else {
2043 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2044 		}
2045 		cksum_offset = ip_off + (ip->ip_hl << 2);
2046 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2047 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2048 		req->cksum_offset = cksum_offset;
2049 		flags |= MXGEFW_FLAGS_CKSUM;
2050 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2051 	} else {
2052 		odd_flag = 0;
2053 	}
2054 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2055 		flags |= MXGEFW_FLAGS_SMALL;
2056 
2057 	/* convert segments into a request list */
2058 	cum_len = 0;
2059 	seg = tx->seg_list;
2060 	req->flags = MXGEFW_FLAGS_FIRST;
2061 	for (i = 0; i < cnt; i++) {
2062 		req->addr_low =
2063 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2064 		req->addr_high =
2065 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2066 		req->length = htobe16(seg->ds_len);
2067 		req->cksum_offset = cksum_offset;
2068 		if (cksum_offset > seg->ds_len)
2069 			cksum_offset -= seg->ds_len;
2070 		else
2071 			cksum_offset = 0;
2072 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2073 		req->pad = 0; /* complete solid 16-byte block */
2074 		req->rdma_count = 1;
2075 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2076 		cum_len += seg->ds_len;
2077 		seg++;
2078 		req++;
2079 		req->flags = 0;
2080 	}
2081 	req--;
2082 	/* pad runts to 60 bytes */
2083 	if (cum_len < 60) {
2084 		req++;
2085 		req->addr_low =
2086 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2087 		req->addr_high =
2088 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2089 		req->length = htobe16(60 - cum_len);
2090 		req->cksum_offset = 0;
2091 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2092 		req->pad = 0; /* complete solid 16-byte block */
2093 		req->rdma_count = 1;
2094 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2095 		cnt++;
2096 	}
2097 
2098 	tx->req_list[0].rdma_count = cnt;
2099 #if 0
2100 	/* print what the firmware will see */
2101 	for (i = 0; i < cnt; i++) {
2102 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2103 		    "cso:%d, flags:0x%x, rdma:%d\n",
2104 		    i, (int)ntohl(tx->req_list[i].addr_high),
2105 		    (int)ntohl(tx->req_list[i].addr_low),
2106 		    (int)ntohs(tx->req_list[i].length),
2107 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2108 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2109 		    tx->req_list[i].rdma_count);
2110 	}
2111 	printf("--------------\n");
2112 #endif
2113 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2114 	mxge_submit_req(tx, tx->req_list, cnt);
2115 #ifdef IFNET_BUF_RING
2116 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2117 		/* tell the NIC to start polling this slice */
2118 		*tx->send_go = 1;
2119 		tx->queue_active = 1;
2120 		tx->activate++;
2121 		wmb();
2122 	}
2123 #endif
2124 	return;
2125 
2126 drop:
2127 	m_freem(m);
2128 	ss->oerrors++;
2129 	return;
2130 }
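
/*
 * A note on the runt padding above: the minimum Ethernet frame is
 * 64 bytes including the 4-byte FCS, i.e. 60 bytes of frame data.
 * Rather than touching the mbuf chain, the driver appends one extra
 * descriptor pointing at the pre-zeroed sc->zeropad_dma region for
 * the missing 60 - cum_len bytes and lets the NIC transmit the pad.
 */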
2131 
2132 #ifdef IFNET_BUF_RING
2133 static void
2134 mxge_qflush(struct ifnet *ifp)
2135 {
2136 	mxge_softc_t *sc = ifp->if_softc;
2137 	mxge_tx_ring_t *tx;
2138 	struct mbuf *m;
2139 	int slice;
2140 
2141 	for (slice = 0; slice < sc->num_slices; slice++) {
2142 		tx = &sc->ss[slice].tx;
2143 		mtx_lock(&tx->mtx);
2144 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2145 			m_freem(m);
2146 		mtx_unlock(&tx->mtx);
2147 	}
2148 	if_qflush(ifp);
2149 }
2150 
2151 static inline void
2152 mxge_start_locked(struct mxge_slice_state *ss)
2153 {
2154 	mxge_softc_t *sc;
2155 	struct mbuf *m;
2156 	struct ifnet *ifp;
2157 	mxge_tx_ring_t *tx;
2158 
2159 	sc = ss->sc;
2160 	ifp = sc->ifp;
2161 	tx = &ss->tx;
2162 
2163 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2164 		m = drbr_dequeue(ifp, tx->br);
2165 		if (m == NULL) {
2166 			return;
2167 		}
2168 		/* let BPF see it */
2169 		BPF_MTAP(ifp, m);
2170 
2171 		/* give it to the nic */
2172 		mxge_encap(ss, m);
2173 	}
2174 	/* ran out of transmit slots */
2175 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2176 	    && (!drbr_empty(ifp, tx->br))) {
2177 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2178 		tx->stall++;
2179 	}
2180 }
2181 
2182 static int
2183 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2184 {
2185 	mxge_softc_t *sc;
2186 	struct ifnet *ifp;
2187 	mxge_tx_ring_t *tx;
2188 	int err;
2189 
2190 	sc = ss->sc;
2191 	ifp = sc->ifp;
2192 	tx = &ss->tx;
2193 
2194 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2195 	    IFF_DRV_RUNNING) {
2196 		err = drbr_enqueue(ifp, tx->br, m);
2197 		return (err);
2198 	}
2199 
2200 	if (drbr_empty(ifp, tx->br) &&
2201 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2202 		/* let BPF see it */
2203 		BPF_MTAP(ifp, m);
2204 		/* give it to the nic */
2205 		mxge_encap(ss, m);
2206 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2207 		return (err);
2208 	}
2209 	if (!drbr_empty(ifp, tx->br))
2210 		mxge_start_locked(ss);
2211 	return (0);
2212 }
2213 
2214 static int
2215 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2216 {
2217 	mxge_softc_t *sc = ifp->if_softc;
2218 	struct mxge_slice_state *ss;
2219 	mxge_tx_ring_t *tx;
2220 	int err = 0;
2221 	int slice;
2222 
2223 	slice = m->m_pkthdr.flowid;
2224 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2225 
2226 	ss = &sc->ss[slice];
2227 	tx = &ss->tx;
2228 
2229 	if (mtx_trylock(&tx->mtx)) {
2230 		err = mxge_transmit_locked(ss, m);
2231 		mtx_unlock(&tx->mtx);
2232 	} else {
2233 		err = drbr_enqueue(ifp, tx->br, m);
2234 	}
2235 
2236 	return (err);
2237 }
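
/*
 * Slice selection above is a single mask because num_slices is kept
 * a power of 2.  E.g. (hypothetical values) with num_slices = 4, a
 * flowid of 0x1d selects slice 0x1d & 3 == 1, so every packet of a
 * given flow lands on the same TX ring and lock.
 */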
2238 
2239 #else
2240 
2241 static inline void
2242 mxge_start_locked(struct mxge_slice_state *ss)
2243 {
2244 	mxge_softc_t *sc;
2245 	struct mbuf *m;
2246 	struct ifnet *ifp;
2247 	mxge_tx_ring_t *tx;
2248 
2249 	sc = ss->sc;
2250 	ifp = sc->ifp;
2251 	tx = &ss->tx;
2252 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2253 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2254 		if (m == NULL) {
2255 			return;
2256 		}
2257 		/* let BPF see it */
2258 		BPF_MTAP(ifp, m);
2259 
2260 		/* give it to the nic */
2261 		mxge_encap(ss, m);
2262 	}
2263 	/* ran out of transmit slots */
2264 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2265 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2266 		tx->stall++;
2267 	}
2268 }
2269 #endif
2270 static void
2271 mxge_start(struct ifnet *ifp)
2272 {
2273 	mxge_softc_t *sc = ifp->if_softc;
2274 	struct mxge_slice_state *ss;
2275 
2276 	/* only use the first slice for now */
2277 	ss = &sc->ss[0];
2278 	mtx_lock(&ss->tx.mtx);
2279 	mxge_start_locked(ss);
2280 	mtx_unlock(&ss->tx.mtx);
2281 }
2282 
2283 /*
2284  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2285  * at most 32 bytes at a time, so as to avoid involving the software
2286  * pio handler in the nic.   We re-write the first segment's low
2287  * DMA address to mark it valid only after we write the entire chunk
2288  * in a burst
2289  */
2290 static inline void
2291 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2292 		mcp_kreq_ether_recv_t *src)
2293 {
2294 	uint32_t low;
2295 
2296 	low = src->addr_low;
2297 	src->addr_low = 0xffffffff;
2298 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2299 	wmb();
2300 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2301 	wmb();
2302 	src->addr_low = low;
2303 	dst->addr_low = low;
2304 	wmb();
2305 }
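
/*
 * A sketch of the invalidation trick above: each receive descriptor
 * is 8 bytes, so 8 of them fill exactly two 32-byte bursts.  The
 * first descriptor's addr_low is poisoned to 0xffffffff before the
 * copies, apparently a sentinel the firmware treats as not-yet-valid;
 * restoring the real low address as the very last write publishes
 * all 8 entries at once from the firmware's point of view.
 */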
2306 
2307 static int
2308 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2309 {
2310 	bus_dma_segment_t seg;
2311 	struct mbuf *m;
2312 	mxge_rx_ring_t *rx = &ss->rx_small;
2313 	int cnt, err;
2314 
2315 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2316 	if (m == NULL) {
2317 		rx->alloc_fail++;
2318 		err = ENOBUFS;
2319 		goto done;
2320 	}
2321 	m->m_len = MHLEN;
2322 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2323 				      &seg, &cnt, BUS_DMA_NOWAIT);
2324 	if (err != 0) {
2325 		m_free(m);
2326 		goto done;
2327 	}
2328 	rx->info[idx].m = m;
2329 	rx->shadow[idx].addr_low =
2330 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2331 	rx->shadow[idx].addr_high =
2332 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2333 
2334 done:
2335 	if ((idx & 7) == 7)
2336 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2337 	return err;
2338 }
2339 
2340 static int
2341 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2342 {
2343 	bus_dma_segment_t seg[3];
2344 	struct mbuf *m;
2345 	mxge_rx_ring_t *rx = &ss->rx_big;
2346 	int cnt, err, i;
2347 
2348 	if (rx->cl_size == MCLBYTES)
2349 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2350 	else
2351 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2352 	if (m == NULL) {
2353 		rx->alloc_fail++;
2354 		err = ENOBUFS;
2355 		goto done;
2356 	}
2357 	m->m_len = rx->mlen;
2358 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2359 				      seg, &cnt, BUS_DMA_NOWAIT);
2360 	if (err != 0) {
2361 		m_free(m);
2362 		goto done;
2363 	}
2364 	rx->info[idx].m = m;
2365 	rx->shadow[idx].addr_low =
2366 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2367 	rx->shadow[idx].addr_high =
2368 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2369 
2370 #if MXGE_VIRT_JUMBOS
2371 	for (i = 1; i < cnt; i++) {
2372 		rx->shadow[idx + i].addr_low =
2373 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2374 		rx->shadow[idx + i].addr_high =
2375 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2376 	}
2377 #endif
2378 
2379 done:
2380 	for (i = 0; i < rx->nbufs; i++) {
2381 		if ((idx & 7) == 7) {
2382 			mxge_submit_8rx(&rx->lanai[idx - 7],
2383 					&rx->shadow[idx - 7]);
2384 		}
2385 		idx++;
2386 	}
2387 	return err;
2388 }
2389 
2390 /*
2391  *  Myri10GE hardware checksums are not valid if the sender
2392  *  padded the frame with non-zero padding.  This is because
2393  *  the firmware just does a simple 16-bit 1s complement
2394  *  checksum across the entire frame, excluding the first 14
2395  *  bytes.  It is best to simply check the checksum and
2396  *  tell the stack about it only if the checksum is good
2397  */
2398 
2399 static inline uint16_t
2400 mxge_rx_csum(struct mbuf *m, int csum)
2401 {
2402 	struct ether_header *eh;
2403 	struct ip *ip;
2404 	uint16_t c;
2405 
2406 	eh = mtod(m, struct ether_header *);
2407 
2408 	/* only deal with IPv4 TCP & UDP for now */
2409 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2410 		return 1;
2411 	ip = (struct ip *)(eh + 1);
2412 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2413 			    ip->ip_p != IPPROTO_UDP))
2414 		return 1;
2415 #ifdef INET
2416 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2417 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2418 			    (ip->ip_hl << 2) + ip->ip_p));
2419 #else
2420 	c = 1;
2421 #endif
2422 	c ^= 0xffff;
2423 	return (c);
2424 }
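
/*
 * Why c == 0 means "good", as a short derivation (assuming a valid
 * IP header checksum): the firmware csum is the 1s-complement sum of
 * everything past the 14-byte Ethernet header, i.e. the IP header
 * plus the TCP/UDP segment.  A valid IP header folds to 0xffff
 * (1s-complement zero), so csum is effectively the segment's sum.
 * in_pseudo() then adds the source and destination addresses plus
 * (ip_len - header_len) and ip_p, completing the pseudo header; for
 * a valid transport checksum the folded total is 0xffff, and the
 * final c ^= 0xffff turns that into the 0 the callers test for.
 */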
2425 
2426 static void
2427 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2428 {
2429 	struct ether_vlan_header *evl;
2430 	struct ether_header *eh;
2431 	uint32_t partial;
2432 
2433 	evl = mtod(m, struct ether_vlan_header *);
2434 	eh = mtod(m, struct ether_header *);
2435 
2436 	/*
2437 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2438 	 * after what the firmware thought was the end of the ethernet
2439 	 * header.
2440 	 */
2441 
2442 	/* put checksum into host byte order */
2443 	*csum = ntohs(*csum);
2444 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2445 	(*csum) += ~partial;
2446 	(*csum) +=  ((*csum) < ~partial);
2447 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2448 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2449 
2450 	/* restore checksum to network byte order;
2451 	   later consumers expect this */
2452 	*csum = htons(*csum);
2453 
2454 	/* save the tag */
2455 #ifdef MXGE_NEW_VLAN_API
2456 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2457 #else
2458 	{
2459 		struct m_tag *mtag;
2460 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2461 				   M_NOWAIT);
2462 		if (mtag == NULL)
2463 			return;
2464 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2465 		m_tag_prepend(m, mtag);
2466 	}
2467 
2468 #endif
2469 	m->m_flags |= M_VLANTAG;
2470 
2471 	/*
2472 	 * Remove the 802.1q header by copying the Ethernet
2473 	 * addresses over it and adjusting the beginning of
2474 	 * the data in the mbuf.  The encapsulated Ethernet
2475 	 * type field is already in place.
2476 	 */
2477 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2478 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2479 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2480 }
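
/*
 * The fixup above is 1s-complement subtraction.  An equivalent
 * stand-alone helper (a sketch only, not used by the driver) makes
 * the steps explicit:
 */
#if 0
static uint32_t
csum_sub32(uint32_t sum, uint32_t val)
{
	sum += ~val;				/* subtract in 1s complement */
	sum += (sum < ~val);			/* re-add end-around carry */
	sum = (sum >> 16) + (sum & 0xffff);	/* fold to 16 bits */
	sum = (sum >> 16) + (sum & 0xffff);	/* fold residual carry */
	return (sum);
}
#endif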
2481 
2482 
2483 static inline void
2484 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2485 {
2486 	mxge_softc_t *sc;
2487 	struct ifnet *ifp;
2488 	struct mbuf *m;
2489 	struct ether_header *eh;
2490 	mxge_rx_ring_t *rx;
2491 	bus_dmamap_t old_map;
2492 	int idx;
2493 	uint16_t tcpudp_csum;
2494 
2495 	sc = ss->sc;
2496 	ifp = sc->ifp;
2497 	rx = &ss->rx_big;
2498 	idx = rx->cnt & rx->mask;
2499 	rx->cnt += rx->nbufs;
2500 	/* save a pointer to the received mbuf */
2501 	m = rx->info[idx].m;
2502 	/* try to replace the received mbuf */
2503 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2504 		/* drop the frame -- the old mbuf is re-cycled */
2505 		ifp->if_ierrors++;
2506 		return;
2507 	}
2508 
2509 	/* unmap the received buffer */
2510 	old_map = rx->info[idx].map;
2511 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2512 	bus_dmamap_unload(rx->dmat, old_map);
2513 
2514 	/* swap the bus_dmamap_t's */
2515 	rx->info[idx].map = rx->extra_map;
2516 	rx->extra_map = old_map;
2517 
2518 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2519 	 * aligned */
2520 	m->m_data += MXGEFW_PAD;
2521 
2522 	m->m_pkthdr.rcvif = ifp;
2523 	m->m_len = m->m_pkthdr.len = len;
2524 	ss->ipackets++;
2525 	eh = mtod(m, struct ether_header *);
2526 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2527 		mxge_vlan_tag_remove(m, &csum);
2528 	}
2529 	/* if the checksum is valid, mark it in the mbuf header */
2530 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2531 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2532 			return;
2533 		/* otherwise, it was a UDP frame, or a TCP frame which
2534 		   we could not do LRO on.  Tell the stack that the
2535 		   checksum is good */
2536 		m->m_pkthdr.csum_data = 0xffff;
2537 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2538 	}
2539 	/* flowid only valid if RSS hashing is enabled */
2540 	if (sc->num_slices > 1) {
2541 		m->m_pkthdr.flowid = (ss - sc->ss);
2542 		m->m_flags |= M_FLOWID;
2543 	}
2544 	/* pass the frame up the stack */
2545 	(*ifp->if_input)(ifp, m);
2546 }
2547 
2548 static inline void
2549 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2550 {
2551 	mxge_softc_t *sc;
2552 	struct ifnet *ifp;
2553 	struct ether_header *eh;
2554 	struct mbuf *m;
2555 	mxge_rx_ring_t *rx;
2556 	bus_dmamap_t old_map;
2557 	int idx;
2558 	uint16_t tcpudp_csum;
2559 
2560 	sc = ss->sc;
2561 	ifp = sc->ifp;
2562 	rx = &ss->rx_small;
2563 	idx = rx->cnt & rx->mask;
2564 	rx->cnt++;
2565 	/* save a pointer to the received mbuf */
2566 	m = rx->info[idx].m;
2567 	/* try to replace the received mbuf */
2568 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2569 		/* drop the frame -- the old mbuf is re-cycled */
2570 		ifp->if_ierrors++;
2571 		return;
2572 	}
2573 
2574 	/* unmap the received buffer */
2575 	old_map = rx->info[idx].map;
2576 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2577 	bus_dmamap_unload(rx->dmat, old_map);
2578 
2579 	/* swap the bus_dmamap_t's */
2580 	rx->info[idx].map = rx->extra_map;
2581 	rx->extra_map = old_map;
2582 
2583 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2584 	 * aligned */
2585 	m->m_data += MXGEFW_PAD;
2586 
2587 	m->m_pkthdr.rcvif = ifp;
2588 	m->m_len = m->m_pkthdr.len = len;
2589 	ss->ipackets++;
2590 	eh = mtod(m, struct ether_header *);
2591 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2592 		mxge_vlan_tag_remove(m, &csum);
2593 	}
2594 	/* if the checksum is valid, mark it in the mbuf header */
2595 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2596 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2597 			return;
2598 		/* otherwise, it was a UDP frame, or a TCP frame which
2599 		   we could not do LRO on.  Tell the stack that the
2600 		   checksum is good */
2601 		m->m_pkthdr.csum_data = 0xffff;
2602 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2603 	}
2604 	/* flowid only valid if RSS hashing is enabled */
2605 	if (sc->num_slices > 1) {
2606 		m->m_pkthdr.flowid = (ss - sc->ss);
2607 		m->m_flags |= M_FLOWID;
2608 	}
2609 	/* pass the frame up the stack */
2610 	(*ifp->if_input)(ifp, m);
2611 }
2612 
2613 static inline void
2614 mxge_clean_rx_done(struct mxge_slice_state *ss)
2615 {
2616 	mxge_rx_done_t *rx_done = &ss->rx_done;
2617 	int limit = 0;
2618 	uint16_t length;
2619 	uint16_t checksum;
2620 
2621 
2622 	while (rx_done->entry[rx_done->idx].length != 0) {
2623 		length = ntohs(rx_done->entry[rx_done->idx].length);
2624 		rx_done->entry[rx_done->idx].length = 0;
2625 		checksum = rx_done->entry[rx_done->idx].checksum;
2626 		if (length <= (MHLEN - MXGEFW_PAD))
2627 			mxge_rx_done_small(ss, length, checksum);
2628 		else
2629 			mxge_rx_done_big(ss, length, checksum);
2630 		rx_done->cnt++;
2631 		rx_done->idx = rx_done->cnt & rx_done->mask;
2632 
2633 		/* limit potential for livelock */
2634 		if (__predict_false(++limit > rx_done->mask / 2))
2635 			break;
2636 	}
2637 #ifdef INET
2638 	while (!SLIST_EMPTY(&ss->lro_active)) {
2639 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2640 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2641 		mxge_lro_flush(ss, lro);
2642 	}
2643 #endif
2644 }
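
/*
 * The "limit" check above bounds a single pass to half the ring so a
 * steady flood of receives cannot livelock the handler; any entries
 * left behind are picked up on the next iteration of the deassert
 * loop in mxge_intr().
 */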
2645 
2646 
2647 static inline void
2648 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2649 {
2650 	struct ifnet *ifp;
2651 	mxge_tx_ring_t *tx;
2652 	struct mbuf *m;
2653 	bus_dmamap_t map;
2654 	int idx;
2655 	int *flags;
2656 
2657 	tx = &ss->tx;
2658 	ifp = ss->sc->ifp;
2659 	while (tx->pkt_done != mcp_idx) {
2660 		idx = tx->done & tx->mask;
2661 		tx->done++;
2662 		m = tx->info[idx].m;
2663 		/* mbuf and DMA map only attached to the first
2664 		   segment per-mbuf */
2665 		if (m != NULL) {
2666 			ss->obytes += m->m_pkthdr.len;
2667 			if (m->m_flags & M_MCAST)
2668 				ss->omcasts++;
2669 			ss->opackets++;
2670 			tx->info[idx].m = NULL;
2671 			map = tx->info[idx].map;
2672 			bus_dmamap_unload(tx->dmat, map);
2673 			m_freem(m);
2674 		}
2675 		if (tx->info[idx].flag) {
2676 			tx->info[idx].flag = 0;
2677 			tx->pkt_done++;
2678 		}
2679 	}
2680 
2681 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2682 	   it's OK to send packets */
2683 #ifdef IFNET_BUF_RING
2684 	flags = &ss->if_drv_flags;
2685 #else
2686 	flags = &ifp->if_drv_flags;
2687 #endif
2688 	mtx_lock(&ss->tx.mtx);
2689 	if ((*flags) & IFF_DRV_OACTIVE &&
2690 	    tx->req - tx->done < (tx->mask + 1)/4) {
2691 		*(flags) &= ~IFF_DRV_OACTIVE;
2692 		ss->tx.wake++;
2693 		mxge_start_locked(ss);
2694 	}
2695 #ifdef IFNET_BUF_RING
2696 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2697 		/* let the NIC stop polling this queue, since there
2698 		 * are no more transmits pending */
2699 		*tx->send_stop = 1;
2700 		tx->queue_active = 0;
2701 		tx->deactivate++;
2702 		wmb();
2703 	}
2706 #endif
2707 	mtx_unlock(&ss->tx.mtx);
2708 
2709 }
2710 
2711 static struct mxge_media_type mxge_xfp_media_types[] =
2712 {
2713 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2714 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2715 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2716 	{0,		(1 << 5),	"10GBASE-ER"},
2717 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2718 	{0,		(1 << 3),	"10GBASE-SW"},
2719 	{0,		(1 << 2),	"10GBASE-LW"},
2720 	{0,		(1 << 1),	"10GBASE-EW"},
2721 	{0,		(1 << 0),	"Reserved"}
2722 };
2723 static struct mxge_media_type mxge_sfp_media_types[] =
2724 {
2725 	{0,		(1 << 7),	"Reserved"},
2726 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2727 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2728 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2729 };
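
/*
 * The bitmask column mirrors the module's compliance byte, so (a
 * hypothetical reading) an XFP returning 0x40 has bit 6 set and is
 * reported as 10GBASE-LR.  The XFP table's first entry (0x7f) is
 * special: it is matched by equality rather than by a bit test in
 * mxge_media_probe() below.
 */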
2730 
2731 static void
2732 mxge_set_media(mxge_softc_t *sc, int type)
2733 {
2734 	sc->media_flags |= type;
2735 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2736 	ifmedia_set(&sc->media, sc->media_flags);
2737 }
2738 
2739 
2740 /*
2741  * Determine the media type for a NIC.  Some XFPs will identify
2742  * themselves only when their link is up, so this is initiated via a
2743  * link up interrupt.  However, this can potentially take up to
2744  * several milliseconds, so it is run via the watchdog routine, rather
2745  * than in the interrupt handler itself.   This need only be done
2746  * once, not each time the link is up.
2747  */
2748 static void
2749 mxge_media_probe(mxge_softc_t *sc)
2750 {
2751 	mxge_cmd_t cmd;
2752 	char *cage_type;
2753 	char *ptr;
2754 	struct mxge_media_type *mxge_media_types = NULL;
2755 	int i, err, ms, mxge_media_type_entries;
2756 	uint32_t byte;
2757 
2758 	sc->need_media_probe = 0;
2759 
2760 	/* if we've already set a media type, we're done */
2761 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2762 		return;
2763 
2764 	/*
2765 	 * parse the product code to determine the interface type
2766 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2767 	 * after the 3rd dash in the driver's cached copy of the
2768 	 * EEPROM's product code string.
2769 	 */
2770 	ptr = sc->product_code_string;
2771 	if (ptr == NULL) {
2772 	if (ptr == NULL) {
2773 		device_printf(sc->dev, "Missing product code\n");
		return;
2774 	}
2775 	for (i = 0; i < 3; i++, ptr++) {
2776 		ptr = index(ptr, '-');
2777 		if (ptr == NULL) {
2778 			device_printf(sc->dev,
2779 				      "only %d dashes in PC?!?\n", i);
2780 			return;
2781 		}
2782 	}
2783 	if (*ptr == 'C') {
2784 		/* -C is CX4 */
2785 		mxge_set_media(sc, IFM_10G_CX4);
2786 		return;
2787 	}
2788 	else if (*ptr == 'Q') {
2789 		/* -Q is Quad Ribbon Fiber */
2790 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2791 		/* FreeBSD has no media type for Quad ribbon fiber */
2792 		return;
2793 	}
2794 
2795 	if (*ptr == 'R') {
2796 		/* -R is XFP */
2797 		mxge_media_types = mxge_xfp_media_types;
2798 		mxge_media_type_entries =
2799 			sizeof (mxge_xfp_media_types) /
2800 			sizeof (mxge_xfp_media_types[0]);
2801 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2802 		cage_type = "XFP";
2803 	}
2804 
2805 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2806 		/* -S or -2S is SFP+ */
2807 		mxge_media_types = mxge_sfp_media_types;
2808 		mxge_media_type_entries =
2809 			sizeof (mxge_sfp_media_types) /
2810 			sizeof (mxge_sfp_media_types[0]);
2811 		cage_type = "SFP+";
2812 		byte = 3;
2813 	}
2814 
2815 	if (mxge_media_types == NULL) {
2816 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2817 		return;
2818 	}
2819 
2820 	/*
2821 	 * At this point we know the NIC has an XFP cage, so now we
2822 	 * try to determine what is in the cage by using the
2823 	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2824 	 * register.  We read just one byte, which may take over
2825 	 * a millisecond
2826 	 */
2827 
2828 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2829 	cmd.data1 = byte;
2830 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2831 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2832 		device_printf(sc->dev, "failed to read XFP\n");
2833 	}
2834 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2835 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2836 	}
2837 	if (err != MXGEFW_CMD_OK) {
2838 		return;
2839 	}
2840 
2841 	/* now we wait for the data to be cached */
2842 	cmd.data0 = byte;
2843 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2844 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2845 		DELAY(1000);
2846 		cmd.data0 = byte;
2847 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2848 	}
2849 	if (err != MXGEFW_CMD_OK) {
2850 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2851 			      cage_type, err, ms);
2852 		return;
2853 	}
2854 
2855 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2856 		if (mxge_verbose)
2857 			device_printf(sc->dev, "%s:%s\n", cage_type,
2858 				      mxge_media_types[0].name);
2859 		mxge_set_media(sc, IFM_10G_CX4);
2860 		return;
2861 	}
2862 	for (i = 1; i < mxge_media_type_entries; i++) {
2863 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2864 			if (mxge_verbose)
2865 				device_printf(sc->dev, "%s:%s\n",
2866 					      cage_type,
2867 					      mxge_media_types[i].name);
2868 
2869 			mxge_set_media(sc, mxge_media_types[i].flag);
2870 			return;
2871 		}
2872 	}
2873 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2874 		      cmd.data0);
2875 
2876 	return;
2877 }
2878 
2879 static void
2880 mxge_intr(void *arg)
2881 {
2882 	struct mxge_slice_state *ss = arg;
2883 	mxge_softc_t *sc = ss->sc;
2884 	mcp_irq_data_t *stats = ss->fw_stats;
2885 	mxge_tx_ring_t *tx = &ss->tx;
2886 	mxge_rx_done_t *rx_done = &ss->rx_done;
2887 	uint32_t send_done_count;
2888 	uint8_t valid;
2889 
2890 
2891 #ifndef IFNET_BUF_RING
2892 	/* an interrupt on a non-zero slice is implicitly valid
2893 	   since MSI-X irqs are not shared */
2894 	if (ss != sc->ss) {
2895 		mxge_clean_rx_done(ss);
2896 		*ss->irq_claim = be32toh(3);
2897 		return;
2898 	}
2899 #endif
2900 
2901 	/* make sure the DMA has finished */
2902 	if (!stats->valid) {
2903 		return;
2904 	}
2905 	valid = stats->valid;
2906 
2907 	if (sc->legacy_irq) {
2908 		/* lower legacy IRQ  */
2909 		*sc->irq_deassert = 0;
2910 		if (!mxge_deassert_wait)
2911 			/* don't wait for conf. that irq is low */
2912 			stats->valid = 0;
2913 	} else {
2914 		stats->valid = 0;
2915 	}
2916 
2917 	/* loop while waiting for legacy irq deassertion */
2918 	do {
2919 		/* check for transmit completes and receives */
2920 		send_done_count = be32toh(stats->send_done_count);
2921 		while ((send_done_count != tx->pkt_done) ||
2922 		       (rx_done->entry[rx_done->idx].length != 0)) {
2923 			if (send_done_count != tx->pkt_done)
2924 				mxge_tx_done(ss, (int)send_done_count);
2925 			mxge_clean_rx_done(ss);
2926 			send_done_count = be32toh(stats->send_done_count);
2927 		}
2928 		if (sc->legacy_irq && mxge_deassert_wait)
2929 			wmb();
2930 	} while (*((volatile uint8_t *) &stats->valid));
2931 
2932 	/* fw link & error stats meaningful only on the first slice */
2933 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2934 		if (sc->link_state != stats->link_up) {
2935 			sc->link_state = stats->link_up;
2936 			if (sc->link_state) {
2937 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2938 				if (mxge_verbose)
2939 					device_printf(sc->dev, "link up\n");
2940 			} else {
2941 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2942 				if (mxge_verbose)
2943 					device_printf(sc->dev, "link down\n");
2944 			}
2945 			sc->need_media_probe = 1;
2946 		}
2947 		if (sc->rdma_tags_available !=
2948 		    be32toh(stats->rdma_tags_available)) {
2949 			sc->rdma_tags_available =
2950 				be32toh(stats->rdma_tags_available);
2951 			device_printf(sc->dev, "RDMA timed out! %d tags "
2952 				      "left\n", sc->rdma_tags_available);
2953 		}
2954 
2955 		if (stats->link_down) {
2956 			sc->down_cnt += stats->link_down;
2957 			sc->link_state = 0;
2958 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2959 		}
2960 	}
2961 
2962 	/* check to see if we have rx token to pass back */
2963 	if (valid & 0x1)
2964 	    *ss->irq_claim = be32toh(3);
2965 	*(ss->irq_claim + 1) = be32toh(3);
2966 }
2967 
2968 static void
2969 mxge_init(void *arg)
2970 {
2971 }
2972 
2973 
2974 
2975 static void
2976 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2977 {
2978 	struct lro_entry *lro_entry;
2979 	int i;
2980 
2981 	while (!SLIST_EMPTY(&ss->lro_free)) {
2982 		lro_entry = SLIST_FIRST(&ss->lro_free);
2983 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2984 		free(lro_entry, M_DEVBUF);
2985 	}
2986 
2987 	for (i = 0; i <= ss->rx_big.mask; i++) {
2988 		if (ss->rx_big.info[i].m == NULL)
2989 			continue;
2990 		bus_dmamap_unload(ss->rx_big.dmat,
2991 				  ss->rx_big.info[i].map);
2992 		m_freem(ss->rx_big.info[i].m);
2993 		ss->rx_big.info[i].m = NULL;
2994 	}
2995 
2996 	for (i = 0; i <= ss->rx_small.mask; i++) {
2997 		if (ss->rx_small.info[i].m == NULL)
2998 			continue;
2999 		bus_dmamap_unload(ss->rx_small.dmat,
3000 				  ss->rx_small.info[i].map);
3001 		m_freem(ss->rx_small.info[i].m);
3002 		ss->rx_small.info[i].m = NULL;
3003 	}
3004 
3005 	/* transmit ring used only on the first slice */
3006 	if (ss->tx.info == NULL)
3007 		return;
3008 
3009 	for (i = 0; i <= ss->tx.mask; i++) {
3010 		ss->tx.info[i].flag = 0;
3011 		if (ss->tx.info[i].m == NULL)
3012 			continue;
3013 		bus_dmamap_unload(ss->tx.dmat,
3014 				  ss->tx.info[i].map);
3015 		m_freem(ss->tx.info[i].m);
3016 		ss->tx.info[i].m = NULL;
3017 	}
3018 }
3019 
3020 static void
3021 mxge_free_mbufs(mxge_softc_t *sc)
3022 {
3023 	int slice;
3024 
3025 	for (slice = 0; slice < sc->num_slices; slice++)
3026 		mxge_free_slice_mbufs(&sc->ss[slice]);
3027 }
3028 
3029 static void
3030 mxge_free_slice_rings(struct mxge_slice_state *ss)
3031 {
3032 	int i;
3033 
3034 
3035 	if (ss->rx_done.entry != NULL)
3036 		mxge_dma_free(&ss->rx_done.dma);
3037 	ss->rx_done.entry = NULL;
3038 
3039 	if (ss->tx.req_bytes != NULL)
3040 		free(ss->tx.req_bytes, M_DEVBUF);
3041 	ss->tx.req_bytes = NULL;
3042 
3043 	if (ss->tx.seg_list != NULL)
3044 		free(ss->tx.seg_list, M_DEVBUF);
3045 	ss->tx.seg_list = NULL;
3046 
3047 	if (ss->rx_small.shadow != NULL)
3048 		free(ss->rx_small.shadow, M_DEVBUF);
3049 	ss->rx_small.shadow = NULL;
3050 
3051 	if (ss->rx_big.shadow != NULL)
3052 		free(ss->rx_big.shadow, M_DEVBUF);
3053 	ss->rx_big.shadow = NULL;
3054 
3055 	if (ss->tx.info != NULL) {
3056 		if (ss->tx.dmat != NULL) {
3057 			for (i = 0; i <= ss->tx.mask; i++) {
3058 				bus_dmamap_destroy(ss->tx.dmat,
3059 						   ss->tx.info[i].map);
3060 			}
3061 			bus_dma_tag_destroy(ss->tx.dmat);
3062 		}
3063 		free(ss->tx.info, M_DEVBUF);
3064 	}
3065 	ss->tx.info = NULL;
3066 
3067 	if (ss->rx_small.info != NULL) {
3068 		if (ss->rx_small.dmat != NULL) {
3069 			for (i = 0; i <= ss->rx_small.mask; i++) {
3070 				bus_dmamap_destroy(ss->rx_small.dmat,
3071 						   ss->rx_small.info[i].map);
3072 			}
3073 			bus_dmamap_destroy(ss->rx_small.dmat,
3074 					   ss->rx_small.extra_map);
3075 			bus_dma_tag_destroy(ss->rx_small.dmat);
3076 		}
3077 		free(ss->rx_small.info, M_DEVBUF);
3078 	}
3079 	ss->rx_small.info = NULL;
3080 
3081 	if (ss->rx_big.info != NULL) {
3082 		if (ss->rx_big.dmat != NULL) {
3083 			for (i = 0; i <= ss->rx_big.mask; i++) {
3084 				bus_dmamap_destroy(ss->rx_big.dmat,
3085 						   ss->rx_big.info[i].map);
3086 			}
3087 			bus_dmamap_destroy(ss->rx_big.dmat,
3088 					   ss->rx_big.extra_map);
3089 			bus_dma_tag_destroy(ss->rx_big.dmat);
3090 		}
3091 		free(ss->rx_big.info, M_DEVBUF);
3092 	}
3093 	ss->rx_big.info = NULL;
3094 }
3095 
3096 static void
3097 mxge_free_rings(mxge_softc_t *sc)
3098 {
3099 	int slice;
3100 
3101 	for (slice = 0; slice < sc->num_slices; slice++)
3102 		mxge_free_slice_rings(&sc->ss[slice]);
3103 }
3104 
3105 static int
3106 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3107 		       int tx_ring_entries)
3108 {
3109 	mxge_softc_t *sc = ss->sc;
3110 	size_t bytes;
3111 	int err, i;
3112 
3113 	err = ENOMEM;
3114 
3115 	/* allocate per-slice receive resources */
3116 
3117 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3118 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3119 
3120 	/* allocate the rx shadow rings */
3121 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3122 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3123 	if (ss->rx_small.shadow == NULL)
3124 		return err;
3125 
3126 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3127 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3128 	if (ss->rx_big.shadow == NULL)
3129 		return err;
3130 
3131 	/* allocate the rx host info rings */
3132 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3133 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3134 	if (ss->rx_small.info == NULL)
3135 		return err;
3136 
3137 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3138 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3139 	if (ss->rx_big.info == NULL)
3140 		return err;
3141 
3142 	/* allocate the rx busdma resources */
3143 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3144 				 1,			/* alignment */
3145 				 4096,			/* boundary */
3146 				 BUS_SPACE_MAXADDR,	/* low */
3147 				 BUS_SPACE_MAXADDR,	/* high */
3148 				 NULL, NULL,		/* filter */
3149 				 MHLEN,			/* maxsize */
3150 				 1,			/* num segs */
3151 				 MHLEN,			/* maxsegsize */
3152 				 BUS_DMA_ALLOCNOW,	/* flags */
3153 				 NULL, NULL,		/* lock */
3154 				 &ss->rx_small.dmat);	/* tag */
3155 	if (err != 0) {
3156 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3157 			      err);
3158 		return err;
3159 	}
3160 
3161 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3162 				 1,			/* alignment */
3163 #if MXGE_VIRT_JUMBOS
3164 				 4096,			/* boundary */
3165 #else
3166 				 0,			/* boundary */
3167 #endif
3168 				 BUS_SPACE_MAXADDR,	/* low */
3169 				 BUS_SPACE_MAXADDR,	/* high */
3170 				 NULL, NULL,		/* filter */
3171 				 3*4096,		/* maxsize */
3172 #if MXGE_VIRT_JUMBOS
3173 				 3,			/* num segs */
3174 				 4096,			/* maxsegsize*/
3175 #else
3176 				 1,			/* num segs */
3177 				 MJUM9BYTES,		/* maxsegsize*/
3178 #endif
3179 				 BUS_DMA_ALLOCNOW,	/* flags */
3180 				 NULL, NULL,		/* lock */
3181 				 &ss->rx_big.dmat);	/* tag */
3182 	if (err != 0) {
3183 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3184 			      err);
3185 		return err;
3186 	}
3187 	for (i = 0; i <= ss->rx_small.mask; i++) {
3188 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3189 					&ss->rx_small.info[i].map);
3190 		if (err != 0) {
3191 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3192 				      err);
3193 			return err;
3194 		}
3195 	}
3196 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3197 				&ss->rx_small.extra_map);
3198 	if (err != 0) {
3199 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3200 			      err);
3201 		return err;
3202 	}
3203 
3204 	for (i = 0; i <= ss->rx_big.mask; i++) {
3205 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3206 					&ss->rx_big.info[i].map);
3207 		if (err != 0) {
3208 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3209 				      err);
3210 			return err;
3211 		}
3212 	}
3213 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3214 				&ss->rx_big.extra_map);
3215 	if (err != 0) {
3216 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3217 			      err);
3218 		return err;
3219 	}
3220 
3221 	/* now allocate TX resources */
3222 
3223 #ifndef IFNET_BUF_RING
3224 	/* only use a single TX ring for now */
3225 	if (ss != ss->sc->ss)
3226 		return 0;
3227 #endif
3228 
3229 	ss->tx.mask = tx_ring_entries - 1;
3230 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3231 
3232 
3233 	/* allocate the tx request copy block */
3234 	bytes = 8 +
3235 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3236 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3237 	if (ss->tx.req_bytes == NULL)
3238 		return err;
3239 	/* ensure req_list entries are aligned to 8 bytes */
3240 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3241 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3242 
3243 	/* allocate the tx busdma segment list */
3244 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3245 	ss->tx.seg_list = (bus_dma_segment_t *)
3246 		malloc(bytes, M_DEVBUF, M_WAITOK);
3247 	if (ss->tx.seg_list == NULL)
3248 		return err;
3249 
3250 	/* allocate the tx host info ring */
3251 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3252 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3253 	if (ss->tx.info == NULL)
3254 		return err;
3255 
3256 	/* allocate the tx busdma resources */
3257 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3258 				 1,			/* alignment */
3259 				 sc->tx_boundary,	/* boundary */
3260 				 BUS_SPACE_MAXADDR,	/* low */
3261 				 BUS_SPACE_MAXADDR,	/* high */
3262 				 NULL, NULL,		/* filter */
3263 				 65536 + 256,		/* maxsize */
3264 				 ss->tx.max_desc - 2,	/* num segs */
3265 				 sc->tx_boundary,	/* maxsegsz */
3266 				 BUS_DMA_ALLOCNOW,	/* flags */
3267 				 NULL, NULL,		/* lock */
3268 				 &ss->tx.dmat);		/* tag */
3269 
3270 	if (err != 0) {
3271 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3272 			      err);
3273 		return err;
3274 	}
3275 
3276 	/* now use these tags to setup dmamaps for each slot
3277 	   in the ring */
3278 	for (i = 0; i <= ss->tx.mask; i++) {
3279 		err = bus_dmamap_create(ss->tx.dmat, 0,
3280 					&ss->tx.info[i].map);
3281 		if (err != 0) {
3282 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3283 				      err);
3284 			return err;
3285 		}
3286 	}
3287 	return 0;
3288 
3289 }
3290 
3291 static int
3292 mxge_alloc_rings(mxge_softc_t *sc)
3293 {
3294 	mxge_cmd_t cmd;
3295 	int tx_ring_size;
3296 	int tx_ring_entries, rx_ring_entries;
3297 	int err, slice;
3298 
3299 	/* get ring sizes */
3300 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3301 	tx_ring_size = cmd.data0;
3302 	if (err != 0) {
3303 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3304 		goto abort;
3305 	}
3306 
3307 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3308 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3309 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3310 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3311 	IFQ_SET_READY(&sc->ifp->if_snd);
3312 
3313 	for (slice = 0; slice < sc->num_slices; slice++) {
3314 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3315 					     rx_ring_entries,
3316 					     tx_ring_entries);
3317 		if (err != 0)
3318 			goto abort;
3319 	}
3320 	return 0;
3321 
3322 abort:
3323 	mxge_free_rings(sc);
3324 	return err;
3325 
3326 }
3327 
3328 
3329 static void
3330 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3331 {
3332 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3333 
3334 	if (bufsize < MCLBYTES) {
3335 		/* easy, everything fits in a single buffer */
3336 		*big_buf_size = MCLBYTES;
3337 		*cl_size = MCLBYTES;
3338 		*nbufs = 1;
3339 		return;
3340 	}
3341 
3342 	if (bufsize < MJUMPAGESIZE) {
3343 		/* still easy, everything still fits in a single buffer */
3344 		*big_buf_size = MJUMPAGESIZE;
3345 		*cl_size = MJUMPAGESIZE;
3346 		*nbufs = 1;
3347 		return;
3348 	}
3349 #if MXGE_VIRT_JUMBOS
3350 	/* now we need to use virtually contiguous buffers */
3351 	*cl_size = MJUM9BYTES;
3352 	*big_buf_size = 4096;
3353 	*nbufs = mtu / 4096 + 1;
3354 	/* needs to be a power of two, so round up */
3355 	if (*nbufs == 3)
3356 		*nbufs = 4;
3357 #else
3358 	*cl_size = MJUM9BYTES;
3359 	*big_buf_size = MJUM9BYTES;
3360 	*nbufs = 1;
3361 #endif
3362 }
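
/*
 * Worked examples for the sizing above, assuming the stock FreeBSD
 * cluster sizes and MXGEFW_PAD == 2: an mtu of 1500 gives
 * bufsize = 1500 + 14 + 4 + 2 = 1520 < MCLBYTES (2048), so one
 * standard cluster holds the frame.  An mtu of 9000 gives 9020,
 * which exceeds MJUMPAGESIZE on 4KB-page machines, so the driver
 * falls through to 9KB clusters; with MXGE_VIRT_JUMBOS it instead
 * uses 4KB chunks with nbufs = 9000 / 4096 + 1 = 3, rounded up to 4.
 */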
3363 
3364 static int
3365 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3366 {
3367 	mxge_softc_t *sc;
3368 	mxge_cmd_t cmd;
3369 	bus_dmamap_t map;
3370 	struct lro_entry *lro_entry;
3371 	int err, i, slice;
3372 
3373 
3374 	sc = ss->sc;
3375 	slice = ss - sc->ss;
3376 
3377 	SLIST_INIT(&ss->lro_free);
3378 	SLIST_INIT(&ss->lro_active);
3379 
3380 	for (i = 0; i < sc->lro_cnt; i++) {
3381 		lro_entry = (struct lro_entry *)
3382 			malloc(sizeof (*lro_entry), M_DEVBUF,
3383 			       M_NOWAIT | M_ZERO);
3384 		if (lro_entry == NULL) {
3385 			sc->lro_cnt = i;
3386 			break;
3387 		}
3388 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3389 	}
3390 	/* get the lanai pointers to the send and receive rings */
3391 
3392 	err = 0;
3393 #ifndef IFNET_BUF_RING
3394 	/* We currently only send from the first slice */
3395 	if (slice == 0) {
3396 #endif
3397 		cmd.data0 = slice;
3398 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3399 		ss->tx.lanai =
3400 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3401 		ss->tx.send_go = (volatile uint32_t *)
3402 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3403 		ss->tx.send_stop = (volatile uint32_t *)
3404 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3405 #ifndef IFNET_BUF_RING
3406 	}
3407 #endif
3408 	cmd.data0 = slice;
3409 	err |= mxge_send_cmd(sc,
3410 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3411 	ss->rx_small.lanai =
3412 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3413 	cmd.data0 = slice;
3414 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3415 	ss->rx_big.lanai =
3416 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3417 
3418 	if (err != 0) {
3419 		device_printf(sc->dev,
3420 			      "failed to get ring sizes or locations\n");
3421 		return EIO;
3422 	}
3423 
3424 	/* stock receive rings */
3425 	for (i = 0; i <= ss->rx_small.mask; i++) {
3426 		map = ss->rx_small.info[i].map;
3427 		err = mxge_get_buf_small(ss, map, i);
3428 		if (err) {
3429 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3430 				      i, ss->rx_small.mask + 1);
3431 			return ENOMEM;
3432 		}
3433 	}
3434 	for (i = 0; i <= ss->rx_big.mask; i++) {
3435 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3436 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3437 	}
3438 	ss->rx_big.nbufs = nbufs;
3439 	ss->rx_big.cl_size = cl_size;
3440 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3441 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3442 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3443 		map = ss->rx_big.info[i].map;
3444 		err = mxge_get_buf_big(ss, map, i);
3445 		if (err) {
3446 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3447 				      i, ss->rx_big.mask + 1);
3448 			return ENOMEM;
3449 		}
3450 	}
3451 	return 0;
3452 }
3453 
3454 static int
3455 mxge_open(mxge_softc_t *sc)
3456 {
3457 	mxge_cmd_t cmd;
3458 	int err, big_bytes, nbufs, slice, cl_size, i;
3459 	bus_addr_t bus;
3460 	volatile uint8_t *itable;
3461 	struct mxge_slice_state *ss;
3462 
3463 	/* Copy the MAC address in case it was overridden */
3464 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3465 
3466 	err = mxge_reset(sc, 1);
3467 	if (err != 0) {
3468 		device_printf(sc->dev, "failed to reset\n");
3469 		return EIO;
3470 	}
3471 
3472 	if (sc->num_slices > 1) {
3473 		/* setup the indirection table */
3474 		cmd.data0 = sc->num_slices;
3475 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3476 				    &cmd);
3477 
3478 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3479 				     &cmd);
3480 		if (err != 0) {
3481 			device_printf(sc->dev,
3482 				      "failed to setup rss tables\n");
3483 			return err;
3484 		}
3485 
3486 		/* just enable an identity mapping */
3487 		itable = sc->sram + cmd.data0;
3488 		for (i = 0; i < sc->num_slices; i++)
3489 			itable[i] = (uint8_t)i;
3490 
3491 		cmd.data0 = 1;
3492 		cmd.data1 = mxge_rss_hash_type;
3493 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3494 		if (err != 0) {
3495 			device_printf(sc->dev, "failed to enable slices\n");
3496 			return err;
3497 		}
3498 	}
3499 
3500 
3501 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3502 
3503 	cmd.data0 = nbufs;
3504 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3505 			    &cmd);
3506 	/* error is only meaningful if we're trying to set
3507 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3508 	if (err && nbufs > 1) {
3509 		device_printf(sc->dev,
3510 			      "Failed to set always-use-n to %d\n",
3511 			      nbufs);
3512 		return EIO;
3513 	}
3514 	/* Give the firmware the mtu and the big and small buffer
3515 	   sizes.  The firmware wants the big buf size to be a power
3516 	   of two. Luckily, FreeBSD's clusters are powers of two */
3517 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3518 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3519 	cmd.data0 = MHLEN - MXGEFW_PAD;
3520 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3521 			     &cmd);
3522 	cmd.data0 = big_bytes;
3523 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3524 
3525 	if (err != 0) {
3526 		device_printf(sc->dev, "failed to setup params\n");
3527 		goto abort;
3528 	}
3529 
3530 	/* Now give the firmware the pointer to the stats block */
3531 	for (slice = 0;
3532 #ifdef IFNET_BUF_RING
3533 	     slice < sc->num_slices;
3534 #else
3535 	     slice < 1;
3536 #endif
3537 	     slice++) {
3538 		ss = &sc->ss[slice];
3539 		cmd.data0 =
3540 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3541 		cmd.data1 =
3542 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3543 		cmd.data2 = sizeof(struct mcp_irq_data);
3544 		cmd.data2 |= (slice << 16);
3545 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3546 	}
3547 
3548 	if (err != 0) {
3549 		bus = sc->ss->fw_stats_dma.bus_addr;
3550 		bus += offsetof(struct mcp_irq_data, send_done_count);
3551 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3552 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3553 		err = mxge_send_cmd(sc,
3554 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3555 				    &cmd);
3556 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3557 		sc->fw_multicast_support = 0;
3558 	} else {
3559 		sc->fw_multicast_support = 1;
3560 	}
3561 
3562 	if (err != 0) {
3563 		device_printf(sc->dev, "failed to setup params\n");
3564 		goto abort;
3565 	}
3566 
3567 	for (slice = 0; slice < sc->num_slices; slice++) {
3568 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3569 		if (err != 0) {
3570 			device_printf(sc->dev, "couldn't open slice %d\n",
3571 				      slice);
3572 			goto abort;
3573 		}
3574 	}
3575 
3576 	/* Finally, start the firmware running */
3577 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3578 	if (err) {
3579 		device_printf(sc->dev, "Couldn't bring up link\n");
3580 		goto abort;
3581 	}
3582 #ifdef IFNET_BUF_RING
3583 	for (slice = 0; slice < sc->num_slices; slice++) {
3584 		ss = &sc->ss[slice];
3585 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3586 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3587 	}
3588 #endif
3589 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3590 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3591 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3592 
3593 	return 0;
3594 
3595 
3596 abort:
3597 	mxge_free_mbufs(sc);
3598 
3599 	return err;
3600 }
3601 
3602 static int
3603 mxge_close(mxge_softc_t *sc)
3604 {
3605 	mxge_cmd_t cmd;
3606 	int err, old_down_cnt;
3607 #ifdef IFNET_BUF_RING
3608 	struct mxge_slice_state *ss;
3609 	int slice;
3610 #endif
3611 
3612 	callout_stop(&sc->co_hdl);
3613 #ifdef IFNET_BUF_RING
3614 	for (slice = 0; slice < sc->num_slices; slice++) {
3615 		ss = &sc->ss[slice];
3616 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3617 	}
3618 #endif
3619 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3620 	old_down_cnt = sc->down_cnt;
3621 	wmb();
3622 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3623 	if (err) {
3624 		device_printf(sc->dev, "Couldn't bring down link\n");
3625 	}
3626 	if (old_down_cnt == sc->down_cnt) {
3627 		/* wait for down irq */
3628 		DELAY(10 * sc->intr_coal_delay);
3629 	}
3630 	wmb();
3631 	if (old_down_cnt == sc->down_cnt) {
3632 		device_printf(sc->dev, "never got down irq\n");
3633 	}
3634 
3635 	mxge_free_mbufs(sc);
3636 
3637 	return 0;
3638 }
3639 
3640 static void
3641 mxge_setup_cfg_space(mxge_softc_t *sc)
3642 {
3643 	device_t dev = sc->dev;
3644 	int reg;
3645 	uint16_t cmd, lnk, pectl;
3646 
3647 	/* find the PCIe link width and set max read request to 4KB */
3648 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3649 		lnk = pci_read_config(dev, reg + 0x12, 2);
3650 		sc->link_width = (lnk >> 4) & 0x3f;
3651 
3652 		pectl = pci_read_config(dev, reg + 0x8, 2);
3653 		pectl = (pectl & ~0x7000) | (5 << 12);
3654 		pci_write_config(dev, reg + 0x8, pectl, 2);
3655 	}
3656 
3657 	/* Enable DMA and Memory space access */
3658 	pci_enable_busmaster(dev);
3659 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3660 	cmd |= PCIM_CMD_MEMEN;
3661 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3662 }
3663 
3664 static uint32_t
3665 mxge_read_reboot(mxge_softc_t *sc)
3666 {
3667 	device_t dev = sc->dev;
3668 	uint32_t vs;
3669 
3670 	/* find the vendor specific offset */
3671 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3672 		device_printf(sc->dev,
3673 			      "could not find vendor specific offset\n");
3674 		return (uint32_t)-1;
3675 	}
3676 	/* enable read32 mode */
3677 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3678 	/* tell NIC which register to read */
3679 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3680 	return (pci_read_config(dev, vs + 0x14, 4));
3681 }
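
/*
 * The three config-space accesses above use Myricom's vendor-
 * specific capability as an indirect window into the NIC (a sketch
 * of the protocol as this driver uses it): vs + 0x10 selects 32-bit
 * read mode, vs + 0x18 latches the SRAM address to read (0xfffffff0
 * here, where the firmware apparently leaves its reboot status), and
 * vs + 0x14 yields the data.
 */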
3682 
3683 static int
3684 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3685 {
3686 	struct pci_devinfo *dinfo;
3687 	mxge_tx_ring_t *tx;
3688 	int err;
3689 	uint32_t reboot;
3690 	uint16_t cmd;
3691 
3692 	err = ENXIO;
3693 
3694 	device_printf(sc->dev, "Watchdog reset!\n");
3695 
3696 	/*
3697 	 * check to see if the NIC rebooted.  If it did, then all of
3698 	 * PCI config space has been reset, and things like the
3699 	 * busmaster bit will be zero.  If this is the case, then we
3700 	 * must restore PCI config space before the NIC can be used
3701 	 * again
3702 	 */
3703 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3704 	if (cmd == 0xffff) {
3705 		/*
3706 		 * maybe the watchdog caught the NIC rebooting; wait
3707 		 * up to 100ms for it to finish.  If it does not come
3708 		 * back, then give up
3709 		 */
3710 		DELAY(1000*100);
3711 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3712 		if (cmd == 0xffff) {
3713 			device_printf(sc->dev, "NIC disappeared!\n");
3714 			return (err);
3715 		}
3716 	}
3717 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3718 		/* print the reboot status */
3719 		reboot = mxge_read_reboot(sc);
3720 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3721 			      reboot);
3722 		/* restore PCI configuration space */
3723 		dinfo = device_get_ivars(sc->dev);
3724 		pci_cfg_restore(sc->dev, dinfo);
3725 
3726 		/* and redo any changes we made to our config space */
3727 		mxge_setup_cfg_space(sc);
3728 
3729 		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3730 			mxge_close(sc);
3731 			err = mxge_open(sc);
3732 		}
3733 	} else {
3734 		tx = &sc->ss[slice].tx;
3735 		device_printf(sc->dev,
3736 			      "NIC did not reboot, slice %d ring state:\n",
3737 			      slice);
3738 		device_printf(sc->dev,
3739 			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3740 			      tx->req, tx->done, tx->queue_active);
3741 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3742 			      tx->activate, tx->deactivate);
3743 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3744 			      tx->pkt_done,
3745 			      be32toh(sc->ss->fw_stats->send_done_count));
3746 		device_printf(sc->dev, "not resetting\n");
3747 	}
3748 	return (err);
3749 }
3750 
3751 static int
3752 mxge_watchdog(mxge_softc_t *sc)
3753 {
3754 	mxge_tx_ring_t *tx;
3755 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3756 	int i, err = 0;
3757 
3758 	/* see if we have outstanding transmits, which
3759 	   have been pending for more than mxge_ticks */
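	/*
	 * Roughly: a ring is considered hung when sends are
	 * outstanding (req != done), completions have not advanced
	 * since the previous watchdog pass (done == watchdog_done),
	 * and the ring already had work pending at that pass
	 * (watchdog_req != watchdog_done).
	 */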
3760 	for (i = 0;
3761 #ifdef IFNET_BUF_RING
3762 	     (i < sc->num_slices) && (err == 0);
3763 #else
3764 	     (i < 1) && (err == 0);
3765 #endif
3766 	     i++) {
3767 		tx = &sc->ss[i].tx;
3768 		if (tx->req != tx->done &&
3769 		    tx->watchdog_req != tx->watchdog_done &&
3770 		    tx->done == tx->watchdog_done) {
3771 			/* check for pause blocking before resetting */
3772 			if (tx->watchdog_rx_pause == rx_pause)
3773 				err = mxge_watchdog_reset(sc, i);
3774 			else
3775 				device_printf(sc->dev, "Flow control blocking "
3776 					      "xmits, check link partner\n");
3777 		}
3778 
3779 		tx->watchdog_req = tx->req;
3780 		tx->watchdog_done = tx->done;
3781 		tx->watchdog_rx_pause = rx_pause;
3782 	}
3783 
3784 	if (sc->need_media_probe)
3785 		mxge_media_probe(sc);
3786 	return (err);
3787 }
3788 
3789 static void
3790 mxge_update_stats(mxge_softc_t *sc)
3791 {
3792 	struct mxge_slice_state *ss;
3793 	u_long ipackets = 0;
3794 	u_long opackets = 0;
3795 #ifdef IFNET_BUF_RING
3796 	u_long obytes = 0;
3797 	u_long omcasts = 0;
3798 	u_long odrops = 0;
3799 #endif
3800 	u_long oerrors = 0;
3801 	int slice;
3802 
3803 	for (slice = 0; slice < sc->num_slices; slice++) {
3804 		ss = &sc->ss[slice];
3805 		ipackets += ss->ipackets;
3806 		opackets += ss->opackets;
3807 #ifdef IFNET_BUF_RING
3808 		obytes += ss->obytes;
3809 		omcasts += ss->omcasts;
3810 		odrops += ss->tx.br->br_drops;
3811 #endif
3812 		oerrors += ss->oerrors;
3813 	}
3814 	sc->ifp->if_ipackets = ipackets;
3815 	sc->ifp->if_opackets = opackets;
3816 #ifdef IFNET_BUF_RING
3817 	sc->ifp->if_obytes = obytes;
3818 	sc->ifp->if_omcasts = omcasts;
3819 	sc->ifp->if_snd.ifq_drops = odrops;
3820 #endif
3821 	sc->ifp->if_oerrors = oerrors;
3822 }
3823 
3824 static void
3825 mxge_tick(void *arg)
3826 {
3827 	mxge_softc_t *sc = arg;
3828 	int err = 0;
3829 
3830 	/* aggregate stats from different slices */
3831 	mxge_update_stats(sc);
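	/*
	 * The watchdog check itself runs only on every fourth tick,
	 * i.e. roughly every 4 * mxge_ticks / hz seconds (about 2s
	 * with the default mxge_ticks of hz / 2).
	 */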
3832 	if (!sc->watchdog_countdown) {
3833 		err = mxge_watchdog(sc);
3834 		sc->watchdog_countdown = 4;
3835 	}
3836 	sc->watchdog_countdown--;
3837 	if (err == 0)
3838 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3840 }
3841 
3842 static int
3843 mxge_media_change(struct ifnet *ifp)
3844 {
3845 	return EINVAL;
3846 }
3847 
3848 static int
3849 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3850 {
3851 	struct ifnet *ifp = sc->ifp;
3852 	int real_mtu, old_mtu;
3853 	int err = 0;
3854 
3855 
3856 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
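	/* e.g. a 9000 byte MTU implies a 9018 byte frame: 14 byte
	   Ethernet header plus 4 byte 802.1Q VLAN tag */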
3857 	if (real_mtu > sc->max_mtu || real_mtu < 60)
3858 		return EINVAL;
3859 	mtx_lock(&sc->driver_mtx);
3860 	old_mtu = ifp->if_mtu;
3861 	ifp->if_mtu = mtu;
3862 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3863 		mxge_close(sc);
3864 		err = mxge_open(sc);
3865 		if (err != 0) {
3866 			ifp->if_mtu = old_mtu;
3867 			mxge_close(sc);
3868 			(void) mxge_open(sc);
3869 		}
3870 	}
3871 	mtx_unlock(&sc->driver_mtx);
3872 	return err;
3873 }
3874 
3875 static void
3876 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3877 {
3878 	mxge_softc_t *sc = ifp->if_softc;
3879 
3880 
3881 	if (sc == NULL)
3882 		return;
3883 	ifmr->ifm_status = IFM_AVALID;
3884 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3885 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3886 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3887 }
3888 
3889 static int
3890 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3891 {
3892 	mxge_softc_t *sc = ifp->if_softc;
3893 	struct ifreq *ifr = (struct ifreq *)data;
3894 	int err, mask;
3895 
3896 	err = 0;
3897 	switch (command) {
3898 	case SIOCSIFADDR:
3899 	case SIOCGIFADDR:
3900 		err = ether_ioctl(ifp, command, data);
3901 		break;
3902 
3903 	case SIOCSIFMTU:
3904 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3905 		break;
3906 
3907 	case SIOCSIFFLAGS:
3908 		mtx_lock(&sc->driver_mtx);
3909 		if (sc->dying) {
3910 			mtx_unlock(&sc->driver_mtx);
3911 			return EINVAL;
3912 		}
3913 		if (ifp->if_flags & IFF_UP) {
3914 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3915 				err = mxge_open(sc);
3916 			} else {
3917 				/* take care of promisc and allmulti
3918 				   flag changes */
3919 				mxge_change_promisc(sc,
3920 						    ifp->if_flags & IFF_PROMISC);
3921 				mxge_set_multicast_list(sc);
3922 			}
3923 		} else {
3924 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3925 				mxge_close(sc);
3926 			}
3927 		}
3928 		mtx_unlock(&sc->driver_mtx);
3929 		break;
3930 
3931 	case SIOCADDMULTI:
3932 	case SIOCDELMULTI:
3933 		mtx_lock(&sc->driver_mtx);
3934 		mxge_set_multicast_list(sc);
3935 		mtx_unlock(&sc->driver_mtx);
3936 		break;
3937 
3938 	case SIOCSIFCAP:
3939 		mtx_lock(&sc->driver_mtx);
3940 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3941 		if (mask & IFCAP_TXCSUM) {
3942 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3943 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3944 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3945 						      | CSUM_TSO);
3946 			} else {
3947 				ifp->if_capenable |= IFCAP_TXCSUM;
3948 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3949 			}
3950 		} else if (mask & IFCAP_RXCSUM) {
3951 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3952 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3953 				sc->csum_flag = 0;
3954 			} else {
3955 				ifp->if_capenable |= IFCAP_RXCSUM;
3956 				sc->csum_flag = 1;
3957 			}
3958 		}
3959 		if (mask & IFCAP_TSO4) {
3960 			if (IFCAP_TSO4 & ifp->if_capenable) {
3961 				ifp->if_capenable &= ~IFCAP_TSO4;
3962 				ifp->if_hwassist &= ~CSUM_TSO;
3963 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3964 				ifp->if_capenable |= IFCAP_TSO4;
3965 				ifp->if_hwassist |= CSUM_TSO;
3966 			} else {
3967 				printf("mxge requires tx checksum offload"
3968 				       " be enabled to use TSO\n");
3969 				err = EINVAL;
3970 			}
3971 		}
3972 		if (mask & IFCAP_LRO) {
3973 			if (IFCAP_LRO & ifp->if_capenable)
3974 				err = mxge_change_lro_locked(sc, 0);
3975 			else
3976 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3977 		}
3978 		if (mask & IFCAP_VLAN_HWTAGGING)
3979 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3980 		mtx_unlock(&sc->driver_mtx);
3981 		VLAN_CAPABILITIES(ifp);
3982 
3983 		break;
3984 
3985 	case SIOCGIFMEDIA:
3986 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3987 				    &sc->media, command);
3988 		break;
3989 
3990 	default:
3991 		err = ENOTTY;
3992 	}
3993 	return err;
3994 }
3995 
3996 static void
3997 mxge_fetch_tunables(mxge_softc_t *sc)
3998 {
3999 
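	/*
	 * These are kernel environment variables, so they can be set
	 * at boot time from /boot/loader.conf; illustrative example:
	 *
	 *	hw.mxge.max_slices="4"
	 *	hw.mxge.intr_coal_delay="30"
	 */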
4000 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4001 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4002 			  &mxge_flow_control);
4003 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4004 			  &mxge_intr_coal_delay);
4005 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4006 			  &mxge_nvidia_ecrc_enable);
4007 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4008 			  &mxge_force_firmware);
4009 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4010 			  &mxge_deassert_wait);
4011 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4012 			  &mxge_verbose);
4013 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4014 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4015 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4016 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4017 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4018 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4019 	if (sc->lro_cnt != 0)
4020 		mxge_lro_cnt = sc->lro_cnt;
4021 
4022 	if (bootverbose)
4023 		mxge_verbose = 1;
4024 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4025 		mxge_intr_coal_delay = 30;
4026 	if (mxge_ticks == 0)
4027 		mxge_ticks = hz / 2;
4028 	sc->pause = mxge_flow_control;
4029 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4030 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4031 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4032 	}
4033 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4034 	    mxge_initial_mtu < ETHER_MIN_LEN)
4035 		mxge_initial_mtu = ETHERMTU_JUMBO;
4036 }
4037 
4038 
4039 static void
4040 mxge_free_slices(mxge_softc_t *sc)
4041 {
4042 	struct mxge_slice_state *ss;
4043 	int i;
4044 
4045 
4046 	if (sc->ss == NULL)
4047 		return;
4048 
4049 	for (i = 0; i < sc->num_slices; i++) {
4050 		ss = &sc->ss[i];
4051 		if (ss->fw_stats != NULL) {
4052 			mxge_dma_free(&ss->fw_stats_dma);
4053 			ss->fw_stats = NULL;
4054 #ifdef IFNET_BUF_RING
4055 			if (ss->tx.br != NULL) {
4056 				drbr_free(ss->tx.br, M_DEVBUF);
4057 				ss->tx.br = NULL;
4058 			}
4059 #endif
4060 			mtx_destroy(&ss->tx.mtx);
4061 		}
4062 		if (ss->rx_done.entry != NULL) {
4063 			mxge_dma_free(&ss->rx_done.dma);
4064 			ss->rx_done.entry = NULL;
4065 		}
4066 	}
4067 	free(sc->ss, M_DEVBUF);
4068 	sc->ss = NULL;
4069 }
4070 
4071 static int
4072 mxge_alloc_slices(mxge_softc_t *sc)
4073 {
4074 	mxge_cmd_t cmd;
4075 	struct mxge_slice_state *ss;
4076 	size_t bytes;
4077 	int err, i, max_intr_slots;
4078 
4079 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4080 	if (err != 0) {
4081 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4082 		return err;
4083 	}
4084 	sc->rx_ring_size = cmd.data0;
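	/*
	 * Size the interrupt queue to hold a completion for every
	 * receive slot; the factor of 2 is presumably there to cover
	 * both the small and big receive rings.
	 */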
4085 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4086 
4087 	bytes = sizeof (*sc->ss) * sc->num_slices;
4088 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4089 	if (sc->ss == NULL)
4090 		return (ENOMEM);
4091 	for (i = 0; i < sc->num_slices; i++) {
4092 		ss = &sc->ss[i];
4093 
4094 		ss->sc = sc;
4095 
4096 		/* allocate per-slice rx interrupt queues */
4097 
4098 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4099 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4100 		if (err != 0)
4101 			goto abort;
4102 		ss->rx_done.entry = ss->rx_done.dma.addr;
4103 		bzero(ss->rx_done.entry, bytes);
4104 
4105 		/*
4106 		 * allocate the per-slice firmware stats; stats
4107 		 * (including tx) are used only on the first
4108 		 * slice for now
4109 		 */
4110 #ifndef IFNET_BUF_RING
4111 		if (i > 0)
4112 			continue;
4113 #endif
4114 
4115 		bytes = sizeof (*ss->fw_stats);
4116 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4117 				     bytes, 64);
4118 		if (err != 0)
4119 			goto abort;
4120 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4121 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4122 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4123 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4124 #ifdef IFNET_BUF_RING
4125 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4126 					   &ss->tx.mtx);
4127 #endif
4128 	}
4129 
4130 	return (0);
4131 
4132 abort:
4133 	mxge_free_slices(sc);
4134 	return (ENOMEM);
4135 }
4136 
4137 static void
4138 mxge_slice_probe(mxge_softc_t *sc)
4139 {
4140 	mxge_cmd_t cmd;
4141 	char *old_fw;
4142 	int msix_cnt, status, max_intr_slots;
4143 
4144 	sc->num_slices = 1;
4145 	/*
4146 	 *  don't enable multiple slices if they have not been requested,
4147 	 *  or if this is not an SMP system
4148 	 */
4149 
4150 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4151 		return;
4152 
4153 	/* see how many MSI-X interrupts are available */
4154 	msix_cnt = pci_msix_count(sc->dev);
4155 	if (msix_cnt < 2)
4156 		return;
4157 
4158 	/* now load the slice-aware firmware and see what it supports */
4159 	old_fw = sc->fw_name;
4160 	if (old_fw == mxge_fw_aligned)
4161 		sc->fw_name = mxge_fw_rss_aligned;
4162 	else
4163 		sc->fw_name = mxge_fw_rss_unaligned;
4164 	status = mxge_load_firmware(sc, 0);
4165 	if (status != 0) {
4166 		device_printf(sc->dev, "Falling back to a single slice\n");
4167 		return;
4168 	}
4169 
4170 	/* try to send a reset command to the card to see if it
4171 	   is alive */
4172 	memset(&cmd, 0, sizeof (cmd));
4173 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4174 	if (status != 0) {
4175 		device_printf(sc->dev, "failed reset\n");
4176 		goto abort_with_fw;
4177 	}
4178 
4179 	/* get rx ring size */
4180 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4181 	if (status != 0) {
4182 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4183 		goto abort_with_fw;
4184 	}
4185 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4186 
4187 	/* tell it the size of the interrupt queues */
4188 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4189 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4190 	if (status != 0) {
4191 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4192 		goto abort_with_fw;
4193 	}
4194 
4195 	/* ask for the maximum number of slices it supports */
4196 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4197 	if (status != 0) {
4198 		device_printf(sc->dev,
4199 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4200 		goto abort_with_fw;
4201 	}
4202 	sc->num_slices = cmd.data0;
4203 	if (sc->num_slices > msix_cnt)
4204 		sc->num_slices = msix_cnt;
4205 
4206 	if (mxge_max_slices == -1) {
4207 		/* cap to number of CPUs in system */
4208 		if (sc->num_slices > mp_ncpus)
4209 			sc->num_slices = mp_ncpus;
4210 	} else {
4211 		if (sc->num_slices > mxge_max_slices)
4212 			sc->num_slices = mxge_max_slices;
4213 	}
4214 	/* make sure it is a power of two */
4215 	while (sc->num_slices & (sc->num_slices - 1))
4216 		sc->num_slices--;
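	/* e.g. 6 -> 5 -> 4, since n & (n - 1) is zero only when n is
	   a power of 2 */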
4217 
4218 	if (mxge_verbose)
4219 		device_printf(sc->dev, "using %d slices\n",
4220 			      sc->num_slices);
4221 
4222 	return;
4223 
4224 abort_with_fw:
4225 	sc->fw_name = old_fw;
4226 	(void) mxge_load_firmware(sc, 0);
4227 }
4228 
4229 static int
4230 mxge_add_msix_irqs(mxge_softc_t *sc)
4231 {
4232 	size_t bytes;
4233 	int count, err, i, rid;
4234 
4235 	rid = PCIR_BAR(2);
4236 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4237 						    &rid, RF_ACTIVE);
4238 
4239 	if (sc->msix_table_res == NULL) {
4240 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4241 		return ENXIO;
4242 	}
4243 
4244 	count = sc->num_slices;
4245 	err = pci_alloc_msix(sc->dev, &count);
4246 	if (err != 0) {
4247 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4248 			      "err = %d \n", sc->num_slices, err);
4249 		goto abort_with_msix_table;
4250 	}
4251 	if (count < sc->num_slices) {
4252 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4253 			      sc->num_slices, count);
4254 		device_printf(sc->dev,
4255 			      "Try setting hw.mxge.max_slices to %d\n",
4256 			      count);
4257 		err = ENOSPC;
4258 		goto abort_with_msix;
4259 	}
4260 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4261 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4262 	if (sc->msix_irq_res == NULL) {
4263 		err = ENOMEM;
4264 		goto abort_with_msix;
4265 	}
4266 
4267 	for (i = 0; i < sc->num_slices; i++) {
4268 		rid = i + 1;
4269 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4270 							  SYS_RES_IRQ,
4271 							  &rid, RF_ACTIVE);
4272 		if (sc->msix_irq_res[i] == NULL) {
4273 			device_printf(sc->dev, "couldn't allocate IRQ res"
4274 				      " for message %d\n", i);
4275 			err = ENXIO;
4276 			goto abort_with_res;
4277 		}
4278 	}
4279 
4280 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4281 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}

4283 	for (i = 0; i < sc->num_slices; i++) {
4284 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4285 				     INTR_TYPE_NET | INTR_MPSAFE,
4286 #if __FreeBSD_version > 700030
4287 				     NULL,
4288 #endif
4289 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4290 		if (err != 0) {
4291 			device_printf(sc->dev, "couldn't setup intr for "
4292 				      "message %d\n", i);
4293 			goto abort_with_intr;
4294 		}
4295 	}
4296 
4297 	if (mxge_verbose) {
4298 		device_printf(sc->dev, "using %d msix IRQs:",
4299 			      sc->num_slices);
4300 		for (i = 0; i < sc->num_slices; i++)
4301 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4302 		printf("\n");
4303 	}
4304 	return (0);
4305 
4306 abort_with_intr:
4307 	for (i = 0; i < sc->num_slices; i++) {
4308 		if (sc->msix_ih[i] != NULL) {
4309 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4310 					  sc->msix_ih[i]);
4311 			sc->msix_ih[i] = NULL;
4312 		}
4313 	}
4314 	free(sc->msix_ih, M_DEVBUF);
4315 
4316 
4317 abort_with_res:
4318 	for (i = 0; i < sc->num_slices; i++) {
4319 		rid = i + 1;
4320 		if (sc->msix_irq_res[i] != NULL)
4321 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4322 					     sc->msix_irq_res[i]);
4323 		sc->msix_irq_res[i] = NULL;
4324 	}
4325 	free(sc->msix_irq_res, M_DEVBUF);
4326 
4327 
4328 abort_with_msix:
4329 	pci_release_msi(sc->dev);
4330 
4331 abort_with_msix_table:
4332 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4333 			     sc->msix_table_res);
4334 
4335 	return err;
4336 }
4337 
4338 static int
4339 mxge_add_single_irq(mxge_softc_t *sc)
4340 {
4341 	int count, err, rid;
4342 
4343 	count = pci_msi_count(sc->dev);
4344 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4345 		rid = 1;
4346 	} else {
4347 		rid = 0;
4348 		sc->legacy_irq = 1;
4349 	}
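	/* rid 0 is the shared legacy INTx line; a single MSI message
	   shows up as rid 1 */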
4350 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4351 					 1, RF_SHAREABLE | RF_ACTIVE);
4352 	if (sc->irq_res == NULL) {
4353 		device_printf(sc->dev, "could not alloc interrupt\n");
4354 		return ENXIO;
4355 	}
4356 	if (mxge_verbose)
4357 		device_printf(sc->dev, "using %s irq %ld\n",
4358 			      sc->legacy_irq ? "INTx" : "MSI",
4359 			      rman_get_start(sc->irq_res));
4360 	err = bus_setup_intr(sc->dev, sc->irq_res,
4361 			     INTR_TYPE_NET | INTR_MPSAFE,
4362 #if __FreeBSD_version > 700030
4363 			     NULL,
4364 #endif
4365 			     mxge_intr, &sc->ss[0], &sc->ih);
4366 	if (err != 0) {
4367 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4368 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4369 		if (!sc->legacy_irq)
4370 			pci_release_msi(sc->dev);
4371 	}
4372 	return err;
4373 }
4374 
4375 static void
4376 mxge_rem_msix_irqs(mxge_softc_t *sc)
4377 {
4378 	int i, rid;
4379 
4380 	for (i = 0; i < sc->num_slices; i++) {
4381 		if (sc->msix_ih[i] != NULL) {
4382 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4383 					  sc->msix_ih[i]);
4384 			sc->msix_ih[i] = NULL;
4385 		}
4386 	}
4387 	free(sc->msix_ih, M_DEVBUF);
4388 
4389 	for (i = 0; i < sc->num_slices; i++) {
4390 		rid = i + 1;
4391 		if (sc->msix_irq_res[i] != NULL)
4392 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4393 					     sc->msix_irq_res[i]);
4394 		sc->msix_irq_res[i] = NULL;
4395 	}
4396 	free(sc->msix_irq_res, M_DEVBUF);
4397 
4398 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4399 			     sc->msix_table_res);
4400 
4401 	pci_release_msi(sc->dev);
4402 	return;
4403 }
4404 
4405 static void
4406 mxge_rem_single_irq(mxge_softc_t *sc)
4407 {
4408 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4409 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4410 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4411 	if (!sc->legacy_irq)
4412 		pci_release_msi(sc->dev);
4413 }
4414 
4415 static void
4416 mxge_rem_irq(mxge_softc_t *sc)
4417 {
4418 	if (sc->num_slices > 1)
4419 		mxge_rem_msix_irqs(sc);
4420 	else
4421 		mxge_rem_single_irq(sc);
4422 }
4423 
4424 static int
4425 mxge_add_irq(mxge_softc_t *sc)
4426 {
4427 	int err;
4428 
4429 	if (sc->num_slices > 1)
4430 		err = mxge_add_msix_irqs(sc);
4431 	else
4432 		err = mxge_add_single_irq(sc);
4433 
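	/* note: the leading "0 &&" deliberately disables this
	   MSI-X re-add path */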
4434 	if (0 && err == 0 && sc->num_slices > 1) {
4435 		mxge_rem_msix_irqs(sc);
4436 		err = mxge_add_msix_irqs(sc);
4437 	}
4438 	return err;
4439 }
4440 
4441 
4442 static int
4443 mxge_attach(device_t dev)
4444 {
4445 	mxge_softc_t *sc = device_get_softc(dev);
4446 	struct ifnet *ifp;
4447 	int err, rid;
4448 
4449 	sc->dev = dev;
4450 	mxge_fetch_tunables(sc);
4451 
4452 	err = bus_dma_tag_create(NULL,			/* parent */
4453 				 1,			/* alignment */
4454 				 0,			/* boundary */
4455 				 BUS_SPACE_MAXADDR,	/* low */
4456 				 BUS_SPACE_MAXADDR,	/* high */
4457 				 NULL, NULL,		/* filter */
4458 				 65536 + 256,		/* maxsize */
4459 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4460 				 65536,			/* maxsegsize */
4461 				 0,			/* flags */
4462 				 NULL, NULL,		/* lock */
4463 				 &sc->parent_dmat);	/* tag */
4464 
4465 	if (err != 0) {
4466 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4467 			      err);
4468 		goto abort_with_nothing;
4469 	}
4470 
4471 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4472 	if (ifp == NULL) {
4473 		device_printf(dev, "can not if_alloc()\n");
4474 		err = ENOSPC;
4475 		goto abort_with_parent_dmat;
4476 	}
4477 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4478 
4479 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4480 		 device_get_nameunit(dev));
4481 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4482 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4483 		 "%s:drv", device_get_nameunit(dev));
4484 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4485 		 MTX_NETWORK_LOCK, MTX_DEF);
4486 
4487 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4488 
4489 	mxge_setup_cfg_space(sc);
4490 
4491 	/* Map the board into the kernel */
4492 	rid = PCIR_BARS;
4493 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4494 					 ~0, 1, RF_ACTIVE);
4495 	if (sc->mem_res == NULL) {
4496 		device_printf(dev, "could not map memory\n");
4497 		err = ENXIO;
4498 		goto abort_with_lock;
4499 	}
4500 	sc->sram = rman_get_virtual(sc->mem_res);
4501 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4502 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4503 		device_printf(dev, "impossible memory region size %ld\n",
4504 			      rman_get_size(sc->mem_res));
4505 		err = ENXIO;
4506 		goto abort_with_mem_res;
4507 	}
4508 
4509 	/* make a NULL-terminated copy of the EEPROM strings section of
4510 	   lanai SRAM */
4511 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4512 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4513 				rman_get_bushandle(sc->mem_res),
4514 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4515 				sc->eeprom_strings,
4516 				MXGE_EEPROM_STRINGS_SIZE - 2);
4517 	err = mxge_parse_strings(sc);
4518 	if (err != 0)
4519 		goto abort_with_mem_res;
4520 
4521 	/* Enable write combining for efficient use of PCIe bus */
4522 	mxge_enable_wc(sc);
4523 
4524 	/* Allocate the out of band dma memory */
4525 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4526 			     sizeof (mxge_cmd_t), 64);
4527 	if (err != 0)
4528 		goto abort_with_mem_res;
4529 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4530 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4531 	if (err != 0)
4532 		goto abort_with_cmd_dma;
4533 
4534 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4535 	if (err != 0)
4536 		goto abort_with_zeropad_dma;
4537 
4538 	/* select & load the firmware */
4539 	err = mxge_select_firmware(sc);
4540 	if (err != 0)
4541 		goto abort_with_dmabench;
4542 	sc->intr_coal_delay = mxge_intr_coal_delay;
4543 
4544 	mxge_slice_probe(sc);
4545 	err = mxge_alloc_slices(sc);
4546 	if (err != 0)
4547 		goto abort_with_dmabench;
4548 
4549 	err = mxge_reset(sc, 0);
4550 	if (err != 0)
4551 		goto abort_with_slices;
4552 
4553 	err = mxge_alloc_rings(sc);
4554 	if (err != 0) {
4555 		device_printf(sc->dev, "failed to allocate rings\n");
4556 		goto abort_with_dmabench;
4557 	}
4558 
4559 	err = mxge_add_irq(sc);
4560 	if (err != 0) {
4561 		device_printf(sc->dev, "failed to add irq\n");
4562 		goto abort_with_rings;
4563 	}
4564 
4565 	ifp->if_baudrate = IF_Gbps(10UL);
4566 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4567 		IFCAP_VLAN_MTU;
4568 #ifdef INET
4569 	ifp->if_capabilities |= IFCAP_LRO;
4570 #endif
4571 
4572 #ifdef MXGE_NEW_VLAN_API
4573 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4574 #endif
4575 
4576 	sc->max_mtu = mxge_max_mtu(sc);
4577 	if (sc->max_mtu >= 9000)
4578 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4579 	else
4580 		device_printf(dev, "MTU limited to %d.  Install "
4581 			      "latest firmware for 9000 byte jumbo support\n",
4582 			      sc->max_mtu - ETHER_HDR_LEN);
4583 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4584 	ifp->if_capenable = ifp->if_capabilities;
4585 	if (sc->lro_cnt == 0)
4586 		ifp->if_capenable &= ~IFCAP_LRO;
4587 	sc->csum_flag = 1;
4588 	ifp->if_init = mxge_init;
4589 	ifp->if_softc = sc;
4590 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4591 	ifp->if_ioctl = mxge_ioctl;
4592 	ifp->if_start = mxge_start;
4593 	/* Initialise the ifmedia structure */
4594 	ifmedia_init(&sc->media, 0, mxge_media_change,
4595 		     mxge_media_status);
4596 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4597 	mxge_media_probe(sc);
4598 	sc->dying = 0;
4599 	ether_ifattach(ifp, sc->mac_addr);
4600 	/* ether_ifattach sets mtu to ETHERMTU */
4601 	if (mxge_initial_mtu != ETHERMTU)
4602 		mxge_change_mtu(sc, mxge_initial_mtu);
4603 
4604 	mxge_add_sysctls(sc);
4605 #ifdef IFNET_BUF_RING
4606 	ifp->if_transmit = mxge_transmit;
4607 	ifp->if_qflush = mxge_qflush;
4608 #endif
4609 	return 0;
4610 
4611 abort_with_rings:
4612 	mxge_free_rings(sc);
4613 abort_with_slices:
4614 	mxge_free_slices(sc);
4615 abort_with_dmabench:
4616 	mxge_dma_free(&sc->dmabench_dma);
4617 abort_with_zeropad_dma:
4618 	mxge_dma_free(&sc->zeropad_dma);
4619 abort_with_cmd_dma:
4620 	mxge_dma_free(&sc->cmd_dma);
4621 abort_with_mem_res:
4622 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4623 abort_with_lock:
4624 	pci_disable_busmaster(dev);
4625 	mtx_destroy(&sc->cmd_mtx);
4626 	mtx_destroy(&sc->driver_mtx);
4627 	if_free(ifp);
4628 abort_with_parent_dmat:
4629 	bus_dma_tag_destroy(sc->parent_dmat);
4630 
4631 abort_with_nothing:
4632 	return err;
4633 }
4634 
4635 static int
4636 mxge_detach(device_t dev)
4637 {
4638 	mxge_softc_t *sc = device_get_softc(dev);
4639 
4640 	if (mxge_vlans_active(sc)) {
4641 		device_printf(sc->dev,
4642 			      "Detach vlans before removing module\n");
4643 		return EBUSY;
4644 	}
4645 	mtx_lock(&sc->driver_mtx);
4646 	sc->dying = 1;
4647 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4648 		mxge_close(sc);
4649 	mtx_unlock(&sc->driver_mtx);
4650 	ether_ifdetach(sc->ifp);
4651 	callout_drain(&sc->co_hdl);
4652 	ifmedia_removeall(&sc->media);
4653 	mxge_dummy_rdma(sc, 0);
4654 	mxge_rem_sysctls(sc);
4655 	mxge_rem_irq(sc);
4656 	mxge_free_rings(sc);
4657 	mxge_free_slices(sc);
4658 	mxge_dma_free(&sc->dmabench_dma);
4659 	mxge_dma_free(&sc->zeropad_dma);
4660 	mxge_dma_free(&sc->cmd_dma);
4661 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4662 	pci_disable_busmaster(dev);
4663 	mtx_destroy(&sc->cmd_mtx);
4664 	mtx_destroy(&sc->driver_mtx);
4665 	if_free(sc->ifp);
4666 	bus_dma_tag_destroy(sc->parent_dmat);
4667 	return 0;
4668 }
4669 
4670 static int
4671 mxge_shutdown(device_t dev)
4672 {
4673 	return 0;
4674 }
4675 
4676 /*
4677   This file uses Myri10GE driver indentation.
4678 
4679   Local Variables:
4680   c-file-style:"linux"
4681   tab-width:8
4682   End:
4683 */
4684