xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 195ebc7e9e4b129de810833791a19dfb4349d6a9)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 
49 #include <net/if.h>
50 #include <net/if_arp.h>
51 #include <net/ethernet.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
54 
55 #include <net/bpf.h>
56 
57 #include <net/if_types.h>
58 #include <net/if_vlan_var.h>
59 #include <net/zlib.h>
60 
61 #include <netinet/in_systm.h>
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp.h>
65 
66 #include <machine/bus.h>
67 #include <machine/in_cksum.h>
68 #include <machine/resource.h>
69 #include <sys/bus.h>
70 #include <sys/rman.h>
71 #include <sys/smp.h>
72 
73 #include <dev/pci/pcireg.h>
74 #include <dev/pci/pcivar.h>
75 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
76 
77 #include <vm/vm.h>		/* for pmap_mapdev() */
78 #include <vm/pmap.h>
79 
80 #if defined(__i386) || defined(__amd64)
81 #include <machine/specialreg.h>
82 #endif
83 
84 #include <dev/mxge/mxge_mcp.h>
85 #include <dev/mxge/mcp_gen_header.h>
86 /*#define MXGE_FAKE_IFP*/
87 #include <dev/mxge/if_mxge_var.h>
88 #ifdef IFNET_BUF_RING
89 #include <sys/buf_ring.h>
90 #endif
91 
92 /* tunable params */
93 static int mxge_nvidia_ecrc_enable = 1;
94 static int mxge_force_firmware = 0;
95 static int mxge_intr_coal_delay = 30;
96 static int mxge_deassert_wait = 1;
97 static int mxge_flow_control = 1;
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
104 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
105 static char *mxge_fw_aligned = "mxge_eth_z8e";
106 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
107 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
108 
109 static int mxge_probe(device_t dev);
110 static int mxge_attach(device_t dev);
111 static int mxge_detach(device_t dev);
112 static int mxge_shutdown(device_t dev);
113 static void mxge_intr(void *arg);
114 
115 static device_method_t mxge_methods[] =
116 {
117   /* Device interface */
118   DEVMETHOD(device_probe, mxge_probe),
119   DEVMETHOD(device_attach, mxge_attach),
120   DEVMETHOD(device_detach, mxge_detach),
121   DEVMETHOD(device_shutdown, mxge_shutdown),
122   {0, 0}
123 };
124 
125 static driver_t mxge_driver =
126 {
127   "mxge",
128   mxge_methods,
129   sizeof(mxge_softc_t),
130 };
131 
132 static devclass_t mxge_devclass;
133 
134 /* Declare ourselves to be a child of the PCI bus.*/
135 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
136 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
137 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
138 
139 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
140 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
141 static int mxge_close(mxge_softc_t *sc);
142 static int mxge_open(mxge_softc_t *sc);
143 static void mxge_tick(void *arg);
144 
145 static int
146 mxge_probe(device_t dev)
147 {
148 	int rev;
149 
150 
151 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
152 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
153 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
154 		rev = pci_get_revid(dev);
155 		switch (rev) {
156 		case MXGE_PCI_REV_Z8E:
157 			device_set_desc(dev, "Myri10G-PCIE-8A");
158 			break;
159 		case MXGE_PCI_REV_Z8ES:
160 			device_set_desc(dev, "Myri10G-PCIE-8B");
161 			break;
162 		default:
163 			device_set_desc(dev, "Myri10G-PCIE-8??");
164 			device_printf(dev, "Unrecognized rev %d NIC\n",
165 				      rev);
166 			break;
167 		}
168 		return 0;
169 	}
170 	return ENXIO;
171 }
172 
173 static void
174 mxge_enable_wc(mxge_softc_t *sc)
175 {
176 #if defined(__i386) || defined(__amd64)
177 	vm_offset_t len;
178 	int err;
179 
180 	sc->wc = 1;
181 	len = rman_get_size(sc->mem_res);
182 	err = pmap_change_attr((vm_offset_t) sc->sram,
183 			       len, PAT_WRITE_COMBINING);
184 	if (err != 0) {
185 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
186 			      err);
187 		sc->wc = 0;
188 	}
189 #endif
190 }
191 
192 
193 /* callback to get our DMA address */
194 static void
195 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
196 			 int error)
197 {
198 	if (error == 0) {
199 		*(bus_addr_t *) arg = segs->ds_addr;
200 	}
201 }
202 
203 static int
204 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
205 		   bus_size_t alignment)
206 {
207 	int err;
208 	device_t dev = sc->dev;
209 	bus_size_t boundary, maxsegsize;
210 
211 	if (bytes > 4096 && alignment == 4096) {
212 		boundary = 0;
213 		maxsegsize = bytes;
214 	} else {
215 		boundary = 4096;
216 		maxsegsize = 4096;
217 	}
218 
219 	/* allocate DMAable memory tags */
220 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
221 				 alignment,		/* alignment */
222 				 boundary,		/* boundary */
223 				 BUS_SPACE_MAXADDR,	/* low */
224 				 BUS_SPACE_MAXADDR,	/* high */
225 				 NULL, NULL,		/* filter */
226 				 bytes,			/* maxsize */
227 				 1,			/* num segs */
228 				 maxsegsize,		/* maxsegsize */
229 				 BUS_DMA_COHERENT,	/* flags */
230 				 NULL, NULL,		/* lock */
231 				 &dma->dmat);		/* tag */
232 	if (err != 0) {
233 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
234 		return err;
235 	}
236 
237 	/* allocate DMAable memory & map */
238 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
239 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
240 				| BUS_DMA_ZERO),  &dma->map);
241 	if (err != 0) {
242 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
243 		goto abort_with_dmat;
244 	}
245 
246 	/* load the memory */
247 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
248 			      mxge_dmamap_callback,
249 			      (void *)&dma->bus_addr, 0);
250 	if (err != 0) {
251 		device_printf(dev, "couldn't load map (err = %d)\n", err);
252 		goto abort_with_mem;
253 	}
254 	return 0;
255 
256 abort_with_mem:
257 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
258 abort_with_dmat:
259 	(void)bus_dma_tag_destroy(dma->dmat);
260 	return err;
261 }
262 
263 
264 static void
265 mxge_dma_free(mxge_dma_t *dma)
266 {
267 	bus_dmamap_unload(dma->dmat, dma->map);
268 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
269 	(void)bus_dma_tag_destroy(dma->dmat);
270 }
271 
272 /*
273  * The eeprom strings on the lanaiX have the format
274  * SN=x\0
275  * MAC=x:x:x:x:x:x\0
276  * PC=text\0
277  */
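
/*
 * For example (illustrative values only), the string region might
 * contain:
 *
 *   SN=123456\0MAC=00:60:dd:00:00:01\0PC=0123456789\0\0
 *
 * mxge_parse_strings() below walks these NUL-separated strings and
 * fills in sc->serial_number_string, sc->mac_addr[] and
 * sc->product_code_string.
 */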
278 
279 static int
280 mxge_parse_strings(mxge_softc_t *sc)
281 {
282 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
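/* MXGE_NEXT_STRING advances ptr just past the terminating '\0' of
   the current string (the post-increment also consumes the NUL) */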
283 
284 	char *ptr, *limit;
285 	int i, found_mac;
286 
287 	ptr = sc->eeprom_strings;
288 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
289 	found_mac = 0;
290 	while (ptr < limit && *ptr != '\0') {
291 		if (memcmp(ptr, "MAC=", 4) == 0) {
292 			ptr += 1;
293 			sc->mac_addr_string = ptr;
294 			for (i = 0; i < 6; i++) {
295 				ptr += 3;
296 				if ((ptr + 2) > limit)
297 					goto abort;
298 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
299 				found_mac = 1;
300 			}
301 		} else if (memcmp(ptr, "PC=", 3) == 0) {
302 			ptr += 3;
303 			strncpy(sc->product_code_string, ptr,
304 				sizeof (sc->product_code_string) - 1);
305 		} else if (memcmp(ptr, "SN=", 3) == 0) {
306 			ptr += 3;
307 			strncpy(sc->serial_number_string, ptr,
308 				sizeof (sc->serial_number_string) - 1);
309 		}
310 		MXGE_NEXT_STRING(ptr);
311 	}
312 
313 	if (found_mac)
314 		return 0;
315 
316  abort:
317 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
318 
319 	return ENXIO;
320 }
321 
322 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
323 static void
324 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
325 {
326 	uint32_t val;
327 	unsigned long base, off;
328 	char *va, *cfgptr;
329 	device_t pdev, mcp55;
330 	uint16_t vendor_id, device_id, word;
331 	uintptr_t bus, slot, func, ivend, idev;
332 	uint32_t *ptr32;
333 
334 
335 	if (!mxge_nvidia_ecrc_enable)
336 		return;
337 
338 	pdev = device_get_parent(device_get_parent(sc->dev));
339 	if (pdev == NULL) {
340 		device_printf(sc->dev, "could not find parent?\n");
341 		return;
342 	}
343 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
344 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
345 
346 	if (vendor_id != 0x10de)
347 		return;
348 
349 	base = 0;
350 
351 	if (device_id == 0x005d) {
352 		/* ck804, base address is magic */
353 		base = 0xe0000000UL;
354 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
355 		/* mcp55, base address stored in chipset */
356 		mcp55 = pci_find_bsf(0, 0, 0);
357 		if (mcp55 &&
358 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
359 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
360 			word = pci_read_config(mcp55, 0x90, 2);
361 			base = ((unsigned long)word & 0x7ffeU) << 25;
362 		}
363 	}
364 	if (!base)
365 		return;
366 
367 	/* XXXX
368 	   The test below is commented out because it is believed that
369 	   doing a config read/write beyond offset 0xff will access the
370 	   config space of the next larger function.  Uncomment this and
371 	   remove the hacky pmap_mapdev() way of accessing config space
372 	   once FreeBSD grows support for extended PCIe config space access.
373 	*/
374 #if 0
375 	/* See if we can, by some miracle, access the extended
376 	   config space */
377 	val = pci_read_config(pdev, 0x178, 4);
378 	if (val != 0xffffffff) {
379 		val |= 0x40;
380 		pci_write_config(pdev, 0x178, val, 4);
381 		return;
382 	}
383 #endif
384 	/* Rather than using normal pci config space writes, we must
385 	 * map the Nvidia config space ourselves.  This is because on
386 	 * opteron/nvidia class machines the 0xe0000000 mapping is
387 	 * handled by the nvidia chipset; that means the internal PCI
388 	 * device (the on-chip northbridge), the amd-8131 bridge, and
389 	 * things behind them are not visible via this method.
390 	 */
391 
392 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
393 		      PCI_IVAR_BUS, &bus);
394 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
395 		      PCI_IVAR_SLOT, &slot);
396 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
397 		      PCI_IVAR_FUNCTION, &func);
398 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 		      PCI_IVAR_VENDOR, &ivend);
400 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
401 		      PCI_IVAR_DEVICE, &idev);
402 
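	/* This is the standard PCIe ECAM layout: the config space for
	 * (bus, slot, func) lives at base + (bus << 20) + (slot << 15) +
	 * (func << 12), which is exactly what the multiplications below
	 * compute (0x00100000 == 1 << 20, 0x00001000 == 1 << 12).
	 */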
403 	off =  base
404 		+ 0x00100000UL * (unsigned long)bus
405 		+ 0x00001000UL * (unsigned long)(func
406 						 + 8 * slot);
407 
408 	/* map it into the kernel */
409 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
410 
411 
412 	if (va == NULL) {
413 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
414 		return;
415 	}
416 	/* get a pointer to the config space mapped into the kernel */
417 	cfgptr = va + (off & PAGE_MASK);
418 
419 	/* make sure that we can really access it */
420 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
421 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
422 	if (! (vendor_id == ivend && device_id == idev)) {
423 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
424 			      vendor_id, device_id);
425 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
426 		return;
427 	}
428 
429 	ptr32 = (uint32_t*)(cfgptr + 0x178);
430 	val = *ptr32;
431 
432 	if (val == 0xffffffff) {
433 		device_printf(sc->dev, "extended mapping failed\n");
434 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
435 		return;
436 	}
437 	*ptr32 = val | 0x40;
438 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
439 	if (mxge_verbose)
440 		device_printf(sc->dev,
441 			      "Enabled ECRC on upstream Nvidia bridge "
442 			      "at %d:%d:%d\n",
443 			      (int)bus, (int)slot, (int)func);
444 	return;
445 }
446 #else
447 static void
448 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
449 {
450 	device_printf(sc->dev,
451 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
452 	return;
453 }
454 #endif
455 
456 
457 static int
458 mxge_dma_test(mxge_softc_t *sc, int test_type)
459 {
460 	mxge_cmd_t cmd;
461 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
462 	int status;
463 	uint32_t len;
464 	char *test = " ";
465 
466 
467 	/* Run a small DMA test.
468 	 * The magic multipliers to the length tell the firmware
469 	 * to do DMA read, write, or read+write tests.  The
470 	 * results are returned in cmd.data0.  The upper 16
471 	 * bits of the return is the number of transfers completed.
472 	 * The lower 16 bits is the time in 0.5us ticks that the
473 	 * transfers took to complete.
474 	 */
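
	/* For example (hypothetical numbers): with len = 4096 and a
	 * returned cmd.data0 of 0x01000200, 0x0100 (256) transfers
	 * completed in 0x0200 (512) half-microsecond ticks, so the
	 * formula below yields (256 * 4096 * 2) / 512 = 4096 MB/s.
	 */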
475 
476 	len = sc->tx_boundary;
477 
478 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
479 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
480 	cmd.data2 = len * 0x10000;
481 	status = mxge_send_cmd(sc, test_type, &cmd);
482 	if (status != 0) {
483 		test = "read";
484 		goto abort;
485 	}
486 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
487 		(cmd.data0 & 0xffff);
488 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
489 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
490 	cmd.data2 = len * 0x1;
491 	status = mxge_send_cmd(sc, test_type, &cmd);
492 	if (status != 0) {
493 		test = "write";
494 		goto abort;
495 	}
496 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
497 		(cmd.data0 & 0xffff);
498 
499 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
500 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
501 	cmd.data2 = len * 0x10001;
502 	status = mxge_send_cmd(sc, test_type, &cmd);
503 	if (status != 0) {
504 		test = "read/write";
505 		goto abort;
506 	}
507 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
508 		(cmd.data0 & 0xffff);
509 
510 abort:
511 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
512 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
513 			      test, status);
514 
515 	return status;
516 }
517 
518 /*
519  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
520  * when the PCI-E Completion packets are aligned on an 8-byte
521  * boundary.  Some PCI-E chip sets always align Completion packets; on
522  * the ones that do not, the alignment can be enforced by enabling
523  * ECRC generation (if supported).
524  *
525  * When PCI-E Completion packets are not aligned, it is actually more
526  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
527  *
528  * If the driver can neither enable ECRC nor verify that it has
529  * already been enabled, then it must use a firmware image which works
530  * around unaligned completion packets (ethp_z8e.dat), and it should
531  * also ensure that it never gives the device a Read-DMA which is
532  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
533  * enabled, then the driver should use the aligned (eth_z8e.dat)
534  * firmware image, and set tx_boundary to 4KB.
535  */
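
/*
 * In short, the driver ends up in one of two configurations:
 *
 *   aligned completions (or ECRC enabled): eth_z8e.dat,  tx_boundary = 4096
 *   unaligned completions, no ECRC:        ethp_z8e.dat, tx_boundary = 2048
 *
 * mxge_firmware_probe() below determines which case applies.
 */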
536 
537 static int
538 mxge_firmware_probe(mxge_softc_t *sc)
539 {
540 	device_t dev = sc->dev;
541 	int reg, status;
542 	uint16_t pectl;
543 
544 	sc->tx_boundary = 4096;
545 	/*
546 	 * Verify the max read request size was set to 4KB
547 	 * before trying the test with 4KB.
548 	 */
549 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
550 		pectl = pci_read_config(dev, reg + 0x8, 2);
551 		if ((pectl & (5 << 12)) != (5 << 12)) {
552 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
553 				      pectl);
554 			sc->tx_boundary = 2048;
555 		}
556 	}
557 
558 	/*
559 	 * load the optimized firmware (which assumes aligned PCIe
560 	 * completions) in order to see if it works on this host.
561 	 */
562 	sc->fw_name = mxge_fw_aligned;
563 	status = mxge_load_firmware(sc, 1);
564 	if (status != 0) {
565 		return status;
566 	}
567 
568 	/*
569 	 * Enable ECRC if possible
570 	 */
571 	mxge_enable_nvidia_ecrc(sc);
572 
573 	/*
574 	 * Run a DMA test which watches for unaligned completions and
575 	 * aborts on the first one seen.
576 	 */
577 
578 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
579 	if (status == 0)
580 		return 0; /* keep the aligned firmware */
581 
582 	if (status != E2BIG)
583 		device_printf(dev, "DMA test failed: %d\n", status);
584 	if (status == ENOSYS)
585 		device_printf(dev, "Falling back to ethp! "
586 			      "Please install up to date fw\n");
587 	return status;
588 }
589 
590 static int
591 mxge_select_firmware(mxge_softc_t *sc)
592 {
593 	int aligned = 0;
594 
595 
596 	if (mxge_force_firmware != 0) {
597 		if (mxge_force_firmware == 1)
598 			aligned = 1;
599 		else
600 			aligned = 0;
601 		if (mxge_verbose)
602 			device_printf(sc->dev,
603 				      "Assuming %s completions (forced)\n",
604 				      aligned ? "aligned" : "unaligned");
605 		goto abort;
606 	}
607 
608 	/* if the PCIe link width is 4 or less, we can use the aligned
609 	   firmware and skip any checks */
610 	if (sc->link_width != 0 && sc->link_width <= 4) {
611 		device_printf(sc->dev,
612 			      "PCIe x%d Link, expect reduced performance\n",
613 			      sc->link_width);
614 		aligned = 1;
615 		goto abort;
616 	}
617 
618 	if (0 == mxge_firmware_probe(sc))
619 		return 0;
620 
621 abort:
622 	if (aligned) {
623 		sc->fw_name = mxge_fw_aligned;
624 		sc->tx_boundary = 4096;
625 	} else {
626 		sc->fw_name = mxge_fw_unaligned;
627 		sc->tx_boundary = 2048;
628 	}
629 	return (mxge_load_firmware(sc, 0));
630 }
631 
632 union qualhack
633 {
634         const char *ro_char;
635         char *rw_char;
636 };
637 
638 static int
639 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
640 {
641 
642 
643 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
644 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
645 			      be32toh(hdr->mcp_type));
646 		return EIO;
647 	}
648 
649 	/* save firmware version for sysctl */
650 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
651 	if (mxge_verbose)
652 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
653 
654 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
655 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
656 
657 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
658 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
659 		device_printf(sc->dev, "Found firmware version %s\n",
660 			      sc->fw_version);
661 		device_printf(sc->dev, "Driver needs %d.%d\n",
662 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
663 		return EINVAL;
664 	}
665 	return 0;
666 
667 }
668 
669 static void *
670 z_alloc(void *nil, u_int items, u_int size)
671 {
672         void *ptr;
673 
674         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
675         return ptr;
676 }
677 
678 static void
679 z_free(void *nil, void *ptr)
680 {
681         free(ptr, M_TEMP);
682 }
683 
684 
685 static int
686 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
687 {
688 	z_stream zs;
689 	char *inflate_buffer;
690 	const struct firmware *fw;
691 	const mcp_gen_header_t *hdr;
692 	unsigned hdr_offset;
693 	int status;
694 	unsigned int i;
695 	char dummy;
696 	size_t fw_len;
697 
698 	fw = firmware_get(sc->fw_name);
699 	if (fw == NULL) {
700 		device_printf(sc->dev, "Could not find firmware image %s\n",
701 			      sc->fw_name);
702 		return ENOENT;
703 	}
704 
705 
706 
707 	/* setup zlib and decompress f/w */
708 	bzero(&zs, sizeof (zs));
709 	zs.zalloc = z_alloc;
710 	zs.zfree = z_free;
711 	status = inflateInit(&zs);
712 	if (status != Z_OK) {
713 		status = EIO;
714 		goto abort_with_fw;
715 	}
716 
717 	/* the uncompressed size is stored as the firmware version,
718 	   which would otherwise go unused */
719 	fw_len = (size_t) fw->version;
720 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
721 	if (inflate_buffer == NULL) {
		status = ENOMEM;	/* malloc failed; don't return stale Z_OK (0) */
722 		goto abort_with_zs;
	}
723 	zs.avail_in = fw->datasize;
724 	zs.next_in = __DECONST(char *, fw->data);
725 	zs.avail_out = fw_len;
726 	zs.next_out = inflate_buffer;
727 	status = inflate(&zs, Z_FINISH);
728 	if (status != Z_STREAM_END) {
729 		device_printf(sc->dev, "zlib %d\n", status);
730 		status = EIO;
731 		goto abort_with_buffer;
732 	}
733 
734 	/* check id */
735 	hdr_offset = htobe32(*(const uint32_t *)
736 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
737 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
738 		device_printf(sc->dev, "Bad firmware file");
739 		status = EIO;
740 		goto abort_with_buffer;
741 	}
742 	hdr = (const void*)(inflate_buffer + hdr_offset);
743 
744 	status = mxge_validate_firmware(sc, hdr);
745 	if (status != 0)
746 		goto abort_with_buffer;
747 
748 	/* Copy the inflated firmware to NIC SRAM. */
749 	for (i = 0; i < fw_len; i += 256) {
750 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
751 			      inflate_buffer + i,
752 			      min(256U, (unsigned)(fw_len - i)));
753 		wmb();
754 		dummy = *sc->sram;
755 		wmb();
756 	}
757 
758 	*limit = fw_len;
759 	status = 0;
760 abort_with_buffer:
761 	free(inflate_buffer, M_TEMP);
762 abort_with_zs:
763 	inflateEnd(&zs);
764 abort_with_fw:
765 	firmware_put(fw, FIRMWARE_UNLOAD);
766 	return status;
767 }
768 
769 /*
770  * Enable or disable periodic RDMAs from the host to make certain
771  * chipsets resend dropped PCIe messages
772  */
773 
774 static void
775 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
776 {
777 	char buf_bytes[72];
778 	volatile uint32_t *confirm;
779 	volatile char *submit;
780 	uint32_t *buf, dma_low, dma_high;
781 	int i;
782 
783 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
784 
785 	/* clear confirmation addr */
786 	confirm = (volatile uint32_t *)sc->cmd;
787 	*confirm = 0;
788 	wmb();
789 
790 	/* send an rdma command to the PCIe engine, and wait for the
791 	   response in the confirmation address.  The firmware should
792 	   write a -1 there to indicate it is alive and well
793 	*/
794 
795 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
796 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
797 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
798 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
799 	buf[2] = htobe32(0xffffffff);		/* confirm data */
800 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
801 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
802 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
803 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
804 	buf[5] = htobe32(enable);			/* enable? */
805 
806 
807 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
808 
809 	mxge_pio_copy(submit, buf, 64);
810 	wmb();
811 	DELAY(1000);
812 	wmb();
813 	i = 0;
814 	while (*confirm != 0xffffffff && i < 20) {
815 		DELAY(1000);
816 		i++;
817 	}
818 	if (*confirm != 0xffffffff) {
819 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
820 			      (enable ? "enable" : "disable"), confirm,
821 			      *confirm);
822 	}
823 	return;
824 }
825 
826 static int
827 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
828 {
829 	mcp_cmd_t *buf;
830 	char buf_bytes[sizeof(*buf) + 8];
831 	volatile mcp_cmd_response_t *response = sc->cmd;
832 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
833 	uint32_t dma_low, dma_high;
834 	int err, sleep_total = 0;
835 
836 	/* ensure buf is aligned to 8 bytes */
837 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
838 
839 	buf->data0 = htobe32(data->data0);
840 	buf->data1 = htobe32(data->data1);
841 	buf->data2 = htobe32(data->data2);
842 	buf->cmd = htobe32(cmd);
843 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
844 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
845 
846 	buf->response_addr.low = htobe32(dma_low);
847 	buf->response_addr.high = htobe32(dma_high);
848 	mtx_lock(&sc->cmd_mtx);
849 	response->result = 0xffffffff;
850 	wmb();
851 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
852 
853 	/* wait up to 20ms */
854 	err = EAGAIN;
855 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
856 		bus_dmamap_sync(sc->cmd_dma.dmat,
857 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
858 		wmb();
859 		switch (be32toh(response->result)) {
860 		case 0:
861 			data->data0 = be32toh(response->data);
862 			err = 0;
863 			break;
864 		case 0xffffffff:
865 			DELAY(1000);
866 			break;
867 		case MXGEFW_CMD_UNKNOWN:
868 			err = ENOSYS;
869 			break;
870 		case MXGEFW_CMD_ERROR_UNALIGNED:
871 			err = E2BIG;
872 			break;
873 		case MXGEFW_CMD_ERROR_BUSY:
874 			err = EBUSY;
875 			break;
876 		default:
877 			device_printf(sc->dev,
878 				      "mxge: command %d "
879 				      "failed, result = %d\n",
880 				      cmd, be32toh(response->result));
881 			err = ENXIO;
882 			break;
883 		}
884 		if (err != EAGAIN)
885 			break;
886 	}
887 	if (err == EAGAIN)
888 		device_printf(sc->dev, "mxge: command %d timed out"
889 			      "result = %d\n",
890 			      cmd, be32toh(response->result));
891 	mtx_unlock(&sc->cmd_mtx);
892 	return err;
893 }
894 
895 static int
896 mxge_adopt_running_firmware(mxge_softc_t *sc)
897 {
898 	struct mcp_gen_header *hdr;
899 	const size_t bytes = sizeof (struct mcp_gen_header);
900 	size_t hdr_offset;
901 	int status;
902 
903 	/* find running firmware header */
904 	hdr_offset = htobe32(*(volatile uint32_t *)
905 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
906 
907 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
908 		device_printf(sc->dev,
909 			      "Running firmware has bad header offset (%d)\n",
910 			      (int)hdr_offset);
911 		return EIO;
912 	}
913 
914 	/* copy header of running firmware from SRAM to host memory to
915 	 * validate firmware */
916 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
917 	if (hdr == NULL) {
918 		device_printf(sc->dev, "could not malloc firmware hdr\n");
919 		return ENOMEM;
920 	}
921 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
922 				rman_get_bushandle(sc->mem_res),
923 				hdr_offset, (char *)hdr, bytes);
924 	status = mxge_validate_firmware(sc, hdr);
925 	free(hdr, M_DEVBUF);
926 
927 	/*
928 	 * check to see if adopted firmware has bug where adopting
929 	 * it will cause broadcasts to be filtered unless the NIC
930 	 * is kept in ALLMULTI mode
931 	 */
932 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
933 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
934 		sc->adopted_rx_filter_bug = 1;
935 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
936 			      "working around rx filter bug\n",
937 			      sc->fw_ver_major, sc->fw_ver_minor,
938 			      sc->fw_ver_tiny);
939 	}
940 
941 	return status;
942 }
943 
944 
945 static int
946 mxge_load_firmware(mxge_softc_t *sc, int adopt)
947 {
948 	volatile uint32_t *confirm;
949 	volatile char *submit;
950 	char buf_bytes[72];
951 	uint32_t *buf, size, dma_low, dma_high;
952 	int status, i;
953 
954 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
955 
956 	size = sc->sram_size;
957 	status = mxge_load_firmware_helper(sc, &size);
958 	if (status) {
959 		if (!adopt)
960 			return status;
961 		/* Try to use the currently running firmware, if
962 		   it is new enough */
963 		status = mxge_adopt_running_firmware(sc);
964 		if (status) {
965 			device_printf(sc->dev,
966 				      "failed to adopt running firmware\n");
967 			return status;
968 		}
969 		device_printf(sc->dev,
970 			      "Successfully adopted running firmware\n");
971 		if (sc->tx_boundary == 4096) {
972 			device_printf(sc->dev,
973 				"Using firmware currently running on NIC"
974 				 ".  For optimal\n");
975 			device_printf(sc->dev,
976 				 "performance consider loading optimized "
977 				 "firmware\n");
978 		}
979 		sc->fw_name = mxge_fw_unaligned;
980 		sc->tx_boundary = 2048;
981 		return 0;
982 	}
983 	/* clear confirmation addr */
984 	confirm = (volatile uint32_t *)sc->cmd;
985 	*confirm = 0;
986 	wmb();
987 	/* send a reload command to the bootstrap MCP, and wait for the
988 	   response in the confirmation address.  The firmware should
989 	   write a -1 there to indicate it is alive and well
990 	*/
991 
992 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
993 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
994 
995 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
996 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
997 	buf[2] = htobe32(0xffffffff);	/* confirm data */
998 
999 	/* FIX: All newest firmware should un-protect the bottom of
1000 	   the sram before handoff. However, the very first interfaces
1001 	   do not. Therefore the handoff copy must skip the first 8 bytes
1002 	*/
1003 					/* where the code starts*/
1004 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1005 	buf[4] = htobe32(size - 8); 	/* length of code */
1006 	buf[5] = htobe32(8);		/* where to copy to */
1007 	buf[6] = htobe32(0);		/* where to jump to */
1008 
1009 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1010 	mxge_pio_copy(submit, buf, 64);
1011 	wmb();
1012 	DELAY(1000);
1013 	wmb();
1014 	i = 0;
1015 	while (*confirm != 0xffffffff && i < 20) {
1016 		DELAY(1000*10);
1017 		i++;
1018 		bus_dmamap_sync(sc->cmd_dma.dmat,
1019 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1020 	}
1021 	if (*confirm != 0xffffffff) {
1022 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1023 			confirm, *confirm);
1024 
1025 		return ENXIO;
1026 	}
1027 	return 0;
1028 }
1029 
1030 static int
1031 mxge_update_mac_address(mxge_softc_t *sc)
1032 {
1033 	mxge_cmd_t cmd;
1034 	uint8_t *addr = sc->mac_addr;
1035 	int status;
1036 
1037 
1038 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1039 		     | (addr[2] << 8) | addr[3]);
1040 
1041 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1042 
1043 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1044 	return status;
1045 }
1046 
1047 static int
1048 mxge_change_pause(mxge_softc_t *sc, int pause)
1049 {
1050 	mxge_cmd_t cmd;
1051 	int status;
1052 
1053 	if (pause)
1054 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1055 				       &cmd);
1056 	else
1057 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1058 				       &cmd);
1059 
1060 	if (status) {
1061 		device_printf(sc->dev, "Failed to set flow control mode\n");
1062 		return ENXIO;
1063 	}
1064 	sc->pause = pause;
1065 	return 0;
1066 }
1067 
1068 static void
1069 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1070 {
1071 	mxge_cmd_t cmd;
1072 	int status;
1073 
1074 	if (mxge_always_promisc)
1075 		promisc = 1;
1076 
1077 	if (promisc)
1078 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1079 				       &cmd);
1080 	else
1081 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1082 				       &cmd);
1083 
1084 	if (status) {
1085 		device_printf(sc->dev, "Failed to set promisc mode\n");
1086 	}
1087 }
1088 
1089 static void
1090 mxge_set_multicast_list(mxge_softc_t *sc)
1091 {
1092 	mxge_cmd_t cmd;
1093 	struct ifmultiaddr *ifma;
1094 	struct ifnet *ifp = sc->ifp;
1095 	int err;
1096 
1097 	/* This firmware is known to not support multicast */
1098 	if (!sc->fw_multicast_support)
1099 		return;
1100 
1101 	/* Disable multicast filtering while we play with the lists*/
1102 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1103 	if (err != 0) {
1104 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1105 		       " error status: %d\n", err);
1106 		return;
1107 	}
1108 
1109 	if (sc->adopted_rx_filter_bug)
1110 		return;
1111 
1112 	if (ifp->if_flags & IFF_ALLMULTI)
1113 		/* request to disable multicast filtering, so quit here */
1114 		return;
1115 
1116 	/* Flush all the filters */
1117 
1118 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1119 	if (err != 0) {
1120 		device_printf(sc->dev,
1121 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1122 			      ", error status: %d\n", err);
1123 		return;
1124 	}
1125 
1126 	/* Walk the multicast list, and add each address */
1127 
1128 	IF_ADDR_LOCK(ifp);
1129 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1130 		if (ifma->ifma_addr->sa_family != AF_LINK)
1131 			continue;
1132 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1133 		      &cmd.data0, 4);
1134 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1135 		      &cmd.data1, 2);
1136 		cmd.data0 = htonl(cmd.data0);
1137 		cmd.data1 = htonl(cmd.data1);
1138 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1139 		if (err != 0) {
1140 			device_printf(sc->dev, "Failed "
1141 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1142 			       "%d\t", err);
1143 			/* abort, leaving multicast filtering off */
1144 			IF_ADDR_UNLOCK(ifp);
1145 			return;
1146 		}
1147 	}
1148 	IF_ADDR_UNLOCK(ifp);
1149 	/* Enable multicast filtering */
1150 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1151 	if (err != 0) {
1152 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1153 		       ", error status: %d\n", err);
1154 	}
1155 }
1156 
1157 static int
1158 mxge_max_mtu(mxge_softc_t *sc)
1159 {
1160 	mxge_cmd_t cmd;
1161 	int status;
1162 
1163 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1164 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1165 
1166 	/* try to set nbufs to see if we can
1167 	   use virtually contiguous jumbos */
1168 	cmd.data0 = 0;
1169 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1170 			       &cmd);
1171 	if (status == 0)
1172 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1173 
1174 	/* otherwise, we're limited to MJUMPAGESIZE */
1175 	return MJUMPAGESIZE - MXGEFW_PAD;
1176 }
1177 
1178 static int
1179 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1180 {
1181 	struct mxge_slice_state *ss;
1182 	mxge_rx_done_t *rx_done;
1183 	volatile uint32_t *irq_claim;
1184 	mxge_cmd_t cmd;
1185 	int slice, status;
1186 
1187 	/* try to send a reset command to the card to see if it
1188 	   is alive */
1189 	memset(&cmd, 0, sizeof (cmd));
1190 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1191 	if (status != 0) {
1192 		device_printf(sc->dev, "failed reset\n");
1193 		return ENXIO;
1194 	}
1195 
1196 	mxge_dummy_rdma(sc, 1);
1197 
1198 
1199 	/* set the intrq size */
1200 	cmd.data0 = sc->rx_ring_size;
1201 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1202 
1203 	/*
1204 	 * Even though we already know how many slices are supported
1205 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1206 	 * has magic side effects, and must be called after a reset.
1207 	 * It must be called prior to calling any RSS related cmds,
1208 	 * including assigning an interrupt queue for anything but
1209 	 * slice 0.  It must also be called *after*
1210 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1211 	 * the firmware to compute offsets.
1212 	 */
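
	/* To summarize, the required command order is:
	 *   MXGEFW_CMD_RESET, MXGEFW_CMD_SET_INTRQ_SIZE,
	 *   MXGEFW_CMD_GET_MAX_RSS_QUEUES, MXGEFW_CMD_ENABLE_RSS_QUEUES,
	 *   then the per-slice MXGEFW_CMD_SET_INTRQ_DMA commands.
	 */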
1213 
1214 	if (sc->num_slices > 1) {
1215 		/* ask the maximum number of slices it supports */
1216 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1217 					   &cmd);
1218 		if (status != 0) {
1219 			device_printf(sc->dev,
1220 				      "failed to get number of slices\n");
1221 			return status;
1222 		}
1223 		/*
1224 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1225 		 * to setting up the interrupt queue DMA
1226 		 */
1227 		cmd.data0 = sc->num_slices;
1228 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1229 #ifdef IFNET_BUF_RING
1230 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1231 #endif
1232 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1233 					   &cmd);
1234 		if (status != 0) {
1235 			device_printf(sc->dev,
1236 				      "failed to set number of slices\n");
1237 			return status;
1238 		}
1239 	}
1240 
1241 
1242 	if (interrupts_setup) {
1243 		/* Now exchange information about interrupts  */
1244 		for (slice = 0; slice < sc->num_slices; slice++) {
1245 			rx_done = &sc->ss[slice].rx_done;
1246 			memset(rx_done->entry, 0, sc->rx_ring_size);
1247 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1248 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1249 			cmd.data2 = slice;
1250 			status |= mxge_send_cmd(sc,
1251 						MXGEFW_CMD_SET_INTRQ_DMA,
1252 						&cmd);
1253 		}
1254 	}
1255 
1256 	status |= mxge_send_cmd(sc,
1257 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1258 
1259 
1260 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1261 
1262 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1263 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1264 
1265 
1266 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1267 				&cmd);
1268 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1269 	if (status != 0) {
1270 		device_printf(sc->dev, "failed set interrupt parameters\n");
1271 		return status;
1272 	}
1273 
1274 
1275 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1276 
1277 
1278 	/* run a DMA benchmark */
1279 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1280 
1281 	for (slice = 0; slice < sc->num_slices; slice++) {
1282 		ss = &sc->ss[slice];
1283 
1284 		ss->irq_claim = irq_claim + (2 * slice);
1285 		/* reset mcp/driver shared state back to 0 */
1286 		ss->rx_done.idx = 0;
1287 		ss->rx_done.cnt = 0;
1288 		ss->tx.req = 0;
1289 		ss->tx.done = 0;
1290 		ss->tx.pkt_done = 0;
1291 		ss->tx.queue_active = 0;
1292 		ss->tx.activate = 0;
1293 		ss->tx.deactivate = 0;
1294 		ss->tx.wake = 0;
1295 		ss->tx.defrag = 0;
1296 		ss->tx.stall = 0;
1297 		ss->rx_big.cnt = 0;
1298 		ss->rx_small.cnt = 0;
1299 		ss->lro_bad_csum = 0;
1300 		ss->lro_queued = 0;
1301 		ss->lro_flushed = 0;
1302 		if (ss->fw_stats != NULL) {
1303 			ss->fw_stats->valid = 0;
1304 			ss->fw_stats->send_done_count = 0;
1305 		}
1306 	}
1307 	sc->rdma_tags_available = 15;
1308 	status = mxge_update_mac_address(sc);
1309 	mxge_change_promisc(sc, 0);
1310 	mxge_change_pause(sc, sc->pause);
1311 	mxge_set_multicast_list(sc);
1312 	return status;
1313 }
1314 
1315 static int
1316 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1317 {
1318         mxge_softc_t *sc;
1319         unsigned int intr_coal_delay;
1320         int err;
1321 
1322         sc = arg1;
1323         intr_coal_delay = sc->intr_coal_delay;
1324         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1325         if (err != 0) {
1326                 return err;
1327         }
1328         if (intr_coal_delay == sc->intr_coal_delay)
1329                 return 0;
1330 
1331         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1332                 return EINVAL;
1333 
1334 	mtx_lock(&sc->driver_mtx);
1335 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1336 	sc->intr_coal_delay = intr_coal_delay;
1337 
1338 	mtx_unlock(&sc->driver_mtx);
1339         return err;
1340 }
1341 
1342 static int
1343 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1344 {
1345         mxge_softc_t *sc;
1346         unsigned int enabled;
1347         int err;
1348 
1349         sc = arg1;
1350         enabled = sc->pause;
1351         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1352         if (err != 0) {
1353                 return err;
1354         }
1355         if (enabled == sc->pause)
1356                 return 0;
1357 
1358 	mtx_lock(&sc->driver_mtx);
1359 	err = mxge_change_pause(sc, enabled);
1360 	mtx_unlock(&sc->driver_mtx);
1361         return err;
1362 }
1363 
1364 static int
1365 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1366 {
1367 	struct ifnet *ifp;
1368 	int err = 0;
1369 
1370 	ifp = sc->ifp;
1371 	if (lro_cnt == 0)
1372 		ifp->if_capenable &= ~IFCAP_LRO;
1373 	else
1374 		ifp->if_capenable |= IFCAP_LRO;
1375 	sc->lro_cnt = lro_cnt;
1376 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1377 		mxge_close(sc);
1378 		err = mxge_open(sc);
1379 	}
1380 	return err;
1381 }
1382 
1383 static int
1384 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1385 {
1386 	mxge_softc_t *sc;
1387 	unsigned int lro_cnt;
1388 	int err;
1389 
1390 	sc = arg1;
1391 	lro_cnt = sc->lro_cnt;
1392 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1393 	if (err != 0)
1394 		return err;
1395 
1396 	if (lro_cnt == sc->lro_cnt)
1397 		return 0;
1398 
1399 	if (lro_cnt > 128)
1400 		return EINVAL;
1401 
1402 	mtx_lock(&sc->driver_mtx);
1403 	err = mxge_change_lro_locked(sc, lro_cnt);
1404 	mtx_unlock(&sc->driver_mtx);
1405 	return err;
1406 }
1407 
1408 static int
1409 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1410 {
1411         int err;
1412 
1413         if (arg1 == NULL)
1414                 return EFAULT;
1415         arg2 = be32toh(*(int *)arg1);
1416         arg1 = NULL;
1417         err = sysctl_handle_int(oidp, arg1, arg2, req);
1418 
1419         return err;
1420 }
1421 
1422 static void
1423 mxge_rem_sysctls(mxge_softc_t *sc)
1424 {
1425 	struct mxge_slice_state *ss;
1426 	int slice;
1427 
1428 	if (sc->slice_sysctl_tree == NULL)
1429 		return;
1430 
1431 	for (slice = 0; slice < sc->num_slices; slice++) {
1432 		ss = &sc->ss[slice];
1433 		if (ss == NULL || ss->sysctl_tree == NULL)
1434 			continue;
1435 		sysctl_ctx_free(&ss->sysctl_ctx);
1436 		ss->sysctl_tree = NULL;
1437 	}
1438 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1439 	sc->slice_sysctl_tree = NULL;
1440 }
1441 
1442 static void
1443 mxge_add_sysctls(mxge_softc_t *sc)
1444 {
1445 	struct sysctl_ctx_list *ctx;
1446 	struct sysctl_oid_list *children;
1447 	mcp_irq_data_t *fw;
1448 	struct mxge_slice_state *ss;
1449 	int slice;
1450 	char slice_num[8];
1451 
1452 	ctx = device_get_sysctl_ctx(sc->dev);
1453 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1454 	fw = sc->ss[0].fw_stats;
1455 
1456 	/* random information */
1457 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1458 		       "firmware_version",
1459 		       CTLFLAG_RD, &sc->fw_version,
1460 		       0, "firmware version");
1461 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1462 		       "serial_number",
1463 		       CTLFLAG_RD, &sc->serial_number_string,
1464 		       0, "serial number");
1465 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1466 		       "product_code",
1467 		       CTLFLAG_RD, &sc->product_code_string,
1468 		       0, "product_code");
1469 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1470 		       "pcie_link_width",
1471 		       CTLFLAG_RD, &sc->link_width,
1472 		       0, "tx_boundary");
1473 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1474 		       "tx_boundary",
1475 		       CTLFLAG_RD, &sc->tx_boundary,
1476 		       0, "tx_boundary");
1477 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1478 		       "write_combine",
1479 		       CTLFLAG_RD, &sc->wc,
1480 		       0, "write combining PIO?");
1481 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1482 		       "read_dma_MBs",
1483 		       CTLFLAG_RD, &sc->read_dma,
1484 		       0, "DMA Read speed in MB/s");
1485 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486 		       "write_dma_MBs",
1487 		       CTLFLAG_RD, &sc->write_dma,
1488 		       0, "DMA Write speed in MB/s");
1489 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490 		       "read_write_dma_MBs",
1491 		       CTLFLAG_RD, &sc->read_write_dma,
1492 		       0, "DMA concurrent Read/Write speed in MB/s");
1493 
1494 
1495 	/* performance related tunables */
1496 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1497 			"intr_coal_delay",
1498 			CTLTYPE_INT|CTLFLAG_RW, sc,
1499 			0, mxge_change_intr_coal,
1500 			"I", "interrupt coalescing delay in usecs");
1501 
1502 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1503 			"flow_control_enabled",
1504 			CTLTYPE_INT|CTLFLAG_RW, sc,
1505 			0, mxge_change_flow_control,
1506 			"I", "interrupt coalescing delay in usecs");
1507 
1508 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1509 		       "deassert_wait",
1510 		       CTLFLAG_RW, &mxge_deassert_wait,
1511 		       0, "Wait for IRQ line to go low in ihandler");
1512 
1513 	/* stats block from firmware is in network byte order.
1514 	   Need to swap it */
1515 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1516 			"link_up",
1517 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1518 			0, mxge_handle_be32,
1519 			"I", "link up");
1520 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1521 			"rdma_tags_available",
1522 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1523 			0, mxge_handle_be32,
1524 			"I", "rdma_tags_available");
1525 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526 			"dropped_bad_crc32",
1527 			CTLTYPE_INT|CTLFLAG_RD,
1528 			&fw->dropped_bad_crc32,
1529 			0, mxge_handle_be32,
1530 			"I", "dropped_bad_crc32");
1531 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1532 			"dropped_bad_phy",
1533 			CTLTYPE_INT|CTLFLAG_RD,
1534 			&fw->dropped_bad_phy,
1535 			0, mxge_handle_be32,
1536 			"I", "dropped_bad_phy");
1537 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538 			"dropped_link_error_or_filtered",
1539 			CTLTYPE_INT|CTLFLAG_RD,
1540 			&fw->dropped_link_error_or_filtered,
1541 			0, mxge_handle_be32,
1542 			"I", "dropped_link_error_or_filtered");
1543 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544 			"dropped_link_overflow",
1545 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1546 			0, mxge_handle_be32,
1547 			"I", "dropped_link_overflow");
1548 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 			"dropped_multicast_filtered",
1550 			CTLTYPE_INT|CTLFLAG_RD,
1551 			&fw->dropped_multicast_filtered,
1552 			0, mxge_handle_be32,
1553 			"I", "dropped_multicast_filtered");
1554 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555 			"dropped_no_big_buffer",
1556 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1557 			0, mxge_handle_be32,
1558 			"I", "dropped_no_big_buffer");
1559 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 			"dropped_no_small_buffer",
1561 			CTLTYPE_INT|CTLFLAG_RD,
1562 			&fw->dropped_no_small_buffer,
1563 			0, mxge_handle_be32,
1564 			"I", "dropped_no_small_buffer");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 			"dropped_overrun",
1567 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1568 			0, mxge_handle_be32,
1569 			"I", "dropped_overrun");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 			"dropped_pause",
1572 			CTLTYPE_INT|CTLFLAG_RD,
1573 			&fw->dropped_pause,
1574 			0, mxge_handle_be32,
1575 			"I", "dropped_pause");
1576 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 			"dropped_runt",
1578 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1579 			0, mxge_handle_be32,
1580 			"I", "dropped_runt");
1581 
1582 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 			"dropped_unicast_filtered",
1584 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1585 			0, mxge_handle_be32,
1586 			"I", "dropped_unicast_filtered");
1587 
1588 	/* verbose printing? */
1589 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1590 		       "verbose",
1591 		       CTLFLAG_RW, &mxge_verbose,
1592 		       0, "verbose printing");
1593 
1594 	/* lro */
1595 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1596 			"lro_cnt",
1597 			CTLTYPE_INT|CTLFLAG_RW, sc,
1598 			0, mxge_change_lro,
1599 			"I", "number of lro merge queues");
1600 
1601 
1602 	/* add counters exported for debugging from all slices */
1603 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1604 	sc->slice_sysctl_tree =
1605 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1606 				"slice", CTLFLAG_RD, 0, "");
1607 
1608 	for (slice = 0; slice < sc->num_slices; slice++) {
1609 		ss = &sc->ss[slice];
1610 		sysctl_ctx_init(&ss->sysctl_ctx);
1611 		ctx = &ss->sysctl_ctx;
1612 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1613 		sprintf(slice_num, "%d", slice);
1614 		ss->sysctl_tree =
1615 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1616 					CTLFLAG_RD, 0, "");
1617 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1618 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1619 			       "rx_small_cnt",
1620 			       CTLFLAG_RD, &ss->rx_small.cnt,
1621 			       0, "rx_small_cnt");
1622 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1623 			       "rx_big_cnt",
1624 			       CTLFLAG_RD, &ss->rx_big.cnt,
1625 			       0, "rx_small_cnt");
1626 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1627 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1628 			       0, "number of lro merge queues flushed");
1629 
1630 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1631 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1632 			       0, "number of frames appended to lro merge"
1633 			       "queues");
1634 
1635 #ifndef IFNET_BUF_RING
1636 		/* only transmit from slice 0 for now */
1637 		if (slice > 0)
1638 			continue;
1639 #endif
1640 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1641 			       "tx_req",
1642 			       CTLFLAG_RD, &ss->tx.req,
1643 			       0, "tx_req");
1644 
1645 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1646 			       "tx_done",
1647 			       CTLFLAG_RD, &ss->tx.done,
1648 			       0, "tx_done");
1649 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1650 			       "tx_pkt_done",
1651 			       CTLFLAG_RD, &ss->tx.pkt_done,
1652 			       0, "tx_done");
1653 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1654 			       "tx_stall",
1655 			       CTLFLAG_RD, &ss->tx.stall,
1656 			       0, "tx_stall");
1657 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1658 			       "tx_wake",
1659 			       CTLFLAG_RD, &ss->tx.wake,
1660 			       0, "tx_wake");
1661 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1662 			       "tx_defrag",
1663 			       CTLFLAG_RD, &ss->tx.defrag,
1664 			       0, "tx_defrag");
1665 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1666 			       "tx_queue_active",
1667 			       CTLFLAG_RD, &ss->tx.queue_active,
1668 			       0, "tx_queue_active");
1669 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670 			       "tx_activate",
1671 			       CTLFLAG_RD, &ss->tx.activate,
1672 			       0, "tx_activate");
1673 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1674 			       "tx_deactivate",
1675 			       CTLFLAG_RD, &ss->tx.deactivate,
1676 			       0, "tx_deactivate");
1677 	}
1678 }
1679 
1680 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1681    backwards one at a time and handle ring wraps */
1682 
1683 static inline void
1684 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1685 			    mcp_kreq_ether_send_t *src, int cnt)
1686 {
1687         int idx, starting_slot;
1688         starting_slot = tx->req;
1689         while (cnt > 1) {
1690                 cnt--;
1691                 idx = (starting_slot + cnt) & tx->mask;
1692                 mxge_pio_copy(&tx->lanai[idx],
1693 			      &src[cnt], sizeof(*src));
1694                 wmb();
1695         }
1696 }
1697 
1698 /*
1699  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1700  * at most 32 bytes at a time, so as to avoid involving the software
1701  * pio handler in the nic.   We re-write the first segment's flags
1702  * to mark them valid only after writing the entire chain
1703  */
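
/*
 * Each mcp_kreq_ether_send_t is 16 bytes, so the loop below moves two
 * requests per 32-byte PIO write; assuming write-combining PIO is
 * enabled (sc->wc), each such write can go out as a single PCIe
 * transaction.
 */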
1704 
1705 static inline void
1706 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1707                   int cnt)
1708 {
1709         int idx, i;
1710         uint32_t *src_ints;
1711 	volatile uint32_t *dst_ints;
1712         mcp_kreq_ether_send_t *srcp;
1713 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1714 	uint8_t last_flags;
1715 
1716         idx = tx->req & tx->mask;
1717 
1718 	last_flags = src->flags;
1719 	src->flags = 0;
1720         wmb();
1721         dst = dstp = &tx->lanai[idx];
1722         srcp = src;
1723 
1724         if ((idx + cnt) < tx->mask) {
1725                 for (i = 0; i < (cnt - 1); i += 2) {
1726                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1727                         wmb(); /* force write every 32 bytes */
1728                         srcp += 2;
1729                         dstp += 2;
1730                 }
1731         } else {
1732                 /* submit all but the first request, and ensure
1733                    that it is submitted below */
1734                 mxge_submit_req_backwards(tx, src, cnt);
1735                 i = 0;
1736         }
1737         if (i < cnt) {
1738                 /* submit the first request */
1739                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1740                 wmb(); /* barrier before setting valid flag */
1741         }
1742 
1743         /* re-write the last 32-bits with the valid flags */
1744         src->flags = last_flags;
1745         src_ints = (uint32_t *)src;
1746         src_ints+=3;
1747         dst_ints = (volatile uint32_t *)dst;
1748         dst_ints+=3;
1749         *dst_ints =  *src_ints;
1750         tx->req += cnt;
1751         wmb();
1752 }
1753 
1754 #if IFCAP_TSO4
1755 
1756 static void
1757 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1758 	       int busdma_seg_cnt, int ip_off)
1759 {
1760 	mxge_tx_ring_t *tx;
1761 	mcp_kreq_ether_send_t *req;
1762 	bus_dma_segment_t *seg;
1763 	struct ip *ip;
1764 	struct tcphdr *tcp;
1765 	uint32_t low, high_swapped;
1766 	int len, seglen, cum_len, cum_len_next;
1767 	int next_is_first, chop, cnt, rdma_count, small;
1768 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1769 	uint8_t flags, flags_next;
1770 	static int once;
1771 
1772 	mss = m->m_pkthdr.tso_segsz;
1773 
1774 	/* negative cum_len signifies to the
1775 	 * send loop that we are still in the
1776 	 * header portion of the TSO packet.
1777 	 */
1778 
1779 	/* ensure we have the ethernet, IP and TCP
1780 	   header together in the first mbuf, copy
1781 	   it to a scratch buffer if not */
1782 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1783 		m_copydata(m, 0, ip_off + sizeof (*ip),
1784 			   ss->scratch);
1785 		ip = (struct ip *)(ss->scratch + ip_off);
1786 	} else {
1787 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1788 	}
1789 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1790 			    + sizeof (*tcp))) {
1791 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1792 			   + sizeof (*tcp),  ss->scratch);
		/* the headers were copied to scratch, so point there */
1793 		ip = (struct ip *)(ss->scratch + ip_off);
1794 	}
1795 
1796 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1797 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1798 
1799 	/* TSO implies checksum offload on this hardware */
1800 	cksum_offset = ip_off + (ip->ip_hl << 2);
1801 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1802 
1803 
1804 	/* for TSO, pseudo_hdr_offset holds mss.
1805 	 * The firmware figures out where to put
1806 	 * the checksum by parsing the header. */
1807 	pseudo_hdr_offset = htobe16(mss);
1808 
1809 	tx = &ss->tx;
1810 	req = tx->req_list;
1811 	seg = tx->seg_list;
1812 	cnt = 0;
1813 	rdma_count = 0;
1814 	/* "rdma_count" is the number of RDMAs belonging to the
1815 	 * current packet BEFORE the current send request. For
1816 	 * non-TSO packets, this is equal to "count".
1817 	 * For TSO packets, rdma_count needs to be reset
1818 	 * to 0 after a segment cut.
1819 	 *
1820 	 * The rdma_count field of the send request is
1821 	 * the number of RDMAs of the packet starting at
1822 	 * that request. For TSO send requests with one or more cuts
1823 	 * in the middle, this is the number of RDMAs starting
1824 	 * after the last cut in the request. All previous
1825 	 * segments before the last cut implicitly have 1 RDMA.
1826 	 *
1827 	 * Since the number of RDMAs is not known beforehand,
1828 	 * it must be filled-in retroactively - after each
1829 	 * segmentation cut or at the end of the entire packet.
1830 	 */
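
	/* A sketch of the bit tricks used below: "chop" and
	 * "next_is_first" are each 0 or 1, so -(chop | next_is_first)
	 * is either 0 (leave rdma_count alone) or ~0 (a cut: force
	 * rdma_count to -1); the subsequent "+= chop & !next_is_first"
	 * and the rdma_count++ at the bottom of the loop then restart
	 * the count for the new segment.
	 */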
1831 
1832 	while (busdma_seg_cnt) {
1833 		/* Break the busdma segment up into pieces*/
1834 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1835 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1836 		len = seg->ds_len;
1837 
1838 		while (len) {
1839 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1840 			seglen = len;
1841 			cum_len_next = cum_len + seglen;
1842 			(req-rdma_count)->rdma_count = rdma_count + 1;
1843 			if (__predict_true(cum_len >= 0)) {
1844 				/* payload */
1845 				chop = (cum_len_next > mss);
1846 				cum_len_next = cum_len_next % mss;
1847 				next_is_first = (cum_len_next == 0);
1848 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1849 				flags_next |= next_is_first *
1850 					MXGEFW_FLAGS_FIRST;
1851 				rdma_count |= -(chop | next_is_first);
1852 				rdma_count += chop & !next_is_first;
1853 			} else if (cum_len_next >= 0) {
1854 				/* header ends */
1855 				rdma_count = -1;
1856 				cum_len_next = 0;
1857 				seglen = -cum_len;
1858 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1859 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1860 					MXGEFW_FLAGS_FIRST |
1861 					(small * MXGEFW_FLAGS_SMALL);
1862 			}
1863 
1864 			req->addr_high = high_swapped;
1865 			req->addr_low = htobe32(low);
1866 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1867 			req->pad = 0;
1868 			req->rdma_count = 1;
1869 			req->length = htobe16(seglen);
1870 			req->cksum_offset = cksum_offset;
1871 			req->flags = flags | ((cum_len & 1) *
1872 					      MXGEFW_FLAGS_ALIGN_ODD);
1873 			low += seglen;
1874 			len -= seglen;
1875 			cum_len = cum_len_next;
1876 			flags = flags_next;
1877 			req++;
1878 			cnt++;
1879 			rdma_count++;
1880 			if (__predict_false(cksum_offset > seglen))
1881 				cksum_offset -= seglen;
1882 			else
1883 				cksum_offset = 0;
1884 			if (__predict_false(cnt > tx->max_desc))
1885 				goto drop;
1886 		}
1887 		busdma_seg_cnt--;
1888 		seg++;
1889 	}
1890 	(req-rdma_count)->rdma_count = rdma_count;
1891 
1892 	do {
1893 		req--;
1894 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1895 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1896 
1897 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1898 	mxge_submit_req(tx, tx->req_list, cnt);
1899 #ifdef IFNET_BUF_RING
1900 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1901 		/* tell the NIC to start polling this slice */
1902 		*tx->send_go = 1;
1903 		tx->queue_active = 1;
1904 		tx->activate++;
1905 		wmb();
1906 	}
1907 #endif
1908 	return;
1909 
1910 drop:
1911 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1912 	m_freem(m);
1913 	ss->oerrors++;
1914 	if (!once) {
1915 		printf("tx->max_desc exceeded via TSO!\n");
1916 		printf("mss = %d, %ld, %d!\n", mss,
1917 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1918 		once = 1;
1919 	}
1920 	return;
1921 
1922 }
1923 
1924 #endif /* IFCAP_TSO4 */
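
/*
 * Worked example (illustrative) of the branchless flag math in
 * mxge_encap_tso() above, which relies on C comparisons yielding
 * exactly 0 or 1.  With mss = 1448, cum_len = 1000, seglen = 600:
 *
 *	cum_len_next = 1600		-> chop = (1600 > 1448) = 1
 *	cum_len_next % mss = 152	-> next_is_first = (152 == 0) = 0
 *	flags |= 1 * MXGEFW_FLAGS_TSO_CHOP;	(flag set)
 *	flags_next |= 0 * MXGEFW_FLAGS_FIRST;	(no-op)
 *	rdma_count |= -(1 | 0);			(rdma_count = -1)
 *	rdma_count += 1 & !0;			(rdma_count = 0)
 *
 * After the loop-bottom rdma_count++ the chopped descriptor, whose
 * tail bytes belong to the next TSO segment, becomes the first member
 * of the new RDMA run; no conditional branch is needed in the inner
 * loop.
 */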
1925 
1926 #ifdef MXGE_NEW_VLAN_API
1927 /*
1928  * We reproduce the software vlan tag insertion from
1929  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1930  * vlan tag insertion. We need to advertise this in order to have the
1931  * vlan interface respect our csum offload flags.
1932  */
1933 static struct mbuf *
1934 mxge_vlan_tag_insert(struct mbuf *m)
1935 {
1936 	struct ether_vlan_header *evl;
1937 
1938 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1939 	if (__predict_false(m == NULL))
1940 		return NULL;
1941 	if (m->m_len < sizeof(*evl)) {
1942 		m = m_pullup(m, sizeof(*evl));
1943 		if (__predict_false(m == NULL))
1944 			return NULL;
1945 	}
1946 	/*
1947 	 * Transform the Ethernet header into an Ethernet header
1948 	 * with 802.1Q encapsulation.
1949 	 */
1950 	evl = mtod(m, struct ether_vlan_header *);
1951 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1952 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1953 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1954 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1955 	m->m_flags &= ~M_VLANTAG;
1956 	return m;
1957 }
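
/*
 * Illustrative layout (not driver code) of the copy above, assuming
 * an untagged frame on input:
 *
 *	before:	[dst 6][src 6][type 2][payload ...]
 *	after:	[dst 6][src 6][0x8100 2][tag 2][type 2][payload ...]
 *
 * M_PREPEND opens ETHER_VLAN_ENCAP_LEN (4) bytes at the front; the
 * bcopy slides the two MAC addresses (ETHER_HDR_LEN - ETHER_TYPE_LEN
 * = 12 bytes) down into the new space, leaving the original
 * ethertype in place to become the encapsulated type field.
 */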
1958 #endif /* MXGE_NEW_VLAN_API */
1959 
1960 static void
1961 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1962 {
1963 	mxge_softc_t *sc;
1964 	mcp_kreq_ether_send_t *req;
1965 	bus_dma_segment_t *seg;
1966 	struct mbuf *m_tmp;
1967 	struct ifnet *ifp;
1968 	mxge_tx_ring_t *tx;
1969 	struct ip *ip;
1970 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1971 	uint16_t pseudo_hdr_offset;
1972 	uint8_t flags, cksum_offset;
1973 
1974 
1975 	sc = ss->sc;
1976 	ifp = sc->ifp;
1977 	tx = &ss->tx;
1978 
1979 	ip_off = sizeof (struct ether_header);
1980 #ifdef MXGE_NEW_VLAN_API
1981 	if (m->m_flags & M_VLANTAG) {
1982 		m = mxge_vlan_tag_insert(m);
1983 		if (__predict_false(m == NULL))
1984 			goto drop;
1985 		ip_off += ETHER_VLAN_ENCAP_LEN;
1986 	}
1987 #endif
1988 	/* (try to) map the frame for DMA */
1989 	idx = tx->req & tx->mask;
1990 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1991 				      m, tx->seg_list, &cnt,
1992 				      BUS_DMA_NOWAIT);
1993 	if (__predict_false(err == EFBIG)) {
1994 		/* Too many segments in the chain.  Try
1995 		   to defrag */
1996 		m_tmp = m_defrag(m, M_NOWAIT);
1997 		if (m_tmp == NULL) {
1998 			goto drop;
1999 		}
2000 		ss->tx.defrag++;
2001 		m = m_tmp;
2002 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2003 					      tx->info[idx].map,
2004 					      m, tx->seg_list, &cnt,
2005 					      BUS_DMA_NOWAIT);
2006 	}
2007 	if (__predict_false(err != 0)) {
2008 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2009 			      " packet len = %d\n", err, m->m_pkthdr.len);
2010 		goto drop;
2011 	}
2012 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2013 			BUS_DMASYNC_PREWRITE);
2014 	tx->info[idx].m = m;
2015 
2016 #if IFCAP_TSO4
2017 	/* TSO is different enough, we handle it in another routine */
2018 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2019 		mxge_encap_tso(ss, m, cnt, ip_off);
2020 		return;
2021 	}
2022 #endif
2023 
2024 	req = tx->req_list;
2025 	cksum_offset = 0;
2026 	pseudo_hdr_offset = 0;
2027 	flags = MXGEFW_FLAGS_NO_TSO;
2028 
2029 	/* checksum offloading? */
2030 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2031 		/* ensure ip header is in first mbuf, copy
2032 		   it to a scratch buffer if not */
2033 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2034 			m_copydata(m, 0, ip_off + sizeof (*ip),
2035 				   ss->scratch);
2036 			ip = (struct ip *)(ss->scratch + ip_off);
2037 		} else {
2038 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2039 		}
2040 		cksum_offset = ip_off + (ip->ip_hl << 2);
2041 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2042 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2043 		req->cksum_offset = cksum_offset;
2044 		flags |= MXGEFW_FLAGS_CKSUM;
2045 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2046 	} else {
2047 		odd_flag = 0;
2048 	}
2049 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2050 		flags |= MXGEFW_FLAGS_SMALL;
2051 
2052 	/* convert segments into a request list */
2053 	cum_len = 0;
2054 	seg = tx->seg_list;
2055 	req->flags = MXGEFW_FLAGS_FIRST;
2056 	for (i = 0; i < cnt; i++) {
2057 		req->addr_low =
2058 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2059 		req->addr_high =
2060 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2061 		req->length = htobe16(seg->ds_len);
2062 		req->cksum_offset = cksum_offset;
2063 		if (cksum_offset > seg->ds_len)
2064 			cksum_offset -= seg->ds_len;
2065 		else
2066 			cksum_offset = 0;
2067 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2068 		req->pad = 0; /* complete solid 16-byte block */
2069 		req->rdma_count = 1;
2070 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2071 		cum_len += seg->ds_len;
2072 		seg++;
2073 		req++;
2074 		req->flags = 0;
2075 	}
2076 	req--;
2077 	/* pad runts to 60 bytes */
2078 	if (cum_len < 60) {
2079 		req++;
2080 		req->addr_low =
2081 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2082 		req->addr_high =
2083 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2084 		req->length = htobe16(60 - cum_len);
2085 		req->cksum_offset = 0;
2086 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2087 		req->pad = 0; /* complete solid 16-byte block */
2088 		req->rdma_count = 1;
2089 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2090 		cnt++;
2091 	}
2092 
2093 	tx->req_list[0].rdma_count = cnt;
2094 #if 0
2095 	/* print what the firmware will see */
2096 	for (i = 0; i < cnt; i++) {
2097 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2098 		    "cso:%d, flags:0x%x, rdma:%d\n",
2099 		    i, (int)ntohl(tx->req_list[i].addr_high),
2100 		    (int)ntohl(tx->req_list[i].addr_low),
2101 		    (int)ntohs(tx->req_list[i].length),
2102 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2103 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2104 		    tx->req_list[i].rdma_count);
2105 	}
2106 	printf("--------------\n");
2107 #endif
2108 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2109 	mxge_submit_req(tx, tx->req_list, cnt);
2110 #ifdef IFNET_BUF_RING
2111 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2112 		/* tell the NIC to start polling this slice */
2113 		*tx->send_go = 1;
2114 		tx->queue_active = 1;
2115 		tx->activate++;
2116 		wmb();
2117 	}
2118 #endif
2119 	return;
2120 
2121 drop:
2122 	m_freem(m);
2123 	ss->oerrors++;
2124 	return;
2125 }
2126 
2127 #ifdef IFNET_BUF_RING
2128 static void
2129 mxge_qflush(struct ifnet *ifp)
2130 {
2131 	mxge_softc_t *sc = ifp->if_softc;
2132 	mxge_tx_ring_t *tx;
2133 	struct mbuf *m;
2134 	int slice;
2135 
2136 	for (slice = 0; slice < sc->num_slices; slice++) {
2137 		tx = &sc->ss[slice].tx;
2138 		mtx_lock(&tx->mtx);
2139 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2140 			m_freem(m);
2141 		mtx_unlock(&tx->mtx);
2142 	}
2143 	if_qflush(ifp);
2144 }
2145 
2146 static inline void
2147 mxge_start_locked(struct mxge_slice_state *ss)
2148 {
2149 	mxge_softc_t *sc;
2150 	struct mbuf *m;
2151 	struct ifnet *ifp;
2152 	mxge_tx_ring_t *tx;
2153 
2154 	sc = ss->sc;
2155 	ifp = sc->ifp;
2156 	tx = &ss->tx;
2157 
2158 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2159 		m = drbr_dequeue(ifp, tx->br);
2160 		if (m == NULL) {
2161 			return;
2162 		}
2163 		/* let BPF see it */
2164 		BPF_MTAP(ifp, m);
2165 
2166 		/* give it to the nic */
2167 		mxge_encap(ss, m);
2168 	}
2169 	/* ran out of transmit slots */
2170 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2171 	    && (!drbr_empty(ifp, tx->br))) {
2172 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2173 		tx->stall++;
2174 	}
2175 }
2176 
2177 static int
2178 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2179 {
2180 	mxge_softc_t *sc;
2181 	struct ifnet *ifp;
2182 	mxge_tx_ring_t *tx;
2183 	int err;
2184 
2185 	sc = ss->sc;
2186 	ifp = sc->ifp;
2187 	tx = &ss->tx;
2188 
2189 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2190 	    IFF_DRV_RUNNING) {
2191 		err = drbr_enqueue(ifp, tx->br, m);
2192 		return (err);
2193 	}
2194 
2195 	if (drbr_empty(ifp, tx->br) &&
2196 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2197 		/* let BPF see it */
2198 		BPF_MTAP(ifp, m);
2199 		/* give it to the nic */
2200 		mxge_encap(ss, m);
2201 		drbr_stats_update(ifp, m->m_pkthdr.len, m->m_flags);
2202 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2203 		return (err);
2204 	}
2205 	if (!drbr_empty(ifp, tx->br))
2206 		mxge_start_locked(ss);
2207 	return (0);
2208 }
2209 
2210 static int
2211 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2212 {
2213 	mxge_softc_t *sc = ifp->if_softc;
2214 	struct mxge_slice_state *ss;
2215 	mxge_tx_ring_t *tx;
2216 	int err = 0;
2217 	int slice;
2218 
2219 	slice = m->m_pkthdr.flowid;
2220 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2221 
2222 	ss = &sc->ss[slice];
2223 	tx = &ss->tx;
2224 
2225 	if (mtx_trylock(&tx->mtx)) {
2226 		err = mxge_transmit_locked(ss, m);
2227 		mtx_unlock(&tx->mtx);
2228 	} else {
2229 		err = drbr_enqueue(ifp, tx->br, m);
2230 	}
2231 
2232 	return (err);
2233 }
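
/*
 * Illustrative: with 4 slices the mask is 3, so flowids 0, 4, 8, ...
 * all map to slice 0 and flowid 5 maps to slice 1; the AND replaces
 * a modulo because num_slices is constrained to a power of 2.
 */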
2234 
2235 #else
2236 
2237 static inline void
2238 mxge_start_locked(struct mxge_slice_state *ss)
2239 {
2240 	mxge_softc_t *sc;
2241 	struct mbuf *m;
2242 	struct ifnet *ifp;
2243 	mxge_tx_ring_t *tx;
2244 
2245 	sc = ss->sc;
2246 	ifp = sc->ifp;
2247 	tx = &ss->tx;
2248 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2249 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2250 		if (m == NULL) {
2251 			return;
2252 		}
2253 		/* let BPF see it */
2254 		BPF_MTAP(ifp, m);
2255 
2256 		/* give it to the nic */
2257 		mxge_encap(ss, m);
2258 	}
2259 	/* ran out of transmit slots */
2260 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2261 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2262 		tx->stall++;
2263 	}
2264 }
2265 #endif
2266 static void
2267 mxge_start(struct ifnet *ifp)
2268 {
2269 	mxge_softc_t *sc = ifp->if_softc;
2270 	struct mxge_slice_state *ss;
2271 
2272 	/* only use the first slice for now */
2273 	ss = &sc->ss[0];
2274 	mtx_lock(&ss->tx.mtx);
2275 	mxge_start_locked(ss);
2276 	mtx_unlock(&ss->tx.mtx);
2277 }
2278 
2279 /*
2280  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2281  * at most 32 bytes at a time, so as to avoid involving the software
2282  * pio handler in the nic.   We re-write the first segment's low
2283  * DMA address to mark it valid only after we write the entire chunk
2284  * in a burst
2285  */
2286 static inline void
2287 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2288 		mcp_kreq_ether_recv_t *src)
2289 {
2290 	uint32_t low;
2291 
2292 	low = src->addr_low;
2293 	src->addr_low = 0xffffffff;
2294 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2295 	wmb();
2296 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2297 	wmb();
2298 	src->addr_low = low;
2299 	dst->addr_low = low;
2300 	wmb();
2301 }
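
/*
 * Usage sketch (illustrative): callers accumulate 8 shadow
 * descriptors and call mxge_submit_8rx() only when the ring index
 * crosses an 8-boundary, as mxge_get_buf_small() below does:
 */
#if 0
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
#endif
/*
 * 0xffffffff doubles as the "invalid" sentinel (the big rx shadow
 * ring is initialized to it in mxge_slice_open()), so the NIC cannot
 * consume the first descriptor until its real low address is written
 * back after both 32-byte bursts have landed.
 */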
2302 
2303 static int
2304 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2305 {
2306 	bus_dma_segment_t seg;
2307 	struct mbuf *m;
2308 	mxge_rx_ring_t *rx = &ss->rx_small;
2309 	int cnt, err;
2310 
2311 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2312 	if (m == NULL) {
2313 		rx->alloc_fail++;
2314 		err = ENOBUFS;
2315 		goto done;
2316 	}
2317 	m->m_len = MHLEN;
2318 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2319 				      &seg, &cnt, BUS_DMA_NOWAIT);
2320 	if (err != 0) {
2321 		m_free(m);
2322 		goto done;
2323 	}
2324 	rx->info[idx].m = m;
2325 	rx->shadow[idx].addr_low =
2326 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2327 	rx->shadow[idx].addr_high =
2328 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2329 
2330 done:
2331 	if ((idx & 7) == 7)
2332 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2333 	return err;
2334 }
2335 
2336 static int
2337 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2338 {
2339 	bus_dma_segment_t seg[3];
2340 	struct mbuf *m;
2341 	mxge_rx_ring_t *rx = &ss->rx_big;
2342 	int cnt, err, i;
2343 
2344 	if (rx->cl_size == MCLBYTES)
2345 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2346 	else
2347 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2348 	if (m == NULL) {
2349 		rx->alloc_fail++;
2350 		err = ENOBUFS;
2351 		goto done;
2352 	}
2353 	m->m_len = rx->mlen;
2354 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2355 				      seg, &cnt, BUS_DMA_NOWAIT);
2356 	if (err != 0) {
2357 		m_free(m);
2358 		goto done;
2359 	}
2360 	rx->info[idx].m = m;
2361 	rx->shadow[idx].addr_low =
2362 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2363 	rx->shadow[idx].addr_high =
2364 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2365 
2366 #if MXGE_VIRT_JUMBOS
2367 	for (i = 1; i < cnt; i++) {
2368 		rx->shadow[idx + i].addr_low =
2369 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2370 		rx->shadow[idx + i].addr_high =
2371 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2372 	}
2373 #endif
2374 
2375 done:
2376 	for (i = 0; i < rx->nbufs; i++) {
2377 		if ((idx & 7) == 7) {
2378 			mxge_submit_8rx(&rx->lanai[idx - 7],
2379 					&rx->shadow[idx - 7]);
2380 		}
2381 		idx++;
2382 	}
2383 	return err;
2384 }
2385 
2386 /*
2387  *  Myri10GE hardware checksums are not valid if the sender
2388  *  padded the frame with non-zero padding.  This is because
2389  *  the firmware just does a simple 16-bit 1s complement
2390  *  checksum across the entire frame, excluding the first 14
2391 	 *  bytes.  It is best to simply check the checksum and
2392  *  tell the stack about it only if the checksum is good
2393  */
2394 
2395 static inline uint16_t
2396 mxge_rx_csum(struct mbuf *m, int csum)
2397 {
2398 	struct ether_header *eh;
2399 	struct ip *ip;
2400 	uint16_t c;
2401 
2402 	eh = mtod(m, struct ether_header *);
2403 
2404 	/* only deal with IPv4 TCP & UDP for now */
2405 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2406 		return 1;
2407 	ip = (struct ip *)(eh + 1);
2408 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2409 			    ip->ip_p != IPPROTO_UDP))
2410 		return 1;
2411 
2412 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2413 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2414 			    (ip->ip_hl << 2) + ip->ip_p));
2415 	c ^= 0xffff;
2416 	return (c);
2417 }
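
/*
 * Worked reasoning (illustrative): the firmware's csum is a 1's
 * complement sum of everything past the 14-byte Ethernet header,
 * i.e. the IP header plus the transport segment.  When the IP header
 * checksum is itself valid, the header's contribution folds to
 * 0xffff, which is congruent to 0 in 1's complement arithmetic,
 * leaving just the transport bytes.  in_pseudo() then adds the
 * pseudo-header terms (src, dst, and proto plus the transport
 * length, recovered as ip_len minus the IP header length).  A
 * correct TCP/UDP checksum makes the total fold to 0xffff, so the
 * final "c ^= 0xffff" yields 0 exactly when the checksum is good.
 */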
2418 
2419 static void
2420 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2421 {
2422 	struct ether_vlan_header *evl;
2423 	struct ether_header *eh;
2424 	uint32_t partial;
2425 
2426 	evl = mtod(m, struct ether_vlan_header *);
2427 	eh = mtod(m, struct ether_header *);
2428 
2429 	/*
2430 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2431 	 * after what the firmware thought was the end of the ethernet
2432 	 * header.
2433 	 */
2434 
2435 	/* put checksum into host byte order */
2436 	*csum = ntohs(*csum);
2437 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2438 	(*csum) += ~partial;
2439 	(*csum) +=  ((*csum) < ~partial);
2440 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2441 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2442 
2443 	/* restore checksum to network byte order;
2444 	   later consumers expect this */
2445 	*csum = htons(*csum);
2446 
2447 	/* save the tag */
2448 #ifdef MXGE_NEW_VLAN_API
2449 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2450 #else
2451 	{
2452 		struct m_tag *mtag;
2453 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2454 				   M_NOWAIT);
2455 		if (mtag == NULL)
2456 			return;
2457 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2458 		m_tag_prepend(m, mtag);
2459 	}
2460 
2461 #endif
2462 	m->m_flags |= M_VLANTAG;
2463 
2464 	/*
2465 	 * Remove the 802.1q header by copying the Ethernet
2466 	 * addresses over it and adjusting the beginning of
2467 	 * the data in the mbuf.  The encapsulated Ethernet
2468 	 * type field is already in place.
2469 	 */
2470 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2471 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2472 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2473 }
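
/*
 * Worked arithmetic (illustrative): "partial" holds the 4 removed
 * bytes as one 32-bit value; subtracting it from a 1's complement
 * sum is done by adding its complement and then the end-around
 * carry, which is what
 *
 *	(*csum) += ~partial;
 *	(*csum) += ((*csum) < ~partial);
 *
 * implement: the comparison is 1 exactly when the 32-bit add
 * wrapped.  The two fold lines then compress the 32-bit accumulator
 * back into the final 16-bit checksum.
 */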
2474 
2475 
2476 static inline void
2477 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2478 {
2479 	mxge_softc_t *sc;
2480 	struct ifnet *ifp;
2481 	struct mbuf *m;
2482 	struct ether_header *eh;
2483 	mxge_rx_ring_t *rx;
2484 	bus_dmamap_t old_map;
2485 	int idx;
2486 	uint16_t tcpudp_csum;
2487 
2488 	sc = ss->sc;
2489 	ifp = sc->ifp;
2490 	rx = &ss->rx_big;
2491 	idx = rx->cnt & rx->mask;
2492 	rx->cnt += rx->nbufs;
2493 	/* save a pointer to the received mbuf */
2494 	m = rx->info[idx].m;
2495 	/* try to replace the received mbuf */
2496 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2497 		/* drop the frame -- the old mbuf is re-cycled */
2498 		ifp->if_ierrors++;
2499 		return;
2500 	}
2501 
2502 	/* unmap the received buffer */
2503 	old_map = rx->info[idx].map;
2504 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2505 	bus_dmamap_unload(rx->dmat, old_map);
2506 
2507 	/* swap the bus_dmamap_t's */
2508 	rx->info[idx].map = rx->extra_map;
2509 	rx->extra_map = old_map;
2510 
2511 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2512 	 * aligned */
2513 	m->m_data += MXGEFW_PAD;
2514 
2515 	m->m_pkthdr.rcvif = ifp;
2516 	m->m_len = m->m_pkthdr.len = len;
2517 	ss->ipackets++;
2518 	eh = mtod(m, struct ether_header *);
2519 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2520 		mxge_vlan_tag_remove(m, &csum);
2521 	}
2522 	/* if the checksum is valid, mark it in the mbuf header */
2523 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2524 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2525 			return;
2526 		/* otherwise, it was a UDP frame, or a TCP frame which
2527 		   we could not do LRO on.  Tell the stack that the
2528 		   checksum is good */
2529 		m->m_pkthdr.csum_data = 0xffff;
2530 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2531 	}
2532 	/* flowid only valid if RSS hashing is enabled */
2533 	if (sc->num_slices > 1) {
2534 		m->m_pkthdr.flowid = (ss - sc->ss);
2535 		m->m_flags |= M_FLOWID;
2536 	}
2537 	/* pass the frame up the stack */
2538 	(*ifp->if_input)(ifp, m);
2539 }
2540 
2541 static inline void
2542 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2543 {
2544 	mxge_softc_t *sc;
2545 	struct ifnet *ifp;
2546 	struct ether_header *eh;
2547 	struct mbuf *m;
2548 	mxge_rx_ring_t *rx;
2549 	bus_dmamap_t old_map;
2550 	int idx;
2551 	uint16_t tcpudp_csum;
2552 
2553 	sc = ss->sc;
2554 	ifp = sc->ifp;
2555 	rx = &ss->rx_small;
2556 	idx = rx->cnt & rx->mask;
2557 	rx->cnt++;
2558 	/* save a pointer to the received mbuf */
2559 	m = rx->info[idx].m;
2560 	/* try to replace the received mbuf */
2561 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2562 		/* drop the frame -- the old mbuf is re-cycled */
2563 		ifp->if_ierrors++;
2564 		return;
2565 	}
2566 
2567 	/* unmap the received buffer */
2568 	old_map = rx->info[idx].map;
2569 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2570 	bus_dmamap_unload(rx->dmat, old_map);
2571 
2572 	/* swap the bus_dmamap_t's */
2573 	rx->info[idx].map = rx->extra_map;
2574 	rx->extra_map = old_map;
2575 
2576 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2577 	 * aligned */
2578 	m->m_data += MXGEFW_PAD;
2579 
2580 	m->m_pkthdr.rcvif = ifp;
2581 	m->m_len = m->m_pkthdr.len = len;
2582 	ss->ipackets++;
2583 	eh = mtod(m, struct ether_header *);
2584 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2585 		mxge_vlan_tag_remove(m, &csum);
2586 	}
2587 	/* if the checksum is valid, mark it in the mbuf header */
2588 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2589 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2590 			return;
2591 		/* otherwise, it was a UDP frame, or a TCP frame which
2592 		   we could not do LRO on.  Tell the stack that the
2593 		   checksum is good */
2594 		m->m_pkthdr.csum_data = 0xffff;
2595 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2596 	}
2597 	/* flowid only valid if RSS hashing is enabled */
2598 	if (sc->num_slices > 1) {
2599 		m->m_pkthdr.flowid = (ss - sc->ss);
2600 		m->m_flags |= M_FLOWID;
2601 	}
2602 	/* pass the frame up the stack */
2603 	(*ifp->if_input)(ifp, m);
2604 }
2605 
2606 static inline void
2607 mxge_clean_rx_done(struct mxge_slice_state *ss)
2608 {
2609 	mxge_rx_done_t *rx_done = &ss->rx_done;
2610 	struct lro_entry *lro;
2611 	int limit = 0;
2612 	uint16_t length;
2613 	uint16_t checksum;
2614 
2615 
2616 	while (rx_done->entry[rx_done->idx].length != 0) {
2617 		length = ntohs(rx_done->entry[rx_done->idx].length);
2618 		rx_done->entry[rx_done->idx].length = 0;
2619 		checksum = rx_done->entry[rx_done->idx].checksum;
2620 		if (length <= (MHLEN - MXGEFW_PAD))
2621 			mxge_rx_done_small(ss, length, checksum);
2622 		else
2623 			mxge_rx_done_big(ss, length, checksum);
2624 		rx_done->cnt++;
2625 		rx_done->idx = rx_done->cnt & rx_done->mask;
2626 
2627 		/* limit potential for livelock */
2628 		if (__predict_false(++limit > rx_done->mask / 2))
2629 			break;
2630 	}
2631 	while (!SLIST_EMPTY(&ss->lro_active)) {
2632 		lro = SLIST_FIRST(&ss->lro_active);
2633 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2634 		mxge_lro_flush(ss, lro);
2635 	}
2636 }
2637 
2638 
2639 static inline void
2640 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2641 {
2642 	struct ifnet *ifp;
2643 	mxge_tx_ring_t *tx;
2644 	struct mbuf *m;
2645 	bus_dmamap_t map;
2646 	int idx;
2647 	int *flags;
2648 
2649 	tx = &ss->tx;
2650 	ifp = ss->sc->ifp;
2651 	while (tx->pkt_done != mcp_idx) {
2652 		idx = tx->done & tx->mask;
2653 		tx->done++;
2654 		m = tx->info[idx].m;
2655 		/* mbuf and DMA map only attached to the first
2656 		   segment per-mbuf */
2657 		if (m != NULL) {
2658 			ss->opackets++;
2659 			tx->info[idx].m = NULL;
2660 			map = tx->info[idx].map;
2661 			bus_dmamap_unload(tx->dmat, map);
2662 			m_freem(m);
2663 		}
2664 		if (tx->info[idx].flag) {
2665 			tx->info[idx].flag = 0;
2666 			tx->pkt_done++;
2667 		}
2668 	}
2669 
2670 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2671            it's OK to send packets */
2672 #ifdef IFNET_BUF_RING
2673 	flags = &ss->if_drv_flags;
2674 #else
2675 	flags = &ifp->if_drv_flags;
2676 #endif
2677 	mtx_lock(&ss->tx.mtx);
2678 	if ((*flags) & IFF_DRV_OACTIVE &&
2679 	    tx->req - tx->done < (tx->mask + 1)/4) {
2680 		*(flags) &= ~IFF_DRV_OACTIVE;
2681 		ss->tx.wake++;
2682 		mxge_start_locked(ss);
2683 	}
2684 #ifdef IFNET_BUF_RING
2685 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2686 		/* let the NIC stop polling this queue, since there
2687 		 * are no more transmits pending */
2688 		*tx->send_stop = 1;
2689 		tx->queue_active = 0;
2690 		tx->deactivate++;
2691 		wmb();
2692 	}
2695 #endif
2696 	mtx_unlock(&ss->tx.mtx);
2697 
2698 }
2699 
2700 static struct mxge_media_type mxge_xfp_media_types[] =
2701 {
2702 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2703 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2704 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2705 	{0,		(1 << 5),	"10GBASE-ER"},
2706 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2707 	{0,		(1 << 3),	"10GBASE-SW"},
2708 	{0,		(1 << 2),	"10GBASE-LW"},
2709 	{0,		(1 << 1),	"10GBASE-EW"},
2710 	{0,		(1 << 0),	"Reserved"}
2711 };
2712 static struct mxge_media_type mxge_sfp_media_types[] =
2713 {
2714 	{0,		(1 << 7),	"Reserved"},
2715 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2716 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2717 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2718 };
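
/*
 * Example decode (illustrative): an SFP+ module whose compliance
 * byte reads 0x10 has bit 4 set, which the table above maps to
 * IFM_10G_SR; 0x20 (bit 5) maps to IFM_10G_LR.  mxge_media_probe()
 * below reads that byte over I2C and walks the appropriate table.
 */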
2719 
2720 static void
2721 mxge_set_media(mxge_softc_t *sc, int type)
2722 {
2723 	sc->media_flags |= type;
2724 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2725 	ifmedia_set(&sc->media, sc->media_flags);
2726 }
2727 
2728 
2729 /*
2730  * Determine the media type for a NIC.  Some XFPs will identify
2731  * themselves only when their link is up, so this is initiated via a
2732  * link up interrupt.  However, this can potentially take up to
2733  * several milliseconds, so it is run via the watchdog routine, rather
2734  * than in the interrupt handler itself.   This need only be done
2735  * once, not each time the link is up.
2736  */
2737 static void
2738 mxge_media_probe(mxge_softc_t *sc)
2739 {
2740 	mxge_cmd_t cmd;
2741 	char *cage_type;
2742 	char *ptr;
2743 	struct mxge_media_type *mxge_media_types = NULL;
2744 	int i, err, ms, mxge_media_type_entries;
2745 	uint32_t byte;
2746 
2747 	sc->need_media_probe = 0;
2748 
2749 	/* if we've already set a media type, we're done */
2750 	if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2751 		return;
2752 
2753 	/*
2754 	 * parse the product code to determine the interface type
2755 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2756 	 * after the 3rd dash in the driver's cached copy of the
2757 	 * EEPROM's product code string.
2758 	 */
2759 	ptr = sc->product_code_string;
2760 	if (ptr == NULL) {
2761 		device_printf(sc->dev, "Missing product code\n");
		return;
2762 	}
2763 
2764 	for (i = 0; i < 3; i++, ptr++) {
2765 		ptr = index(ptr, '-');
2766 		if (ptr == NULL) {
2767 			device_printf(sc->dev,
2768 				      "only %d dashes in PC?!?\n", i);
2769 			return;
2770 		}
2771 	}
2772 	if (*ptr == 'C') {
2773 		/* -C is CX4 */
2774 		mxge_set_media(sc, IFM_10G_CX4);
2775 		return;
2776 	}
2777 	else if (*ptr == 'Q') {
2778 		/* -Q is Quad Ribbon Fiber */
2779 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2780 		/* FreeBSD has no media type for Quad ribbon fiber */
2781 		return;
2782 	}
2783 
2784 	if (*ptr == 'R') {
2785 		/* -R is XFP */
2786 		mxge_media_types = mxge_xfp_media_types;
2787 		mxge_media_type_entries =
2788 			sizeof (mxge_xfp_media_types) /
2789 			sizeof (mxge_xfp_media_types[0]);
2790 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2791 		cage_type = "XFP";
2792 	}
2793 
2794 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2795 		/* -S or -2S is SFP+ */
2796 		mxge_media_types = mxge_sfp_media_types;
2797 		mxge_media_type_entries =
2798 			sizeof (mxge_sfp_media_types) /
2799 			sizeof (mxge_sfp_media_types[0]);
2800 		cage_type = "SFP+";
2801 		byte = 3;
2802 	}
2803 
2804 	if (mxge_media_types == NULL) {
2805 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2806 		return;
2807 	}
2808 
2809 	/*
2810 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2811 	 * now we try to determine what is in the cage by using the
2812 	 * firmware's XFP I2C commands to read the module's 10GbE
2813 	 * compliance register.  We read just one byte, which may take
2814 	 * over a millisecond.
2815 	 */
2816 
2817 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2818 	cmd.data1 = byte;
2819 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2820 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2821 		device_printf(sc->dev, "failed to read XFP\n");
2822 	}
2823 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2824 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2825 	}
2826 	if (err != MXGEFW_CMD_OK) {
2827 		return;
2828 	}
2829 
2830 	/* now we wait for the data to be cached */
2831 	cmd.data0 = byte;
2832 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2833 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2834 		DELAY(1000);
2835 		cmd.data0 = byte;
2836 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2837 	}
2838 	if (err != MXGEFW_CMD_OK) {
2839 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2840 			      cage_type, err, ms);
2841 		return;
2842 	}
2843 
2844 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2845 		if (mxge_verbose)
2846 			device_printf(sc->dev, "%s:%s\n", cage_type,
2847 				      mxge_media_types[0].name);
2848 		mxge_set_media(sc, IFM_10G_CX4);
2849 		return;
2850 	}
2851 	for (i = 1; i < mxge_media_type_entries; i++) {
2852 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2853 			if (mxge_verbose)
2854 				device_printf(sc->dev, "%s:%s\n",
2855 					      cage_type,
2856 					      mxge_media_types[i].name);
2857 
2858 			mxge_set_media(sc, mxge_media_types[i].flag);
2859 			return;
2860 		}
2861 	}
2862 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2863 		      cmd.data0);
2864 
2865 	return;
2866 }
2867 
2868 static void
2869 mxge_intr(void *arg)
2870 {
2871 	struct mxge_slice_state *ss = arg;
2872 	mxge_softc_t *sc = ss->sc;
2873 	mcp_irq_data_t *stats = ss->fw_stats;
2874 	mxge_tx_ring_t *tx = &ss->tx;
2875 	mxge_rx_done_t *rx_done = &ss->rx_done;
2876 	uint32_t send_done_count;
2877 	uint8_t valid;
2878 
2879 
2880 #ifndef IFNET_BUF_RING
2881 	/* an interrupt on a non-zero slice is implicitly valid
2882 	   since MSI-X irqs are not shared */
2883 	if (ss != sc->ss) {
2884 		mxge_clean_rx_done(ss);
2885 		*ss->irq_claim = be32toh(3);
2886 		return;
2887 	}
2888 #endif
2889 
2890 	/* make sure the DMA has finished */
2891 	if (!stats->valid) {
2892 		return;
2893 	}
2894 	valid = stats->valid;
2895 
2896 	if (sc->legacy_irq) {
2897 		/* lower legacy IRQ  */
2898 		*sc->irq_deassert = 0;
2899 		if (!mxge_deassert_wait)
2900 			/* don't wait for conf. that irq is low */
2901 			stats->valid = 0;
2902 	} else {
2903 		stats->valid = 0;
2904 	}
2905 
2906 	/* loop while waiting for legacy irq deassertion */
2907 	do {
2908 		/* check for transmit completes and receives */
2909 		send_done_count = be32toh(stats->send_done_count);
2910 		while ((send_done_count != tx->pkt_done) ||
2911 		       (rx_done->entry[rx_done->idx].length != 0)) {
2912 			if (send_done_count != tx->pkt_done)
2913 				mxge_tx_done(ss, (int)send_done_count);
2914 			mxge_clean_rx_done(ss);
2915 			send_done_count = be32toh(stats->send_done_count);
2916 		}
2917 		if (sc->legacy_irq && mxge_deassert_wait)
2918 			wmb();
2919 	} while (*((volatile uint8_t *) &stats->valid));
2920 
2921 	/* fw link & error stats meaningful only on the first slice */
2922 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2923 		if (sc->link_state != stats->link_up) {
2924 			sc->link_state = stats->link_up;
2925 			if (sc->link_state) {
2926 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2927 				if (mxge_verbose)
2928 					device_printf(sc->dev, "link up\n");
2929 			} else {
2930 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2931 				if (mxge_verbose)
2932 					device_printf(sc->dev, "link down\n");
2933 			}
2934 			sc->need_media_probe = 1;
2935 		}
2936 		if (sc->rdma_tags_available !=
2937 		    be32toh(stats->rdma_tags_available)) {
2938 			sc->rdma_tags_available =
2939 				be32toh(stats->rdma_tags_available);
2940 			device_printf(sc->dev, "RDMA timed out! %d tags "
2941 				      "left\n", sc->rdma_tags_available);
2942 		}
2943 
2944 		if (stats->link_down) {
2945 			sc->down_cnt += stats->link_down;
2946 			sc->link_state = 0;
2947 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2948 		}
2949 	}
2950 
2951 	/* check to see if we have rx token to pass back */
2952 	if (valid & 0x1)
2953 		*ss->irq_claim = be32toh(3);
2954 	*(ss->irq_claim + 1) = be32toh(3);
2955 }
2956 
2957 static void
2958 mxge_init(void *arg)
2959 {
2960 }
2961 
2962 
2963 
2964 static void
2965 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2966 {
2967 	struct lro_entry *lro_entry;
2968 	int i;
2969 
2970 	while (!SLIST_EMPTY(&ss->lro_free)) {
2971 		lro_entry = SLIST_FIRST(&ss->lro_free);
2972 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2973 		free(lro_entry, M_DEVBUF);
2974 	}
2975 
2976 	for (i = 0; i <= ss->rx_big.mask; i++) {
2977 		if (ss->rx_big.info[i].m == NULL)
2978 			continue;
2979 		bus_dmamap_unload(ss->rx_big.dmat,
2980 				  ss->rx_big.info[i].map);
2981 		m_freem(ss->rx_big.info[i].m);
2982 		ss->rx_big.info[i].m = NULL;
2983 	}
2984 
2985 	for (i = 0; i <= ss->rx_small.mask; i++) {
2986 		if (ss->rx_small.info[i].m == NULL)
2987 			continue;
2988 		bus_dmamap_unload(ss->rx_small.dmat,
2989 				  ss->rx_small.info[i].map);
2990 		m_freem(ss->rx_small.info[i].m);
2991 		ss->rx_small.info[i].m = NULL;
2992 	}
2993 
2994 	/* transmit ring used only on the first slice */
2995 	if (ss->tx.info == NULL)
2996 		return;
2997 
2998 	for (i = 0; i <= ss->tx.mask; i++) {
2999 		ss->tx.info[i].flag = 0;
3000 		if (ss->tx.info[i].m == NULL)
3001 			continue;
3002 		bus_dmamap_unload(ss->tx.dmat,
3003 				  ss->tx.info[i].map);
3004 		m_freem(ss->tx.info[i].m);
3005 		ss->tx.info[i].m = NULL;
3006 	}
3007 }
3008 
3009 static void
3010 mxge_free_mbufs(mxge_softc_t *sc)
3011 {
3012 	int slice;
3013 
3014 	for (slice = 0; slice < sc->num_slices; slice++)
3015 		mxge_free_slice_mbufs(&sc->ss[slice]);
3016 }
3017 
3018 static void
3019 mxge_free_slice_rings(struct mxge_slice_state *ss)
3020 {
3021 	int i;
3022 
3023 
3024 	if (ss->rx_done.entry != NULL)
3025 		mxge_dma_free(&ss->rx_done.dma);
3026 	ss->rx_done.entry = NULL;
3027 
3028 	if (ss->tx.req_bytes != NULL)
3029 		free(ss->tx.req_bytes, M_DEVBUF);
3030 	ss->tx.req_bytes = NULL;
3031 
3032 	if (ss->tx.seg_list != NULL)
3033 		free(ss->tx.seg_list, M_DEVBUF);
3034 	ss->tx.seg_list = NULL;
3035 
3036 	if (ss->rx_small.shadow != NULL)
3037 		free(ss->rx_small.shadow, M_DEVBUF);
3038 	ss->rx_small.shadow = NULL;
3039 
3040 	if (ss->rx_big.shadow != NULL)
3041 		free(ss->rx_big.shadow, M_DEVBUF);
3042 	ss->rx_big.shadow = NULL;
3043 
3044 	if (ss->tx.info != NULL) {
3045 		if (ss->tx.dmat != NULL) {
3046 			for (i = 0; i <= ss->tx.mask; i++) {
3047 				bus_dmamap_destroy(ss->tx.dmat,
3048 						   ss->tx.info[i].map);
3049 			}
3050 			bus_dma_tag_destroy(ss->tx.dmat);
3051 		}
3052 		free(ss->tx.info, M_DEVBUF);
3053 	}
3054 	ss->tx.info = NULL;
3055 
3056 	if (ss->rx_small.info != NULL) {
3057 		if (ss->rx_small.dmat != NULL) {
3058 			for (i = 0; i <= ss->rx_small.mask; i++) {
3059 				bus_dmamap_destroy(ss->rx_small.dmat,
3060 						   ss->rx_small.info[i].map);
3061 			}
3062 			bus_dmamap_destroy(ss->rx_small.dmat,
3063 					   ss->rx_small.extra_map);
3064 			bus_dma_tag_destroy(ss->rx_small.dmat);
3065 		}
3066 		free(ss->rx_small.info, M_DEVBUF);
3067 	}
3068 	ss->rx_small.info = NULL;
3069 
3070 	if (ss->rx_big.info != NULL) {
3071 		if (ss->rx_big.dmat != NULL) {
3072 			for (i = 0; i <= ss->rx_big.mask; i++) {
3073 				bus_dmamap_destroy(ss->rx_big.dmat,
3074 						   ss->rx_big.info[i].map);
3075 			}
3076 			bus_dmamap_destroy(ss->rx_big.dmat,
3077 					   ss->rx_big.extra_map);
3078 			bus_dma_tag_destroy(ss->rx_big.dmat);
3079 		}
3080 		free(ss->rx_big.info, M_DEVBUF);
3081 	}
3082 	ss->rx_big.info = NULL;
3083 }
3084 
3085 static void
3086 mxge_free_rings(mxge_softc_t *sc)
3087 {
3088 	int slice;
3089 
3090 	for (slice = 0; slice < sc->num_slices; slice++)
3091 		mxge_free_slice_rings(&sc->ss[slice]);
3092 }
3093 
3094 static int
3095 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3096 		       int tx_ring_entries)
3097 {
3098 	mxge_softc_t *sc = ss->sc;
3099 	size_t bytes;
3100 	int err, i;
3101 
3102 	err = ENOMEM;
3103 
3104 	/* allocate per-slice receive resources */
3105 
3106 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3107 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3108 
3109 	/* allocate the rx shadow rings */
3110 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3111 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3112 	if (ss->rx_small.shadow == NULL)
3113 		return err;
3114 
3115 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3116 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3117 	if (ss->rx_big.shadow == NULL)
3118 		return err;
3119 
3120 	/* allocate the rx host info rings */
3121 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3122 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3123 	if (ss->rx_small.info == NULL)
3124 		return err;
3125 
3126 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3127 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3128 	if (ss->rx_big.info == NULL)
3129 		return err;
3130 
3131 	/* allocate the rx busdma resources */
3132 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3133 				 1,			/* alignment */
3134 				 4096,			/* boundary */
3135 				 BUS_SPACE_MAXADDR,	/* low */
3136 				 BUS_SPACE_MAXADDR,	/* high */
3137 				 NULL, NULL,		/* filter */
3138 				 MHLEN,			/* maxsize */
3139 				 1,			/* num segs */
3140 				 MHLEN,			/* maxsegsize */
3141 				 BUS_DMA_ALLOCNOW,	/* flags */
3142 				 NULL, NULL,		/* lock */
3143 				 &ss->rx_small.dmat);	/* tag */
3144 	if (err != 0) {
3145 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3146 			      err);
3147 		return err;
3148 	}
3149 
3150 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3151 				 1,			/* alignment */
3152 #if MXGE_VIRT_JUMBOS
3153 				 4096,			/* boundary */
3154 #else
3155 				 0,			/* boundary */
3156 #endif
3157 				 BUS_SPACE_MAXADDR,	/* low */
3158 				 BUS_SPACE_MAXADDR,	/* high */
3159 				 NULL, NULL,		/* filter */
3160 				 3*4096,		/* maxsize */
3161 #if MXGE_VIRT_JUMBOS
3162 				 3,			/* num segs */
3163 				 4096,			/* maxsegsize*/
3164 #else
3165 				 1,			/* num segs */
3166 				 MJUM9BYTES,		/* maxsegsize*/
3167 #endif
3168 				 BUS_DMA_ALLOCNOW,	/* flags */
3169 				 NULL, NULL,		/* lock */
3170 				 &ss->rx_big.dmat);	/* tag */
3171 	if (err != 0) {
3172 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3173 			      err);
3174 		return err;
3175 	}
3176 	for (i = 0; i <= ss->rx_small.mask; i++) {
3177 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3178 					&ss->rx_small.info[i].map);
3179 		if (err != 0) {
3180 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3181 				      err);
3182 			return err;
3183 		}
3184 	}
3185 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3186 				&ss->rx_small.extra_map);
3187 	if (err != 0) {
3188 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3189 			      err);
3190 		return err;
3191 	}
3192 
3193 	for (i = 0; i <= ss->rx_big.mask; i++) {
3194 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3195 					&ss->rx_big.info[i].map);
3196 		if (err != 0) {
3197 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3198 				      err);
3199 			return err;
3200 		}
3201 	}
3202 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3203 				&ss->rx_big.extra_map);
3204 	if (err != 0) {
3205 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3206 			      err);
3207 		return err;
3208 	}
3209 
3210 	/* now allocate TX resources */
3211 
3212 #ifndef IFNET_BUF_RING
3213 	/* only use a single TX ring for now */
3214 	if (ss != ss->sc->ss)
3215 		return 0;
3216 #endif
3217 
3218 	ss->tx.mask = tx_ring_entries - 1;
3219 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3220 
3221 
3222 	/* allocate the tx request copy block */
3223 	bytes = 8 +
3224 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3225 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3226 	if (ss->tx.req_bytes == NULL)
3227 		return err;
3228 	/* ensure req_list entries are aligned to 8 bytes */
3229 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3230 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
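	/*
	 * Illustrative: the round-up works for any address, e.g.
	 * req_bytes == 0x1003 gives (0x1003 + 7) & ~7 == 0x1008, the
	 * next 8-byte boundary, while an already-aligned 0x1008 maps
	 * to itself.
	 */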
3231 
3232 	/* allocate the tx busdma segment list */
3233 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3234 	ss->tx.seg_list = (bus_dma_segment_t *)
3235 		malloc(bytes, M_DEVBUF, M_WAITOK);
3236 	if (ss->tx.seg_list == NULL)
3237 		return err;
3238 
3239 	/* allocate the tx host info ring */
3240 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3241 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3242 	if (ss->tx.info == NULL)
3243 		return err;
3244 
3245 	/* allocate the tx busdma resources */
3246 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3247 				 1,			/* alignment */
3248 				 sc->tx_boundary,	/* boundary */
3249 				 BUS_SPACE_MAXADDR,	/* low */
3250 				 BUS_SPACE_MAXADDR,	/* high */
3251 				 NULL, NULL,		/* filter */
3252 				 65536 + 256,		/* maxsize */
3253 				 ss->tx.max_desc - 2,	/* num segs */
3254 				 sc->tx_boundary,	/* maxsegsz */
3255 				 BUS_DMA_ALLOCNOW,	/* flags */
3256 				 NULL, NULL,		/* lock */
3257 				 &ss->tx.dmat);		/* tag */
3258 
3259 	if (err != 0) {
3260 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3261 			      err);
3262 		return err;
3263 	}
3264 
3265 	/* now use these tags to setup dmamaps for each slot
3266 	   in the ring */
3267 	for (i = 0; i <= ss->tx.mask; i++) {
3268 		err = bus_dmamap_create(ss->tx.dmat, 0,
3269 					&ss->tx.info[i].map);
3270 		if (err != 0) {
3271 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3272 				      err);
3273 			return err;
3274 		}
3275 	}
3276 	return 0;
3277 
3278 }
3279 
3280 static int
3281 mxge_alloc_rings(mxge_softc_t *sc)
3282 {
3283 	mxge_cmd_t cmd;
3284 	int tx_ring_size;
3285 	int tx_ring_entries, rx_ring_entries;
3286 	int err, slice;
3287 
3288 	/* get ring sizes */
3289 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3290 	tx_ring_size = cmd.data0;
3291 	if (err != 0) {
3292 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3293 		goto abort;
3294 	}
3295 
3296 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3297 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3298 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3299 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3300 	IFQ_SET_READY(&sc->ifp->if_snd);
3301 
3302 	for (slice = 0; slice < sc->num_slices; slice++) {
3303 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3304 					     rx_ring_entries,
3305 					     tx_ring_entries);
3306 		if (err != 0)
3307 			goto abort;
3308 	}
3309 	return 0;
3310 
3311 abort:
3312 	mxge_free_rings(sc);
3313 	return err;
3314 
3315 }
3316 
3317 
3318 static void
3319 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3320 {
3321 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3322 
3323 	if (bufsize < MCLBYTES) {
3324 		/* easy, everything fits in a single buffer */
3325 		*big_buf_size = MCLBYTES;
3326 		*cl_size = MCLBYTES;
3327 		*nbufs = 1;
3328 		return;
3329 	}
3330 
3331 	if (bufsize < MJUMPAGESIZE) {
3332 		/* still easy, everything still fits in a single buffer */
3333 		*big_buf_size = MJUMPAGESIZE;
3334 		*cl_size = MJUMPAGESIZE;
3335 		*nbufs = 1;
3336 		return;
3337 	}
3338 #if MXGE_VIRT_JUMBOS
3339 	/* now we need to use virtually contiguous buffers */
3340 	*cl_size = MJUM9BYTES;
3341 	*big_buf_size = 4096;
3342 	*nbufs = mtu / 4096 + 1;
3343 	/* needs to be a power of two, so round up */
3344 	if (*nbufs == 3)
3345 		*nbufs = 4;
3346 #else
3347 	*cl_size = MJUM9BYTES;
3348 	*big_buf_size = MJUM9BYTES;
3349 	*nbufs = 1;
3350 #endif
3351 }
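
/*
 * Worked example (illustrative, MXGE_VIRT_JUMBOS unset): for a
 * 9000-byte MTU, bufsize = 9000 + 14 + 4 + 2 = 9020, which exceeds
 * both MCLBYTES (2KB) and MJUMPAGESIZE (normally the 4KB page size),
 * so a single MJUM9BYTES (9KB) cluster is used with nbufs = 1; for
 * the default 1500-byte MTU, bufsize = 1520 fits in one 2KB cluster.
 */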
3352 
3353 static int
3354 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3355 {
3356 	mxge_softc_t *sc;
3357 	mxge_cmd_t cmd;
3358 	bus_dmamap_t map;
3359 	struct lro_entry *lro_entry;
3360 	int err, i, slice;
3361 
3362 
3363 	sc = ss->sc;
3364 	slice = ss - sc->ss;
3365 
3366 	SLIST_INIT(&ss->lro_free);
3367 	SLIST_INIT(&ss->lro_active);
3368 
3369 	for (i = 0; i < sc->lro_cnt; i++) {
3370 		lro_entry = (struct lro_entry *)
3371 			malloc(sizeof (*lro_entry), M_DEVBUF,
3372 			       M_NOWAIT | M_ZERO);
3373 		if (lro_entry == NULL) {
3374 			sc->lro_cnt = i;
3375 			break;
3376 		}
3377 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3378 	}
3379 	/* get the lanai pointers to the send and receive rings */
3380 
3381 	err = 0;
3382 #ifndef IFNET_BUF_RING
3383 	/* We currently only send from the first slice */
3384 	if (slice == 0) {
3385 #endif
3386 		cmd.data0 = slice;
3387 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3388 		ss->tx.lanai =
3389 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3390 		ss->tx.send_go = (volatile uint32_t *)
3391 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3392 		ss->tx.send_stop = (volatile uint32_t *)
3393 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3394 #ifndef IFNET_BUF_RING
3395 	}
3396 #endif
3397 	cmd.data0 = slice;
3398 	err |= mxge_send_cmd(sc,
3399 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3400 	ss->rx_small.lanai =
3401 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3402 	cmd.data0 = slice;
3403 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3404 	ss->rx_big.lanai =
3405 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3406 
3407 	if (err != 0) {
3408 		device_printf(sc->dev,
3409 			      "failed to get ring sizes or locations\n");
3410 		return EIO;
3411 	}
3412 
3413 	/* stock receive rings */
3414 	for (i = 0; i <= ss->rx_small.mask; i++) {
3415 		map = ss->rx_small.info[i].map;
3416 		err = mxge_get_buf_small(ss, map, i);
3417 		if (err) {
3418 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3419 				      i, ss->rx_small.mask + 1);
3420 			return ENOMEM;
3421 		}
3422 	}
3423 	for (i = 0; i <= ss->rx_big.mask; i++) {
3424 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3425 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3426 	}
3427 	ss->rx_big.nbufs = nbufs;
3428 	ss->rx_big.cl_size = cl_size;
3429 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3430 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3431 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3432 		map = ss->rx_big.info[i].map;
3433 		err = mxge_get_buf_big(ss, map, i);
3434 		if (err) {
3435 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3436 				      i, ss->rx_big.mask + 1);
3437 			return ENOMEM;
3438 		}
3439 	}
3440 	return 0;
3441 }
3442 
3443 static int
3444 mxge_open(mxge_softc_t *sc)
3445 {
3446 	mxge_cmd_t cmd;
3447 	int err, big_bytes, nbufs, slice, cl_size, i;
3448 	bus_addr_t bus;
3449 	volatile uint8_t *itable;
3450 	struct mxge_slice_state *ss;
3451 
3452 	/* Copy the MAC address in case it was overridden */
3453 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3454 
3455 	err = mxge_reset(sc, 1);
3456 	if (err != 0) {
3457 		device_printf(sc->dev, "failed to reset\n");
3458 		return EIO;
3459 	}
3460 
3461 	if (sc->num_slices > 1) {
3462 		/* setup the indirection table */
3463 		cmd.data0 = sc->num_slices;
3464 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3465 				    &cmd);
3466 
3467 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3468 				     &cmd);
3469 		if (err != 0) {
3470 			device_printf(sc->dev,
3471 				      "failed to setup rss tables\n");
3472 			return err;
3473 		}
3474 
3475 		/* just enable an identity mapping */
3476 		itable = sc->sram + cmd.data0;
3477 		for (i = 0; i < sc->num_slices; i++)
3478 			itable[i] = (uint8_t)i;
3479 
3480 		cmd.data0 = 1;
3481 		cmd.data1 = mxge_rss_hash_type;
3482 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3483 		if (err != 0) {
3484 			device_printf(sc->dev, "failed to enable slices\n");
3485 			return err;
3486 		}
3487 	}
3488 
3489 
3490 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3491 
3492 	cmd.data0 = nbufs;
3493 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3494 			    &cmd);
3495 	/* error is only meaningful if we're trying to set
3496 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3497 	if (err && nbufs > 1) {
3498 		device_printf(sc->dev,
3499 			      "Failed to set always-use-n to %d\n",
3500 			      nbufs);
3501 		return EIO;
3502 	}
3503 	/* Give the firmware the mtu and the big and small buffer
3504 	   sizes.  The firmware wants the big buf size to be a power
3505 	   of two. Luckily, FreeBSD's clusters are powers of two */
3506 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3507 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3508 	cmd.data0 = MHLEN - MXGEFW_PAD;
3509 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3510 			     &cmd);
3511 	cmd.data0 = big_bytes;
3512 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3513 
3514 	if (err != 0) {
3515 		device_printf(sc->dev, "failed to setup params\n");
3516 		goto abort;
3517 	}
3518 
3519 	/* Now give the firmware the pointer to the stats block */
3520 	for (slice = 0;
3521 #ifdef IFNET_BUF_RING
3522 	     slice < sc->num_slices;
3523 #else
3524 	     slice < 1;
3525 #endif
3526 	     slice++) {
3527 		ss = &sc->ss[slice];
3528 		cmd.data0 =
3529 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3530 		cmd.data1 =
3531 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3532 		cmd.data2 = sizeof(struct mcp_irq_data);
3533 		cmd.data2 |= (slice << 16);
3534 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3535 	}
3536 
3537 	if (err != 0) {
3538 		bus = sc->ss->fw_stats_dma.bus_addr;
3539 		bus += offsetof(struct mcp_irq_data, send_done_count);
3540 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3541 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3542 		err = mxge_send_cmd(sc,
3543 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3544 				    &cmd);
3545 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3546 		sc->fw_multicast_support = 0;
3547 	} else {
3548 		sc->fw_multicast_support = 1;
3549 	}
3550 
3551 	if (err != 0) {
3552 		device_printf(sc->dev, "failed to setup params\n");
3553 		goto abort;
3554 	}
3555 
3556 	for (slice = 0; slice < sc->num_slices; slice++) {
3557 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3558 		if (err != 0) {
3559 			device_printf(sc->dev, "couldn't open slice %d\n",
3560 				      slice);
3561 			goto abort;
3562 		}
3563 	}
3564 
3565 	/* Finally, start the firmware running */
3566 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3567 	if (err) {
3568 		device_printf(sc->dev, "Couldn't bring up link\n");
3569 		goto abort;
3570 	}
3571 #ifdef IFNET_BUF_RING
3572 	for (slice = 0; slice < sc->num_slices; slice++) {
3573 		ss = &sc->ss[slice];
3574 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3575 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3576 	}
3577 #endif
3578 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3579 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3580 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3581 
3582 	return 0;
3583 
3584 
3585 abort:
3586 	mxge_free_mbufs(sc);
3587 
3588 	return err;
3589 }
3590 
3591 static int
3592 mxge_close(mxge_softc_t *sc)
3593 {
3594 	mxge_cmd_t cmd;
3595 	int err, old_down_cnt;
3596 #ifdef IFNET_BUF_RING
3597 	struct mxge_slice_state *ss;
3598 	int slice;
3599 #endif
3600 
3601 	callout_stop(&sc->co_hdl);
3602 #ifdef IFNET_BUF_RING
3603 	for (slice = 0; slice < sc->num_slices; slice++) {
3604 		ss = &sc->ss[slice];
3605 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3606 	}
3607 #endif
3608 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3609 	old_down_cnt = sc->down_cnt;
3610 	wmb();
3611 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3612 	if (err) {
3613 		device_printf(sc->dev, "Couldn't bring down link\n");
3614 	}
3615 	if (old_down_cnt == sc->down_cnt) {
3616 		/* wait for down irq */
3617 		DELAY(10 * sc->intr_coal_delay);
3618 	}
3619 	wmb();
3620 	if (old_down_cnt == sc->down_cnt) {
3621 		device_printf(sc->dev, "never got down irq\n");
3622 	}
3623 
3624 	mxge_free_mbufs(sc);
3625 
3626 	return 0;
3627 }
3628 
3629 static void
3630 mxge_setup_cfg_space(mxge_softc_t *sc)
3631 {
3632 	device_t dev = sc->dev;
3633 	int reg;
3634 	uint16_t cmd, lnk, pectl;
3635 
3636 	/* find the PCIe link width and set max read request to 4KB */
3637 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3638 		lnk = pci_read_config(dev, reg + 0x12, 2);
3639 		sc->link_width = (lnk >> 4) & 0x3f;
3640 
3641 		pectl = pci_read_config(dev, reg + 0x8, 2);
3642 		pectl = (pectl & ~0x7000) | (5 << 12);
3643 		pci_write_config(dev, reg + 0x8, pectl, 2);
3644 	}
3645 
3646 	/* Enable DMA and Memory space access */
3647 	pci_enable_busmaster(dev);
3648 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3649 	cmd |= PCIM_CMD_MEMEN;
3650 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3651 }
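
/*
 * Note (illustrative, assuming the standard PCIe capability layout):
 * reg + 0x8 is the Device Control register, whose bits 14:12 encode
 * the max read request size as 128 << n, so writing 5 selects
 * 128 << 5 = 4096 bytes; reg + 0x12 is the Link Status register,
 * whose bits 9:4 hold the negotiated link width extracted above.
 */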
3652 
3653 static uint32_t
3654 mxge_read_reboot(mxge_softc_t *sc)
3655 {
3656 	device_t dev = sc->dev;
3657 	uint32_t vs;
3658 
3659 	/* find the vendor specific offset */
3660 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3661 		device_printf(sc->dev,
3662 			      "could not find vendor specific offset\n");
3663 		return (uint32_t)-1;
3664 	}
3665 	/* enable read32 mode */
3666 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3667 	/* tell NIC which register to read */
3668 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3669 	return (pci_read_config(dev, vs + 0x14, 4));
3670 }
3671 
3672 static int
3673 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3674 {
3675 	struct pci_devinfo *dinfo;
3676 	mxge_tx_ring_t *tx;
3677 	int err;
3678 	uint32_t reboot;
3679 	uint16_t cmd;
3680 
3681 	err = ENXIO;
3682 
3683 	device_printf(sc->dev, "Watchdog reset!\n");
3684 
3685 	/*
3686 	 * check to see if the NIC rebooted.  If it did, then all of
3687 	 * PCI config space has been reset, and things like the
3688 	 * busmaster bit will be zero.  If this is the case, then we
3689 	 * must restore PCI config space before the NIC can be used
3690 	 * again
3691 	 */
3692 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3693 	if (cmd == 0xffff) {
3694 		/*
3695 		 * maybe the watchdog caught the NIC rebooting; wait
3696 		 * up to 100ms for it to finish.  If it does not come
3697 		 * back, then give up
3698 		 */
3699 		DELAY(1000*100);
3700 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3701 		if (cmd == 0xffff) {
3702 			device_printf(sc->dev, "NIC disappeared!\n");
3703 			return (err);
3704 		}
3705 	}
3706 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3707 		/* print the reboot status */
3708 		reboot = mxge_read_reboot(sc);
3709 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3710 			      reboot);
3711 		/* restore PCI configuration space */
3712 		dinfo = device_get_ivars(sc->dev);
3713 		pci_cfg_restore(sc->dev, dinfo);
3714 
3715 		/* and redo any changes we made to our config space */
3716 		mxge_setup_cfg_space(sc);
3717 
3718 		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3719 			mxge_close(sc);
3720 			err = mxge_open(sc);
3721 		}
3722 	} else {
3723 		tx = &sc->ss[slice].tx;
3724 		device_printf(sc->dev,
3725 			      "NIC did not reboot, slice %d ring state:\n",
3726 			      slice);
3727 		device_printf(sc->dev,
3728 			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3729 			      tx->req, tx->done, tx->queue_active);
3730 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3731 			      tx->activate, tx->deactivate);
3732 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3733 			      tx->pkt_done,
3734 			      be32toh(sc->ss->fw_stats->send_done_count));
3735 		device_printf(sc->dev, "not resetting\n");
3736 	}
3737 	return (err);
3738 }
3739 
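/*
 * transmit watchdog: reset the NIC if a tx ring has made no
 * progress for a full interval and the stall cannot be blamed
 * on received pause frames.  Without IFNET_BUF_RING, only
 * slice 0 carries transmits, so only it is checked.
 */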
3740 static int
3741 mxge_watchdog(mxge_softc_t *sc)
3742 {
3743 	mxge_tx_ring_t *tx;
3744 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3745 	int i, err = 0;
3746 
3747 	/* see if we have outstanding transmits, which
3748 	   have been pending for more than mxge_ticks */
3749 	for (i = 0;
3750 #ifdef IFNET_BUF_RING
3751 	     (i < sc->num_slices) && (err == 0);
3752 #else
3753 	     (i < 1) && (err == 0);
3754 #endif
3755 	     i++) {
3756 		tx = &sc->ss[i].tx;
3757 		if (tx->req != tx->done &&
3758 		    tx->watchdog_req != tx->watchdog_done &&
3759 		    tx->done == tx->watchdog_done) {
3760 			/* check for pause blocking before resetting */
3761 			if (tx->watchdog_rx_pause == rx_pause)
3762 				err = mxge_watchdog_reset(sc, i);
3763 			else
3764 				device_printf(sc->dev, "Flow control blocking "
3765 					      "xmits, check link partner\n");
3766 		}
3767 
3768 		tx->watchdog_req = tx->req;
3769 		tx->watchdog_done = tx->done;
3770 		tx->watchdog_rx_pause = rx_pause;
3771 	}
3772 
3773 	if (sc->need_media_probe)
3774 		mxge_media_probe(sc);
3775 	return (err);
3776 }
3777 
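/* aggregate the per-slice counters into the ifnet counters */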
3778 static void
3779 mxge_update_stats(mxge_softc_t *sc)
3780 {
3781 	struct mxge_slice_state *ss;
3782 	u_long ipackets = 0;
3783 	u_long opackets = 0;
3784 	u_long oerrors = 0;
3785 	int slice;
3786 
3787 	for (slice = 0; slice < sc->num_slices; slice++) {
3788 		ss = &sc->ss[slice];
3789 		ipackets += ss->ipackets;
3790 		opackets += ss->opackets;
3791 		oerrors += ss->oerrors;
3792 	}
3793 	sc->ifp->if_ipackets = ipackets;
3794 	sc->ifp->if_opackets = opackets;
3795 	sc->ifp->if_oerrors = oerrors;
3796 }
3797 
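/*
 * periodic housekeeping: refresh interface statistics, run the
 * transmit watchdog when its countdown expires, and re-arm the
 * callout unless a watchdog reset failed
 */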
3798 static void
3799 mxge_tick(void *arg)
3800 {
3801 	mxge_softc_t *sc = arg;
3802 	int err = 0;
3803 
3804 	/* aggregate stats from different slices */
3805 	mxge_update_stats(sc);
3806 	if (!sc->watchdog_countdown) {
3807 		err = mxge_watchdog(sc);
3808 		sc->watchdog_countdown = 4;
3809 	}
3810 	sc->watchdog_countdown--;
3811 	if (err == 0)
3812 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3814 }
3815 
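/* the NIC autoselects its media; manual changes are rejected */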
3816 static int
3817 mxge_media_change(struct ifnet *ifp)
3818 {
3819 	return EINVAL;
3820 }
3821 
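/*
 * change the MTU, bouncing the interface if it is running so the
 * rings can be re-sized; if the re-open fails, fall back to the
 * old MTU
 */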
3822 static int
3823 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3824 {
3825 	struct ifnet *ifp = sc->ifp;
3826 	int real_mtu, old_mtu;
3827 	int err = 0;
3828 
3829 
3830 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3831 	if (real_mtu > sc->max_mtu || real_mtu < 60)
3832 		return EINVAL;
3833 	mtx_lock(&sc->driver_mtx);
3834 	old_mtu = ifp->if_mtu;
3835 	ifp->if_mtu = mtu;
3836 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3837 		mxge_close(sc);
3838 		err = mxge_open(sc);
3839 		if (err != 0) {
3840 			ifp->if_mtu = old_mtu;
3841 			mxge_close(sc);
3842 			(void) mxge_open(sc);
3843 		}
3844 	}
3845 	mtx_unlock(&sc->driver_mtx);
3846 	return err;
3847 }
3848 
3849 static void
3850 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3851 {
3852 	mxge_softc_t *sc = ifp->if_softc;
3853 
3854 
3855 	if (sc == NULL)
3856 		return;
3857 	ifmr->ifm_status = IFM_AVALID;
3858 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3859 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3860 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3861 }
3862 
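/*
 * ifnet ioctl handler; the driver mutex serializes open/close
 * and capability/flag changes
 */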
3863 static int
3864 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3865 {
3866 	mxge_softc_t *sc = ifp->if_softc;
3867 	struct ifreq *ifr = (struct ifreq *)data;
3868 	int err, mask;
3869 
3870 	err = 0;
3871 	switch (command) {
3872 	case SIOCSIFADDR:
3873 	case SIOCGIFADDR:
3874 		err = ether_ioctl(ifp, command, data);
3875 		break;
3876 
3877 	case SIOCSIFMTU:
3878 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3879 		break;
3880 
3881 	case SIOCSIFFLAGS:
3882 		mtx_lock(&sc->driver_mtx);
3883 		if (ifp->if_flags & IFF_UP) {
3884 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3885 				err = mxge_open(sc);
3886 			} else {
3887 				/* take care of promisc and allmulti
3888 				   flag changes */
3889 				mxge_change_promisc(sc,
3890 						    ifp->if_flags & IFF_PROMISC);
3891 				mxge_set_multicast_list(sc);
3892 			}
3893 		} else {
3894 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3895 				mxge_close(sc);
3896 			}
3897 		}
3898 		mtx_unlock(&sc->driver_mtx);
3899 		break;
3900 
3901 	case SIOCADDMULTI:
3902 	case SIOCDELMULTI:
3903 		mtx_lock(&sc->driver_mtx);
3904 		mxge_set_multicast_list(sc);
3905 		mtx_unlock(&sc->driver_mtx);
3906 		break;
3907 
3908 	case SIOCSIFCAP:
3909 		mtx_lock(&sc->driver_mtx);
3910 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3911 		if (mask & IFCAP_TXCSUM) {
3912 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3913 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3914 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3915 						      | CSUM_TSO);
3916 			} else {
3917 				ifp->if_capenable |= IFCAP_TXCSUM;
3918 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3919 			}
3920 		} else if (mask & IFCAP_RXCSUM) {
3921 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3922 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3923 				sc->csum_flag = 0;
3924 			} else {
3925 				ifp->if_capenable |= IFCAP_RXCSUM;
3926 				sc->csum_flag = 1;
3927 			}
3928 		}
3929 		if (mask & IFCAP_TSO4) {
3930 			if (IFCAP_TSO4 & ifp->if_capenable) {
3931 				ifp->if_capenable &= ~IFCAP_TSO4;
3932 				ifp->if_hwassist &= ~CSUM_TSO;
3933 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3934 				ifp->if_capenable |= IFCAP_TSO4;
3935 				ifp->if_hwassist |= CSUM_TSO;
3936 			} else {
3937 				printf("mxge requires tx checksum offload"
3938 				       " to be enabled to use TSO\n");
3939 				err = EINVAL;
3940 			}
3941 		}
3942 		if (mask & IFCAP_LRO) {
3943 			if (IFCAP_LRO & ifp->if_capenable)
3944 				err = mxge_change_lro_locked(sc, 0);
3945 			else
3946 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3947 		}
3948 		if (mask & IFCAP_VLAN_HWTAGGING)
3949 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3950 		mtx_unlock(&sc->driver_mtx);
3951 		VLAN_CAPABILITIES(ifp);
3952 
3953 		break;
3954 
3955 	case SIOCGIFMEDIA:
3956 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3957 				    &sc->media, command);
3958 		break;
3959 
3960 	default:
3961 		err = ENOTTY;
3962 	}
3963 	return err;
3964 }
3965 
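/* fetch loader tunables and clamp them to sane values */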
3966 static void
3967 mxge_fetch_tunables(mxge_softc_t *sc)
3968 {
3969 
3970 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3971 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3972 			  &mxge_flow_control);
3973 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3974 			  &mxge_intr_coal_delay);
3975 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3976 			  &mxge_nvidia_ecrc_enable);
3977 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3978 			  &mxge_force_firmware);
3979 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3980 			  &mxge_deassert_wait);
3981 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3982 			  &mxge_verbose);
3983 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3984 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3985 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
3986 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
3987 	if (sc->lro_cnt != 0)
3988 		mxge_lro_cnt = sc->lro_cnt;
3989 
3990 	if (bootverbose)
3991 		mxge_verbose = 1;
3992 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3993 		mxge_intr_coal_delay = 30;
3994 	if (mxge_ticks == 0)
3995 		mxge_ticks = hz / 2;
3996 	sc->pause = mxge_flow_control;
3997 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
3998 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_SRC_PORT) {
3999 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4000 	}
4001 }
4002 
4003 
4004 static void
4005 mxge_free_slices(mxge_softc_t *sc)
4006 {
4007 	struct mxge_slice_state *ss;
4008 	int i;
4009 
4010 
4011 	if (sc->ss == NULL)
4012 		return;
4013 
4014 	for (i = 0; i < sc->num_slices; i++) {
4015 		ss = &sc->ss[i];
4016 		if (ss->fw_stats != NULL) {
4017 			mxge_dma_free(&ss->fw_stats_dma);
4018 			ss->fw_stats = NULL;
4019 #ifdef IFNET_BUF_RING
4020 			if (ss->tx.br != NULL) {
4021 				drbr_free(ss->tx.br, M_DEVBUF);
4022 				ss->tx.br = NULL;
4023 			}
4024 #endif
4025 			mtx_destroy(&ss->tx.mtx);
4026 		}
4027 		if (ss->rx_done.entry != NULL) {
4028 			mxge_dma_free(&ss->rx_done.dma);
4029 			ss->rx_done.entry = NULL;
4030 		}
4031 	}
4032 	free(sc->ss, M_DEVBUF);
4033 	sc->ss = NULL;
4034 }
4035 
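/*
 * allocate per-slice state: the rx completion queue for every
 * slice, plus the firmware stats block and tx lock (first slice
 * only, unless IFNET_BUF_RING puts a tx ring on each slice)
 */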
4036 static int
4037 mxge_alloc_slices(mxge_softc_t *sc)
4038 {
4039 	mxge_cmd_t cmd;
4040 	struct mxge_slice_state *ss;
4041 	size_t bytes;
4042 	int err, i, max_intr_slots;
4043 
4044 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4045 	if (err != 0) {
4046 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4047 		return err;
4048 	}
4049 	sc->rx_ring_size = cmd.data0;
4050 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4051 
4052 	bytes = sizeof (*sc->ss) * sc->num_slices;
4053 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4054 	if (sc->ss == NULL)
4055 		return (ENOMEM);
4056 	for (i = 0; i < sc->num_slices; i++) {
4057 		ss = &sc->ss[i];
4058 
4059 		ss->sc = sc;
4060 
4061 		/* allocate per-slice rx interrupt queues */
4062 
4063 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4064 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4065 		if (err != 0)
4066 			goto abort;
4067 		ss->rx_done.entry = ss->rx_done.dma.addr;
4068 		bzero(ss->rx_done.entry, bytes);
4069 
4070 		/*
4071 		 * allocate the per-slice firmware stats; stats
4072 		 * (including tx) are used only on the first
4073 		 * slice for now
4074 		 */
4075 #ifndef IFNET_BUF_RING
4076 		if (i > 0)
4077 			continue;
4078 #endif
4079 
4080 		bytes = sizeof (*ss->fw_stats);
4081 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4082 				     sizeof (*ss->fw_stats), 64);
4083 		if (err != 0)
4084 			goto abort;
4085 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4086 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4087 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4088 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4089 #ifdef IFNET_BUF_RING
4090 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4091 					   &ss->tx.mtx);
4092 #endif
4093 	}
4094 
4095 	return (0);
4096 
4097 abort:
4098 	mxge_free_slices(sc);
4099 	return (ENOMEM);
4100 }
4101 
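/*
 * decide how many slices to use based on the tunable, the CPU
 * and MSI-X vector counts, and what the RSS firmware supports;
 * fall back to one slice and the original firmware on any error
 */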
4102 static void
4103 mxge_slice_probe(mxge_softc_t *sc)
4104 {
4105 	mxge_cmd_t cmd;
4106 	char *old_fw;
4107 	int msix_cnt, status, max_intr_slots;
4108 
4109 	sc->num_slices = 1;
4110 	/*
4111 	 *  don't enable multiple slices if they are disabled via the
4112 	 *  tunable, or if this is not an SMP system
4113 	 */
4114 
4115 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4116 		return;
4117 
4118 	/* see how many MSI-X interrupts are available */
4119 	msix_cnt = pci_msix_count(sc->dev);
4120 	if (msix_cnt < 2)
4121 		return;
4122 
4123 	/* now load the slice-aware firmware and see what it supports */
4124 	old_fw = sc->fw_name;
4125 	if (old_fw == mxge_fw_aligned)
4126 		sc->fw_name = mxge_fw_rss_aligned;
4127 	else
4128 		sc->fw_name = mxge_fw_rss_unaligned;
4129 	status = mxge_load_firmware(sc, 0);
4130 	if (status != 0) {
4131 		device_printf(sc->dev, "Falling back to a single slice\n");
4132 		return;
4133 	}
4134 
4135 	/* try to send a reset command to the card to see if it
4136 	   is alive */
4137 	memset(&cmd, 0, sizeof (cmd));
4138 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4139 	if (status != 0) {
4140 		device_printf(sc->dev, "failed reset\n");
4141 		goto abort_with_fw;
4142 	}
4143 
4144 	/* get rx ring size */
4145 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4146 	if (status != 0) {
4147 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4148 		goto abort_with_fw;
4149 	}
4150 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4151 
4152 	/* tell it the size of the interrupt queues */
4153 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4154 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4155 	if (status != 0) {
4156 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4157 		goto abort_with_fw;
4158 	}
4159 
4160 	/* ask for the maximum number of slices it supports */
4161 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4162 	if (status != 0) {
4163 		device_printf(sc->dev,
4164 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4165 		goto abort_with_fw;
4166 	}
4167 	sc->num_slices = cmd.data0;
4168 	if (sc->num_slices > msix_cnt)
4169 		sc->num_slices = msix_cnt;
4170 
4171 	if (mxge_max_slices == -1) {
4172 		/* cap to number of CPUs in system */
4173 		if (sc->num_slices > mp_ncpus)
4174 			sc->num_slices = mp_ncpus;
4175 	} else {
4176 		if (sc->num_slices > mxge_max_slices)
4177 			sc->num_slices = mxge_max_slices;
4178 	}
4179 	/* make sure it is a power of two */
4180 	while (sc->num_slices & (sc->num_slices - 1))
4181 		sc->num_slices--;
4182 
4183 	if (mxge_verbose)
4184 		device_printf(sc->dev, "using %d slices\n",
4185 			      sc->num_slices);
4186 
4187 	return;
4188 
4189 abort_with_fw:
4190 	sc->fw_name = old_fw;
4191 	(void) mxge_load_firmware(sc, 0);
4192 }
4193 
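/*
 * allocate one MSI-X vector per slice and bind each one to the
 * interrupt handler with its slice state as argument
 */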
4194 static int
4195 mxge_add_msix_irqs(mxge_softc_t *sc)
4196 {
4197 	size_t bytes;
4198 	int count, err, i, rid;
4199 
4200 	rid = PCIR_BAR(2);
4201 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4202 						    &rid, RF_ACTIVE);
4203 
4204 	if (sc->msix_table_res == NULL) {
4205 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4206 		return ENXIO;
4207 	}
4208 
4209 	count = sc->num_slices;
4210 	err = pci_alloc_msix(sc->dev, &count);
4211 	if (err != 0) {
4212 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4213 			      "err = %d\n", sc->num_slices, err);
4214 		goto abort_with_msix_table;
4215 	}
4216 	if (count < sc->num_slices) {
4217 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4218 			      sc->num_slices, count);
4219 		device_printf(sc->dev,
4220 			      "Try setting hw.mxge.max_slices to %d\n",
4221 			      count);
4222 		err = ENOSPC;
4223 		goto abort_with_msix;
4224 	}
4225 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4226 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4227 	if (sc->msix_irq_res == NULL) {
4228 		err = ENOMEM;
4229 		goto abort_with_msix;
4230 	}
4231 
4232 	for (i = 0; i < sc->num_slices; i++) {
4233 		rid = i + 1;
4234 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4235 							  SYS_RES_IRQ,
4236 							  &rid, RF_ACTIVE);
4237 		if (sc->msix_irq_res[i] == NULL) {
4238 			device_printf(sc->dev, "couldn't allocate IRQ res"
4239 				      " for message %d\n", i);
4240 			err = ENXIO;
4241 			goto abort_with_res;
4242 		}
4243 	}
4244 
4245 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4246 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4247 
4248 	for (i = 0; i < sc->num_slices; i++) {
4249 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4250 				     INTR_TYPE_NET | INTR_MPSAFE,
4251 #if __FreeBSD_version > 700030
4252 				     NULL,
4253 #endif
4254 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4255 		if (err != 0) {
4256 			device_printf(sc->dev, "couldn't setup intr for "
4257 				      "message %d\n", i);
4258 			goto abort_with_intr;
4259 		}
4260 	}
4261 
4262 	if (mxge_verbose) {
4263 		device_printf(sc->dev, "using %d msix IRQs:",
4264 			      sc->num_slices);
4265 		for (i = 0; i < sc->num_slices; i++)
4266 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4267 		printf("\n");
4268 	}
4269 	return (0);
4270 
4271 abort_with_intr:
4272 	for (i = 0; i < sc->num_slices; i++) {
4273 		if (sc->msix_ih[i] != NULL) {
4274 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4275 					  sc->msix_ih[i]);
4276 			sc->msix_ih[i] = NULL;
4277 		}
4278 	}
4279 	free(sc->msix_ih, M_DEVBUF);
4280 
4281 
4282 abort_with_res:
4283 	for (i = 0; i < sc->num_slices; i++) {
4284 		rid = i + 1;
4285 		if (sc->msix_irq_res[i] != NULL)
4286 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4287 					     sc->msix_irq_res[i]);
4288 		sc->msix_irq_res[i] = NULL;
4289 	}
4290 	free(sc->msix_irq_res, M_DEVBUF);
4291 
4292 
4293 abort_with_msix:
4294 	pci_release_msi(sc->dev);
4295 
4296 abort_with_msix_table:
4297 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4298 			     sc->msix_table_res);
4299 
4300 	return err;
4301 }
4302 
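/* set up a single MSI (preferred) or legacy INTx interrupt */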
4303 static int
4304 mxge_add_single_irq(mxge_softc_t *sc)
4305 {
4306 	int count, err, rid;
4307 
4308 	count = pci_msi_count(sc->dev);
4309 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4310 		rid = 1;
4311 	} else {
4312 		rid = 0;
4313 		sc->legacy_irq = 1;
4314 	}
4315 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4316 					 1, RF_SHAREABLE | RF_ACTIVE);
4317 	if (sc->irq_res == NULL) {
4318 		device_printf(sc->dev, "could not alloc interrupt\n");
4319 		return ENXIO;
4320 	}
4321 	if (mxge_verbose)
4322 		device_printf(sc->dev, "using %s irq %ld\n",
4323 			      sc->legacy_irq ? "INTx" : "MSI",
4324 			      rman_get_start(sc->irq_res));
4325 	err = bus_setup_intr(sc->dev, sc->irq_res,
4326 			     INTR_TYPE_NET | INTR_MPSAFE,
4327 #if __FreeBSD_version > 700030
4328 			     NULL,
4329 #endif
4330 			     mxge_intr, &sc->ss[0], &sc->ih);
4331 	if (err != 0) {
4332 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4333 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4334 		if (!sc->legacy_irq)
4335 			pci_release_msi(sc->dev);
4336 	}
4337 	return err;
4338 }
4339 
4340 static void
4341 mxge_rem_msix_irqs(mxge_softc_t *sc)
4342 {
4343 	int i, rid;
4344 
4345 	for (i = 0; i < sc->num_slices; i++) {
4346 		if (sc->msix_ih[i] != NULL) {
4347 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4348 					  sc->msix_ih[i]);
4349 			sc->msix_ih[i] = NULL;
4350 		}
4351 	}
4352 	free(sc->msix_ih, M_DEVBUF);
4353 
4354 	for (i = 0; i < sc->num_slices; i++) {
4355 		rid = i + 1;
4356 		if (sc->msix_irq_res[i] != NULL)
4357 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4358 					     sc->msix_irq_res[i]);
4359 		sc->msix_irq_res[i] = NULL;
4360 	}
4361 	free(sc->msix_irq_res, M_DEVBUF);
4362 
4363 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4364 			     sc->msix_table_res);
4365 
4366 	pci_release_msi(sc->dev);
4367 	return;
4368 }
4369 
4370 static void
4371 mxge_rem_single_irq(mxge_softc_t *sc)
4372 {
4373 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4374 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4375 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4376 	if (!sc->legacy_irq)
4377 		pci_release_msi(sc->dev);
4378 }
4379 
4380 static void
4381 mxge_rem_irq(mxge_softc_t *sc)
4382 {
4383 	if (sc->num_slices > 1)
4384 		mxge_rem_msix_irqs(sc);
4385 	else
4386 		mxge_rem_single_irq(sc);
4387 }
4388 
4389 static int
4390 mxge_add_irq(mxge_softc_t *sc)
4391 {
4392 	int err;
4393 
4394 	if (sc->num_slices > 1)
4395 		err = mxge_add_msix_irqs(sc);
4396 	else
4397 		err = mxge_add_single_irq(sc);
4398 
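	/* NB: the "0 &&" intentionally disables this MSI-X retry */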
4399 	if (0 && err == 0 && sc->num_slices > 1) {
4400 		mxge_rem_msix_irqs(sc);
4401 		err = mxge_add_msix_irqs(sc);
4402 	}
4403 	return err;
4404 }
4405 
4406 
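/*
 * newbus attach: map the board, load firmware, allocate slices,
 * rings and interrupts, then attach the ifnet
 */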
4407 static int
4408 mxge_attach(device_t dev)
4409 {
4410 	mxge_softc_t *sc = device_get_softc(dev);
4411 	struct ifnet *ifp;
4412 	int err, rid;
4413 
4414 	sc->dev = dev;
4415 	mxge_fetch_tunables(sc);
4416 
4417 	err = bus_dma_tag_create(NULL,			/* parent */
4418 				 1,			/* alignment */
4419 				 0,			/* boundary */
4420 				 BUS_SPACE_MAXADDR,	/* low */
4421 				 BUS_SPACE_MAXADDR,	/* high */
4422 				 NULL, NULL,		/* filter */
4423 				 65536 + 256,		/* maxsize */
4424 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4425 				 65536,			/* maxsegsize */
4426 				 0,			/* flags */
4427 				 NULL, NULL,		/* lock */
4428 				 &sc->parent_dmat);	/* tag */
4429 
4430 	if (err != 0) {
4431 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4432 			      err);
4433 		goto abort_with_nothing;
4434 	}
4435 
4436 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4437 	if (ifp == NULL) {
4438 		device_printf(dev, "can not if_alloc()\n");
4439 		err = ENOSPC;
4440 		goto abort_with_parent_dmat;
4441 	}
4442 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4443 
4444 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4445 		 device_get_nameunit(dev));
4446 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4447 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4448 		 "%s:drv", device_get_nameunit(dev));
4449 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4450 		 MTX_NETWORK_LOCK, MTX_DEF);
4451 
4452 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4453 
4454 	mxge_setup_cfg_space(sc);
4455 
4456 	/* Map the board into the kernel */
4457 	rid = PCIR_BARS;
4458 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4459 					 ~0, 1, RF_ACTIVE);
4460 	if (sc->mem_res == NULL) {
4461 		device_printf(dev, "could not map memory\n");
4462 		err = ENXIO;
4463 		goto abort_with_lock;
4464 	}
4465 	sc->sram = rman_get_virtual(sc->mem_res);
4466 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4467 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4468 		device_printf(dev, "impossible memory region size %ld\n",
4469 			      rman_get_size(sc->mem_res));
4470 		err = ENXIO;
4471 		goto abort_with_mem_res;
4472 	}
4473 
4474 	/* make NULL terminated copy of the EEPROM strings section of
4475 	   lanai SRAM */
4476 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4477 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4478 				rman_get_bushandle(sc->mem_res),
4479 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4480 				sc->eeprom_strings,
4481 				MXGE_EEPROM_STRINGS_SIZE - 2);
4482 	err = mxge_parse_strings(sc);
4483 	if (err != 0)
4484 		goto abort_with_mem_res;
4485 
4486 	/* Enable write combining for efficient use of PCIe bus */
4487 	mxge_enable_wc(sc);
4488 
4489 	/* Allocate the out of band dma memory */
4490 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4491 			     sizeof (mxge_cmd_t), 64);
4492 	if (err != 0)
4493 		goto abort_with_mem_res;
4494 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4495 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4496 	if (err != 0)
4497 		goto abort_with_cmd_dma;
4498 
4499 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4500 	if (err != 0)
4501 		goto abort_with_zeropad_dma;
4502 
4503 	/* select & load the firmware */
4504 	err = mxge_select_firmware(sc);
4505 	if (err != 0)
4506 		goto abort_with_dmabench;
4507 	sc->intr_coal_delay = mxge_intr_coal_delay;
4508 
4509 	mxge_slice_probe(sc);
4510 	err = mxge_alloc_slices(sc);
4511 	if (err != 0)
4512 		goto abort_with_dmabench;
4513 
4514 	err = mxge_reset(sc, 0);
4515 	if (err != 0)
4516 		goto abort_with_slices;
4517 
4518 	err = mxge_alloc_rings(sc);
4519 	if (err != 0) {
4520 		device_printf(sc->dev, "failed to allocate rings\n");
4521 		goto abort_with_dmabench;
4522 	}
4523 
4524 	err = mxge_add_irq(sc);
4525 	if (err != 0) {
4526 		device_printf(sc->dev, "failed to add irq\n");
4527 		goto abort_with_rings;
4528 	}
4529 
4530 	ifp->if_baudrate = IF_Gbps(10UL);
4531 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4532 		IFCAP_VLAN_MTU | IFCAP_LRO;
4533 
4534 #ifdef MXGE_NEW_VLAN_API
4535 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4536 #endif
4537 
4538 	sc->max_mtu = mxge_max_mtu(sc);
4539 	if (sc->max_mtu >= 9000)
4540 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4541 	else
4542 		device_printf(dev, "MTU limited to %d.  Install "
4543 			      "latest firmware for 9000 byte jumbo support\n",
4544 			      sc->max_mtu - ETHER_HDR_LEN);
4545 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4546 	ifp->if_capenable = ifp->if_capabilities;
4547 	if (sc->lro_cnt == 0)
4548 		ifp->if_capenable &= ~IFCAP_LRO;
4549 	sc->csum_flag = 1;
4550 	ifp->if_init = mxge_init;
4551 	ifp->if_softc = sc;
4552 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4553 	ifp->if_ioctl = mxge_ioctl;
4554 	ifp->if_start = mxge_start;
4555 	/* Initialise the ifmedia structure */
4556 	ifmedia_init(&sc->media, 0, mxge_media_change,
4557 		     mxge_media_status);
4558 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4559 	mxge_media_probe(sc);
4560 	ether_ifattach(ifp, sc->mac_addr);
4561 	/* ether_ifattach sets mtu to 1500 */
4562 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
4563 		ifp->if_mtu = 9000;
4564 
4565 	mxge_add_sysctls(sc);
4566 #ifdef IFNET_BUF_RING
4567 	ifp->if_transmit = mxge_transmit;
4568 	ifp->if_qflush = mxge_qflush;
4569 #endif
4570 	return 0;
4571 
4572 abort_with_rings:
4573 	mxge_free_rings(sc);
4574 abort_with_slices:
4575 	mxge_free_slices(sc);
4576 abort_with_dmabench:
4577 	mxge_dma_free(&sc->dmabench_dma);
4578 abort_with_zeropad_dma:
4579 	mxge_dma_free(&sc->zeropad_dma);
4580 abort_with_cmd_dma:
4581 	mxge_dma_free(&sc->cmd_dma);
4582 abort_with_mem_res:
4583 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4584 abort_with_lock:
4585 	pci_disable_busmaster(dev);
4586 	mtx_destroy(&sc->cmd_mtx);
4587 	mtx_destroy(&sc->driver_mtx);
4588 	if_free(ifp);
4589 abort_with_parent_dmat:
4590 	bus_dma_tag_destroy(sc->parent_dmat);
4591 
4592 abort_with_nothing:
4593 	return err;
4594 }
4595 
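/* newbus detach: close the interface and undo mxge_attach */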
4596 static int
4597 mxge_detach(device_t dev)
4598 {
4599 	mxge_softc_t *sc = device_get_softc(dev);
4600 
4601 	if (mxge_vlans_active(sc)) {
4602 		device_printf(sc->dev,
4603 			      "Detach vlans before removing module\n");
4604 		return EBUSY;
4605 	}
4606 	mtx_lock(&sc->driver_mtx);
4607 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4608 		mxge_close(sc);
4609 	mtx_unlock(&sc->driver_mtx);
4610 	ether_ifdetach(sc->ifp);
4611 	callout_drain(&sc->co_hdl);
4612 	ifmedia_removeall(&sc->media);
4613 	mxge_dummy_rdma(sc, 0);
4614 	mxge_rem_sysctls(sc);
4615 	mxge_rem_irq(sc);
4616 	mxge_free_rings(sc);
4617 	mxge_free_slices(sc);
4618 	mxge_dma_free(&sc->dmabench_dma);
4619 	mxge_dma_free(&sc->zeropad_dma);
4620 	mxge_dma_free(&sc->cmd_dma);
4621 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4622 	pci_disable_busmaster(dev);
4623 	mtx_destroy(&sc->cmd_mtx);
4624 	mtx_destroy(&sc->driver_mtx);
4625 	if_free(sc->ifp);
4626 	bus_dma_tag_destroy(sc->parent_dmat);
4627 	return 0;
4628 }
4629 
4630 static int
4631 mxge_shutdown(device_t dev)
4632 {
4633 	return 0;
4634 }
4635 
4636 /*
4637   This file uses Myri10GE driver indentation.
4638 
4639   Local Variables:
4640   c-file-style:"linux"
4641   tab-width:8
4642   End:
4643 */
4644