xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 82431678fce5c893ef9c7418ad6d998ad4187de6)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #define IFNET_BUF_RING
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/linker.h>
38 #include <sys/firmware.h>
39 #include <sys/endian.h>
40 #include <sys/sockio.h>
41 #include <sys/mbuf.h>
42 #include <sys/malloc.h>
43 #include <sys/kdb.h>
44 #include <sys/kernel.h>
45 #include <sys/lock.h>
46 #include <sys/module.h>
47 #include <sys/socket.h>
48 #include <sys/sysctl.h>
49 #include <sys/sx.h>
50 
51 #include <net/if.h>
52 #include <net/if_arp.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56 
57 #include <net/bpf.h>
58 
59 #include <net/if_types.h>
60 #include <net/if_vlan_var.h>
61 #include <net/zlib.h>
62 
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
67 
68 #include <machine/bus.h>
69 #include <machine/in_cksum.h>
70 #include <machine/resource.h>
71 #ifdef IFNET_BUF_RING
72 #include <sys/buf_ring.h>
73 #endif
74 #include <sys/bus.h>
75 #include <sys/rman.h>
76 #include <sys/smp.h>
77 
78 #include <dev/pci/pcireg.h>
79 #include <dev/pci/pcivar.h>
80 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
81 
82 #include <vm/vm.h>		/* for pmap_mapdev() */
83 #include <vm/pmap.h>
84 
85 #if defined(__i386) || defined(__amd64)
86 #include <machine/specialreg.h>
87 #endif
88 
89 #include <dev/mxge/mxge_mcp.h>
90 #include <dev/mxge/mcp_gen_header.h>
91 /*#define MXGE_FAKE_IFP*/
92 #include <dev/mxge/if_mxge_var.h>
93 
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on Nvidia bridges */
static int mxge_force_firmware = 0;	/* 0 = probe, 1 = force aligned fw, other = unaligned */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay; units not visible here */
static int mxge_deassert_wait = 1;	/* NOTE(review): semantics not visible in this chunk */
static int mxge_flow_control = 1;	/* default pause-frame setting */
static int mxge_verbose = 0;		/* extra diagnostics when non-zero */
static int mxge_lro_cnt = 8;		/* presumably LRO merge limit -- confirm against rx path */
static int mxge_ticks;			/* presumably watchdog callout period -- confirm */
static int mxge_max_slices = 1;		/* max rx slices/rings */
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;	/* force promiscuous mode when non-zero */
/* firmware image names; the "p" (ethp) variants tolerate unaligned
   PCIe completions (see mxge_select_firmware) */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

/* newbus device methods */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

/* forward declarations used before their definitions */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
146 
147 static int
148 mxge_probe(device_t dev)
149 {
150 	int rev;
151 
152 
153 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
154 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
155 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
156 		rev = pci_get_revid(dev);
157 		switch (rev) {
158 		case MXGE_PCI_REV_Z8E:
159 			device_set_desc(dev, "Myri10G-PCIE-8A");
160 			break;
161 		case MXGE_PCI_REV_Z8ES:
162 			device_set_desc(dev, "Myri10G-PCIE-8B");
163 			break;
164 		default:
165 			device_set_desc(dev, "Myri10G-PCIE-8??");
166 			device_printf(dev, "Unrecognized rev %d NIC\n",
167 				      rev);
168 			break;
169 		}
170 		return 0;
171 	}
172 	return ENXIO;
173 }
174 
/*
 * Mark the NIC's SRAM window write-combining on x86/amd64 so PIO
 * copies to the board can be burst.  sc->wc records whether the
 * attribute change succeeded.  No-op on other architectures.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}
193 
194 
195 /* callback to get our DMA address */
196 static void
197 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
198 			 int error)
199 {
200 	if (error == 0) {
201 		*(bus_addr_t *) arg = segs->ds_addr;
202 	}
203 }
204 
/*
 * Allocate, map and load a single-segment DMA buffer of `bytes'
 * bytes, filling in `dma' (tag, map, kernel address, bus address).
 * Returns 0 on success or a bus_dma error code; on failure all
 * partially acquired resources are released (goto-cleanup).
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	/* A page-aligned request bigger than a page cannot honor a 4KB
	   boundary; drop the boundary constraint and allow one large
	   segment.  Everything else stays within a single 4KB page. */
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; the callback stores the bus address */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
264 
265 
/*
 * Tear down a buffer created by mxge_dma_alloc(): unload, free,
 * destroy -- the reverse of the acquisition order.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
273 
274 /*
275  * The eeprom strings on the lanaiX have the format
276  * SN=x\0
277  * MAC=x:x:x:x:x:x\0
278  * PC=text\0
279  */
280 
281 static int
282 mxge_parse_strings(mxge_softc_t *sc)
283 {
284 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
285 
286 	char *ptr, *limit;
287 	int i, found_mac;
288 
289 	ptr = sc->eeprom_strings;
290 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
291 	found_mac = 0;
292 	while (ptr < limit && *ptr != '\0') {
293 		if (memcmp(ptr, "MAC=", 4) == 0) {
294 			ptr += 1;
295 			sc->mac_addr_string = ptr;
296 			for (i = 0; i < 6; i++) {
297 				ptr += 3;
298 				if ((ptr + 2) > limit)
299 					goto abort;
300 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
301 				found_mac = 1;
302 			}
303 		} else if (memcmp(ptr, "PC=", 3) == 0) {
304 			ptr += 3;
305 			strncpy(sc->product_code_string, ptr,
306 				sizeof (sc->product_code_string) - 1);
307 		} else if (memcmp(ptr, "SN=", 3) == 0) {
308 			ptr += 3;
309 			strncpy(sc->serial_number_string, ptr,
310 				sizeof (sc->serial_number_string) - 1);
311 		}
312 		MXGE_NEXT_STRING(ptr);
313 	}
314 
315 	if (found_mac)
316 		return 0;
317 
318  abort:
319 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
320 
321 	return ENXIO;
322 }
323 
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Enable PCIe ECRC generation on an upstream Nvidia (ck804/mcp55)
 * bridge so that completions arrive 8-byte aligned (see the comment
 * block preceding mxge_firmware_probe()).  The enable bit lives in
 * extended config space (offset 0x178), which this code reaches by
 * mapping the chipset's memory-mapped config window directly.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* the candidate bridge is the grandparent of the NIC device */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)	/* Nvidia vendor ID only */
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of the bridge's extended config space:
	   1MB per bus, 4KB per (slot,function) */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	/* the ECRC enable bit is 0x40 in the register at 0x178 */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
/* non-x86 build: the Nvidia chipset workaround cannot apply */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
457 
458 
/*
 * Run the firmware's built-in DMA benchmark for read, write, and
 * read+write transfers, storing the resulting bandwidth figures in
 * sc->read_dma, sc->write_dma and sc->read_write_dma.  When
 * test_type is MXGEFW_CMD_UNALIGNED_TEST the firmware instead
 * aborts on the first unaligned completion (used by
 * mxge_firmware_probe); benchmark-failure messages are suppressed
 * for that case.
 */
static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;		/* read test */
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	/* (transfers * bytes-each * 2) / elapsed half-us ticks */
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;			/* write test */
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;		/* read+write test */
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	/* extra *2: read+write moves data in both directions */
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
519 
520 /*
521  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
522  * when the PCI-E Completion packets are aligned on an 8-byte
523  * boundary.  Some PCI-E chip sets always align Completion packets; on
524  * the ones that do not, the alignment can be enforced by enabling
525  * ECRC generation (if supported).
526  *
527  * When PCI-E Completion packets are not aligned, it is actually more
528  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
529  *
530  * If the driver can neither enable ECRC nor verify that it has
531  * already been enabled, then it must use a firmware image which works
532  * around unaligned completion packets (ethp_z8e.dat), and it should
533  * also ensure that it never gives the device a Read-DMA which is
534  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
535  * enabled, then the driver should use the aligned (eth_z8e.dat)
536  * firmware image, and set tx_boundary to 4KB.
537  */
538 
539 static int
540 mxge_firmware_probe(mxge_softc_t *sc)
541 {
542 	device_t dev = sc->dev;
543 	int reg, status;
544 	uint16_t pectl;
545 
546 	sc->tx_boundary = 4096;
547 	/*
548 	 * Verify the max read request size was set to 4KB
549 	 * before trying the test with 4KB.
550 	 */
551 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
552 		pectl = pci_read_config(dev, reg + 0x8, 2);
553 		if ((pectl & (5 << 12)) != (5 << 12)) {
554 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
555 				      pectl);
556 			sc->tx_boundary = 2048;
557 		}
558 	}
559 
560 	/*
561 	 * load the optimized firmware (which assumes aligned PCIe
562 	 * completions) in order to see if it works on this host.
563 	 */
564 	sc->fw_name = mxge_fw_aligned;
565 	status = mxge_load_firmware(sc, 1);
566 	if (status != 0) {
567 		return status;
568 	}
569 
570 	/*
571 	 * Enable ECRC if possible
572 	 */
573 	mxge_enable_nvidia_ecrc(sc);
574 
575 	/*
576 	 * Run a DMA test which watches for unaligned completions and
577 	 * aborts on the first one seen.
578 	 */
579 
580 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
581 	if (status == 0)
582 		return 0; /* keep the aligned firmware */
583 
584 	if (status != E2BIG)
585 		device_printf(dev, "DMA test failed: %d\n", status);
586 	if (status == ENOSYS)
587 		device_printf(dev, "Falling back to ethp! "
588 			      "Please install up to date fw\n");
589 	return status;
590 }
591 
592 static int
593 mxge_select_firmware(mxge_softc_t *sc)
594 {
595 	int aligned = 0;
596 
597 
598 	if (mxge_force_firmware != 0) {
599 		if (mxge_force_firmware == 1)
600 			aligned = 1;
601 		else
602 			aligned = 0;
603 		if (mxge_verbose)
604 			device_printf(sc->dev,
605 				      "Assuming %s completions (forced)\n",
606 				      aligned ? "aligned" : "unaligned");
607 		goto abort;
608 	}
609 
610 	/* if the PCIe link width is 4 or less, we can use the aligned
611 	   firmware and skip any checks */
612 	if (sc->link_width != 0 && sc->link_width <= 4) {
613 		device_printf(sc->dev,
614 			      "PCIe x%d Link, expect reduced performance\n",
615 			      sc->link_width);
616 		aligned = 1;
617 		goto abort;
618 	}
619 
620 	if (0 == mxge_firmware_probe(sc))
621 		return 0;
622 
623 abort:
624 	if (aligned) {
625 		sc->fw_name = mxge_fw_aligned;
626 		sc->tx_boundary = 4096;
627 	} else {
628 		sc->fw_name = mxge_fw_unaligned;
629 		sc->tx_boundary = 2048;
630 	}
631 	return (mxge_load_firmware(sc, 0));
632 }
633 
/*
 * Union used to shed a const qualifier from a string pointer without
 * an explicit cast.  NOTE(review): no use is visible in this part of
 * the file -- presumably used further down; verify before removing.
 */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
639 
640 static int
641 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
642 {
643 
644 
645 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
646 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
647 			      be32toh(hdr->mcp_type));
648 		return EIO;
649 	}
650 
651 	/* save firmware version for sysctl */
652 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
653 	if (mxge_verbose)
654 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
655 
656 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
657 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
658 
659 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
660 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
661 		device_printf(sc->dev, "Found firmware version %s\n",
662 			      sc->fw_version);
663 		device_printf(sc->dev, "Driver needs %d.%d\n",
664 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
665 		return EINVAL;
666 	}
667 	return 0;
668 
669 }
670 
671 static void *
672 z_alloc(void *nil, u_int items, u_int size)
673 {
674         void *ptr;
675 
676         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
677         return ptr;
678 }
679 
680 static void
681 z_free(void *nil, void *ptr)
682 {
683         free(ptr, M_TEMP);
684 }
685 
686 
687 static int
688 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
689 {
690 	z_stream zs;
691 	char *inflate_buffer;
692 	const struct firmware *fw;
693 	const mcp_gen_header_t *hdr;
694 	unsigned hdr_offset;
695 	int status;
696 	unsigned int i;
697 	char dummy;
698 	size_t fw_len;
699 
700 	fw = firmware_get(sc->fw_name);
701 	if (fw == NULL) {
702 		device_printf(sc->dev, "Could not find firmware image %s\n",
703 			      sc->fw_name);
704 		return ENOENT;
705 	}
706 
707 
708 
709 	/* setup zlib and decompress f/w */
710 	bzero(&zs, sizeof (zs));
711 	zs.zalloc = z_alloc;
712 	zs.zfree = z_free;
713 	status = inflateInit(&zs);
714 	if (status != Z_OK) {
715 		status = EIO;
716 		goto abort_with_fw;
717 	}
718 
719 	/* the uncompressed size is stored as the firmware version,
720 	   which would otherwise go unused */
721 	fw_len = (size_t) fw->version;
722 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
723 	if (inflate_buffer == NULL)
724 		goto abort_with_zs;
725 	zs.avail_in = fw->datasize;
726 	zs.next_in = __DECONST(char *, fw->data);
727 	zs.avail_out = fw_len;
728 	zs.next_out = inflate_buffer;
729 	status = inflate(&zs, Z_FINISH);
730 	if (status != Z_STREAM_END) {
731 		device_printf(sc->dev, "zlib %d\n", status);
732 		status = EIO;
733 		goto abort_with_buffer;
734 	}
735 
736 	/* check id */
737 	hdr_offset = htobe32(*(const uint32_t *)
738 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
739 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
740 		device_printf(sc->dev, "Bad firmware file");
741 		status = EIO;
742 		goto abort_with_buffer;
743 	}
744 	hdr = (const void*)(inflate_buffer + hdr_offset);
745 
746 	status = mxge_validate_firmware(sc, hdr);
747 	if (status != 0)
748 		goto abort_with_buffer;
749 
750 	/* Copy the inflated firmware to NIC SRAM. */
751 	for (i = 0; i < fw_len; i += 256) {
752 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
753 			      inflate_buffer + i,
754 			      min(256U, (unsigned)(fw_len - i)));
755 		wmb();
756 		dummy = *sc->sram;
757 		wmb();
758 	}
759 
760 	*limit = fw_len;
761 	status = 0;
762 abort_with_buffer:
763 	free(inflate_buffer, M_TEMP);
764 abort_with_zs:
765 	inflateEnd(&zs);
766 abort_with_fw:
767 	firmware_put(fw, FIRMWARE_UNLOAD);
768 	return status;
769 }
770 
771 /*
772  * Enable or disable periodic RDMAs from the host to make certain
773  * chipsets resend dropped PCIe messages
774  */
775 
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align the stack buffer to 8 bytes for the PIO copy below */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll the DMA'ed confirmation word for up to ~20ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
827 
828 static int
829 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
830 {
831 	mcp_cmd_t *buf;
832 	char buf_bytes[sizeof(*buf) + 8];
833 	volatile mcp_cmd_response_t *response = sc->cmd;
834 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
835 	uint32_t dma_low, dma_high;
836 	int err, sleep_total = 0;
837 
838 	/* ensure buf is aligned to 8 bytes */
839 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
840 
841 	buf->data0 = htobe32(data->data0);
842 	buf->data1 = htobe32(data->data1);
843 	buf->data2 = htobe32(data->data2);
844 	buf->cmd = htobe32(cmd);
845 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
846 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
847 
848 	buf->response_addr.low = htobe32(dma_low);
849 	buf->response_addr.high = htobe32(dma_high);
850 	mtx_lock(&sc->cmd_mtx);
851 	response->result = 0xffffffff;
852 	wmb();
853 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
854 
855 	/* wait up to 20ms */
856 	err = EAGAIN;
857 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
858 		bus_dmamap_sync(sc->cmd_dma.dmat,
859 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
860 		wmb();
861 		switch (be32toh(response->result)) {
862 		case 0:
863 			data->data0 = be32toh(response->data);
864 			err = 0;
865 			break;
866 		case 0xffffffff:
867 			DELAY(1000);
868 			break;
869 		case MXGEFW_CMD_UNKNOWN:
870 			err = ENOSYS;
871 			break;
872 		case MXGEFW_CMD_ERROR_UNALIGNED:
873 			err = E2BIG;
874 			break;
875 		case MXGEFW_CMD_ERROR_BUSY:
876 			err = EBUSY;
877 			break;
878 		default:
879 			device_printf(sc->dev,
880 				      "mxge: command %d "
881 				      "failed, result = %d\n",
882 				      cmd, be32toh(response->result));
883 			err = ENXIO;
884 			break;
885 		}
886 		if (err != EAGAIN)
887 			break;
888 	}
889 	if (err == EAGAIN)
890 		device_printf(sc->dev, "mxge: command %d timed out"
891 			      "result = %d\n",
892 			      cmd, be32toh(response->result));
893 	mtx_unlock(&sc->cmd_mtx);
894 	return err;
895 }
896 
/*
 * Validate the firmware already running on the NIC (e.g. left by a
 * previous driver instance) so it can be reused when a fresh image
 * cannot be loaded.  Also flags known-buggy 1.4.4-1.4.11 versions
 * whose rx filter requires the ALLMULTI workaround.  Returns 0 if
 * the running firmware is usable.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header; the pointer in SRAM is stored
	   big-endian (htobe32 and be32toh perform the same swap) */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
945 
946 
/*
 * Download the firmware named by sc->fw_name and hand control off to
 * it via the bootstrap MCP.  If the download fails and `adopt' is
 * set, fall back to validating and reusing the firmware already
 * running on the NIC (forcing the conservative 2KB tx_boundary).
 * Returns 0 on success, an errno or ENXIO (handoff timeout).
 */
static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align the stack buffer to 8 bytes for the PIO copy below */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll the handoff confirmation word for up to ~200ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
1031 
1032 static int
1033 mxge_update_mac_address(mxge_softc_t *sc)
1034 {
1035 	mxge_cmd_t cmd;
1036 	uint8_t *addr = sc->mac_addr;
1037 	int status;
1038 
1039 
1040 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1041 		     | (addr[2] << 8) | addr[3]);
1042 
1043 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1044 
1045 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1046 	return status;
1047 }
1048 
1049 static int
1050 mxge_change_pause(mxge_softc_t *sc, int pause)
1051 {
1052 	mxge_cmd_t cmd;
1053 	int status;
1054 
1055 	if (pause)
1056 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1057 				       &cmd);
1058 	else
1059 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1060 				       &cmd);
1061 
1062 	if (status) {
1063 		device_printf(sc->dev, "Failed to set flow control mode\n");
1064 		return ENXIO;
1065 	}
1066 	sc->pause = pause;
1067 	return 0;
1068 }
1069 
1070 static void
1071 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1072 {
1073 	mxge_cmd_t cmd;
1074 	int status;
1075 
1076 	if (mxge_always_promisc)
1077 		promisc = 1;
1078 
1079 	if (promisc)
1080 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1081 				       &cmd);
1082 	else
1083 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1084 				       &cmd);
1085 
1086 	if (status) {
1087 		device_printf(sc->dev, "Failed to set promisc mode\n");
1088 	}
1089 }
1090 
/*
 * Synchronize the firmware's multicast filter with the interface's
 * multicast address list.  On any firmware-command failure the filter
 * is left in ALLMULTI (pass-everything) mode, which is safe but
 * unfiltered.  Called with the interface's state already settled;
 * takes the ifnet address lock only while walking the list.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	/* Firmware with the rx-filter bug must stay in ALLMULTI mode;
	   per-group joins are not trustworthy on it. */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* Split the 6-byte link-level address across the two
		   command words: 4 bytes into data0, 2 into data1. */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		/* firmware expects the address words in network order */
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
1158 
1159 static int
1160 mxge_max_mtu(mxge_softc_t *sc)
1161 {
1162 	mxge_cmd_t cmd;
1163 	int status;
1164 
1165 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1166 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1167 
1168 	/* try to set nbufs to see if it we can
1169 	   use virtually contiguous jumbos */
1170 	cmd.data0 = 0;
1171 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1172 			       &cmd);
1173 	if (status == 0)
1174 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1175 
1176 	/* otherwise, we're limited to MJUMPAGESIZE */
1177 	return MJUMPAGESIZE - MXGEFW_PAD;
1178 }
1179 
/*
 * Reset the NIC firmware and bring it back to a known state: issue
 * the reset command, restart the dummy RDMAs, size (and optionally
 * re-arm) the per-slice interrupt queues, fetch the firmware control
 * register offsets, run a DMA benchmark, clear all driver/firmware
 * shared counters, and re-apply MAC/promisc/pause/multicast state
 * that the reset discarded.  Returns 0 or a command error.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	/* re-enable the dummy RDMAs the firmware uses to keep its
	   DMA engine warm */
	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		/* with buf_ring we also run one tx queue per slice */
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	/* fetch the SRAM offsets of the firmware's interrupt
	   control registers; errors are accumulated in status and
	   checked once below */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}

	/* program the current coalescing delay (register is
	   big-endian) */
	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		/* each slice's claim register is two words apart */
		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			ss->fw_stats->valid = 0;
			ss->fw_stats->send_done_count = 0;
		}
	}
	sc->rdma_tags_available = 15;
	/* the reset discarded addressing/filtering state; restore it */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}
1316 
1317 static int
1318 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1319 {
1320         mxge_softc_t *sc;
1321         unsigned int intr_coal_delay;
1322         int err;
1323 
1324         sc = arg1;
1325         intr_coal_delay = sc->intr_coal_delay;
1326         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1327         if (err != 0) {
1328                 return err;
1329         }
1330         if (intr_coal_delay == sc->intr_coal_delay)
1331                 return 0;
1332 
1333         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1334                 return EINVAL;
1335 
1336 	mtx_lock(&sc->driver_mtx);
1337 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1338 	sc->intr_coal_delay = intr_coal_delay;
1339 
1340 	mtx_unlock(&sc->driver_mtx);
1341         return err;
1342 }
1343 
1344 static int
1345 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1346 {
1347         mxge_softc_t *sc;
1348         unsigned int enabled;
1349         int err;
1350 
1351         sc = arg1;
1352         enabled = sc->pause;
1353         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1354         if (err != 0) {
1355                 return err;
1356         }
1357         if (enabled == sc->pause)
1358                 return 0;
1359 
1360 	mtx_lock(&sc->driver_mtx);
1361 	err = mxge_change_pause(sc, enabled);
1362 	mtx_unlock(&sc->driver_mtx);
1363         return err;
1364 }
1365 
1366 static int
1367 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1368 {
1369 	struct ifnet *ifp;
1370 	int err = 0;
1371 
1372 	ifp = sc->ifp;
1373 	if (lro_cnt == 0)
1374 		ifp->if_capenable &= ~IFCAP_LRO;
1375 	else
1376 		ifp->if_capenable |= IFCAP_LRO;
1377 	sc->lro_cnt = lro_cnt;
1378 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1379 		mxge_close(sc);
1380 		err = mxge_open(sc);
1381 	}
1382 	return err;
1383 }
1384 
1385 static int
1386 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1387 {
1388 	mxge_softc_t *sc;
1389 	unsigned int lro_cnt;
1390 	int err;
1391 
1392 	sc = arg1;
1393 	lro_cnt = sc->lro_cnt;
1394 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1395 	if (err != 0)
1396 		return err;
1397 
1398 	if (lro_cnt == sc->lro_cnt)
1399 		return 0;
1400 
1401 	if (lro_cnt > 128)
1402 		return EINVAL;
1403 
1404 	mtx_lock(&sc->driver_mtx);
1405 	err = mxge_change_lro_locked(sc, lro_cnt);
1406 	mtx_unlock(&sc->driver_mtx);
1407 	return err;
1408 }
1409 
1410 static int
1411 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1412 {
1413         int err;
1414 
1415         if (arg1 == NULL)
1416                 return EFAULT;
1417         arg2 = be32toh(*(int *)arg1);
1418         arg1 = NULL;
1419         err = sysctl_handle_int(oidp, arg1, arg2, req);
1420 
1421         return err;
1422 }
1423 
1424 static void
1425 mxge_rem_sysctls(mxge_softc_t *sc)
1426 {
1427 	struct mxge_slice_state *ss;
1428 	int slice;
1429 
1430 	if (sc->slice_sysctl_tree == NULL)
1431 		return;
1432 
1433 	for (slice = 0; slice < sc->num_slices; slice++) {
1434 		ss = &sc->ss[slice];
1435 		if (ss == NULL || ss->sysctl_tree == NULL)
1436 			continue;
1437 		sysctl_ctx_free(&ss->sysctl_ctx);
1438 		ss->sysctl_tree = NULL;
1439 	}
1440 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1441 	sc->slice_sysctl_tree = NULL;
1442 }
1443 
1444 static void
1445 mxge_add_sysctls(mxge_softc_t *sc)
1446 {
1447 	struct sysctl_ctx_list *ctx;
1448 	struct sysctl_oid_list *children;
1449 	mcp_irq_data_t *fw;
1450 	struct mxge_slice_state *ss;
1451 	int slice;
1452 	char slice_num[8];
1453 
1454 	ctx = device_get_sysctl_ctx(sc->dev);
1455 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1456 	fw = sc->ss[0].fw_stats;
1457 
1458 	/* random information */
1459 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1460 		       "firmware_version",
1461 		       CTLFLAG_RD, &sc->fw_version,
1462 		       0, "firmware version");
1463 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1464 		       "serial_number",
1465 		       CTLFLAG_RD, &sc->serial_number_string,
1466 		       0, "serial number");
1467 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1468 		       "product_code",
1469 		       CTLFLAG_RD, &sc->product_code_string,
1470 		       0, "product_code");
1471 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1472 		       "pcie_link_width",
1473 		       CTLFLAG_RD, &sc->link_width,
1474 		       0, "tx_boundary");
1475 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1476 		       "tx_boundary",
1477 		       CTLFLAG_RD, &sc->tx_boundary,
1478 		       0, "tx_boundary");
1479 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1480 		       "write_combine",
1481 		       CTLFLAG_RD, &sc->wc,
1482 		       0, "write combining PIO?");
1483 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1484 		       "read_dma_MBs",
1485 		       CTLFLAG_RD, &sc->read_dma,
1486 		       0, "DMA Read speed in MB/s");
1487 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1488 		       "write_dma_MBs",
1489 		       CTLFLAG_RD, &sc->write_dma,
1490 		       0, "DMA Write speed in MB/s");
1491 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1492 		       "read_write_dma_MBs",
1493 		       CTLFLAG_RD, &sc->read_write_dma,
1494 		       0, "DMA concurrent Read/Write speed in MB/s");
1495 
1496 
1497 	/* performance related tunables */
1498 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1499 			"intr_coal_delay",
1500 			CTLTYPE_INT|CTLFLAG_RW, sc,
1501 			0, mxge_change_intr_coal,
1502 			"I", "interrupt coalescing delay in usecs");
1503 
1504 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1505 			"flow_control_enabled",
1506 			CTLTYPE_INT|CTLFLAG_RW, sc,
1507 			0, mxge_change_flow_control,
1508 			"I", "interrupt coalescing delay in usecs");
1509 
1510 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1511 		       "deassert_wait",
1512 		       CTLFLAG_RW, &mxge_deassert_wait,
1513 		       0, "Wait for IRQ line to go low in ihandler");
1514 
1515 	/* stats block from firmware is in network byte order.
1516 	   Need to swap it */
1517 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1518 			"link_up",
1519 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1520 			0, mxge_handle_be32,
1521 			"I", "link up");
1522 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1523 			"rdma_tags_available",
1524 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1525 			0, mxge_handle_be32,
1526 			"I", "rdma_tags_available");
1527 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1528 			"dropped_bad_crc32",
1529 			CTLTYPE_INT|CTLFLAG_RD,
1530 			&fw->dropped_bad_crc32,
1531 			0, mxge_handle_be32,
1532 			"I", "dropped_bad_crc32");
1533 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1534 			"dropped_bad_phy",
1535 			CTLTYPE_INT|CTLFLAG_RD,
1536 			&fw->dropped_bad_phy,
1537 			0, mxge_handle_be32,
1538 			"I", "dropped_bad_phy");
1539 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1540 			"dropped_link_error_or_filtered",
1541 			CTLTYPE_INT|CTLFLAG_RD,
1542 			&fw->dropped_link_error_or_filtered,
1543 			0, mxge_handle_be32,
1544 			"I", "dropped_link_error_or_filtered");
1545 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 			"dropped_link_overflow",
1547 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1548 			0, mxge_handle_be32,
1549 			"I", "dropped_link_overflow");
1550 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1551 			"dropped_multicast_filtered",
1552 			CTLTYPE_INT|CTLFLAG_RD,
1553 			&fw->dropped_multicast_filtered,
1554 			0, mxge_handle_be32,
1555 			"I", "dropped_multicast_filtered");
1556 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557 			"dropped_no_big_buffer",
1558 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1559 			0, mxge_handle_be32,
1560 			"I", "dropped_no_big_buffer");
1561 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1562 			"dropped_no_small_buffer",
1563 			CTLTYPE_INT|CTLFLAG_RD,
1564 			&fw->dropped_no_small_buffer,
1565 			0, mxge_handle_be32,
1566 			"I", "dropped_no_small_buffer");
1567 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1568 			"dropped_overrun",
1569 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1570 			0, mxge_handle_be32,
1571 			"I", "dropped_overrun");
1572 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1573 			"dropped_pause",
1574 			CTLTYPE_INT|CTLFLAG_RD,
1575 			&fw->dropped_pause,
1576 			0, mxge_handle_be32,
1577 			"I", "dropped_pause");
1578 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579 			"dropped_runt",
1580 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1581 			0, mxge_handle_be32,
1582 			"I", "dropped_runt");
1583 
1584 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1585 			"dropped_unicast_filtered",
1586 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1587 			0, mxge_handle_be32,
1588 			"I", "dropped_unicast_filtered");
1589 
1590 	/* verbose printing? */
1591 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1592 		       "verbose",
1593 		       CTLFLAG_RW, &mxge_verbose,
1594 		       0, "verbose printing");
1595 
1596 	/* lro */
1597 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598 			"lro_cnt",
1599 			CTLTYPE_INT|CTLFLAG_RW, sc,
1600 			0, mxge_change_lro,
1601 			"I", "number of lro merge queues");
1602 
1603 
1604 	/* add counters exported for debugging from all slices */
1605 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1606 	sc->slice_sysctl_tree =
1607 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1608 				"slice", CTLFLAG_RD, 0, "");
1609 
1610 	for (slice = 0; slice < sc->num_slices; slice++) {
1611 		ss = &sc->ss[slice];
1612 		sysctl_ctx_init(&ss->sysctl_ctx);
1613 		ctx = &ss->sysctl_ctx;
1614 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1615 		sprintf(slice_num, "%d", slice);
1616 		ss->sysctl_tree =
1617 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1618 					CTLFLAG_RD, 0, "");
1619 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1620 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1621 			       "rx_small_cnt",
1622 			       CTLFLAG_RD, &ss->rx_small.cnt,
1623 			       0, "rx_small_cnt");
1624 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1625 			       "rx_big_cnt",
1626 			       CTLFLAG_RD, &ss->rx_big.cnt,
1627 			       0, "rx_small_cnt");
1628 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1629 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1630 			       0, "number of lro merge queues flushed");
1631 
1632 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1634 			       0, "number of frames appended to lro merge"
1635 			       "queues");
1636 
1637 #ifndef IFNET_BUF_RING
1638 		/* only transmit from slice 0 for now */
1639 		if (slice > 0)
1640 			continue;
1641 #endif
1642 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1643 			       "tx_req",
1644 			       CTLFLAG_RD, &ss->tx.req,
1645 			       0, "tx_req");
1646 
1647 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1648 			       "tx_done",
1649 			       CTLFLAG_RD, &ss->tx.done,
1650 			       0, "tx_done");
1651 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1652 			       "tx_pkt_done",
1653 			       CTLFLAG_RD, &ss->tx.pkt_done,
1654 			       0, "tx_done");
1655 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1656 			       "tx_stall",
1657 			       CTLFLAG_RD, &ss->tx.stall,
1658 			       0, "tx_stall");
1659 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1660 			       "tx_wake",
1661 			       CTLFLAG_RD, &ss->tx.wake,
1662 			       0, "tx_wake");
1663 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 			       "tx_defrag",
1665 			       CTLFLAG_RD, &ss->tx.defrag,
1666 			       0, "tx_defrag");
1667 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 			       "tx_queue_active",
1669 			       CTLFLAG_RD, &ss->tx.queue_active,
1670 			       0, "tx_queue_active");
1671 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 			       "tx_activate",
1673 			       CTLFLAG_RD, &ss->tx.activate,
1674 			       0, "tx_activate");
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "tx_deactivate",
1677 			       CTLFLAG_RD, &ss->tx.deactivate,
1678 			       0, "tx_deactivate");
1679 	}
1680 }
1681 
1682 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1683    backwards one at a time and handle ring wraps */
1684 
static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;

	/* NB: the loop stops at cnt == 1, deliberately leaving the
	   first request unwritten; the caller (mxge_submit_req)
	   submits it last so the NIC never sees a valid chain head
	   before the rest of the chain is in place. */
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask; /* handle ring wrap */
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                wmb();	/* keep each PIO write ordered toward the NIC */
        }
}
1699 
1700 /*
1701  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1702  * at most 32 bytes at a time, so as to avoid involving the software
1703  * pio handler in the nic.   We re-write the first segment's flags
1704  * to mark them valid only after writing the entire chain
1705  */
1706 
static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* Stash and clear the first request's flags so the NIC
	   ignores the whole chain until we re-write them at the
	   very end. */
	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: stream requests forward, two (32
		   bytes) per PIO burst */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request (or the odd trailing one,
		   in the non-wrapped odd-count case) */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        wmb();
}
1755 
1756 #if IFCAP_TSO4
1757 
1758 static void
1759 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1760 	       int busdma_seg_cnt, int ip_off)
1761 {
1762 	mxge_tx_ring_t *tx;
1763 	mcp_kreq_ether_send_t *req;
1764 	bus_dma_segment_t *seg;
1765 	struct ip *ip;
1766 	struct tcphdr *tcp;
1767 	uint32_t low, high_swapped;
1768 	int len, seglen, cum_len, cum_len_next;
1769 	int next_is_first, chop, cnt, rdma_count, small;
1770 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1771 	uint8_t flags, flags_next;
1772 	static int once;
1773 
1774 	mss = m->m_pkthdr.tso_segsz;
1775 
1776 	/* negative cum_len signifies to the
1777 	 * send loop that we are still in the
1778 	 * header portion of the TSO packet.
1779 	 */
1780 
1781 	/* ensure we have the ethernet, IP and TCP
1782 	   header together in the first mbuf, copy
1783 	   it to a scratch buffer if not */
1784 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1785 		m_copydata(m, 0, ip_off + sizeof (*ip),
1786 			   ss->scratch);
1787 		ip = (struct ip *)(ss->scratch + ip_off);
1788 	} else {
1789 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1790 	}
1791 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1792 			    + sizeof (*tcp))) {
1793 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1794 			   + sizeof (*tcp),  ss->scratch);
1795 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1796 	}
1797 
1798 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1799 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1800 
1801 	/* TSO implies checksum offload on this hardware */
1802 	cksum_offset = ip_off + (ip->ip_hl << 2);
1803 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1804 
1805 
1806 	/* for TSO, pseudo_hdr_offset holds mss.
1807 	 * The firmware figures out where to put
1808 	 * the checksum by parsing the header. */
1809 	pseudo_hdr_offset = htobe16(mss);
1810 
1811 	tx = &ss->tx;
1812 	req = tx->req_list;
1813 	seg = tx->seg_list;
1814 	cnt = 0;
1815 	rdma_count = 0;
1816 	/* "rdma_count" is the number of RDMAs belonging to the
1817 	 * current packet BEFORE the current send request. For
1818 	 * non-TSO packets, this is equal to "count".
1819 	 * For TSO packets, rdma_count needs to be reset
1820 	 * to 0 after a segment cut.
1821 	 *
1822 	 * The rdma_count field of the send request is
1823 	 * the number of RDMAs of the packet starting at
1824 	 * that request. For TSO send requests with one ore more cuts
1825 	 * in the middle, this is the number of RDMAs starting
1826 	 * after the last cut in the request. All previous
1827 	 * segments before the last cut implicitly have 1 RDMA.
1828 	 *
1829 	 * Since the number of RDMAs is not known beforehand,
1830 	 * it must be filled-in retroactively - after each
1831 	 * segmentation cut or at the end of the entire packet.
1832 	 */
1833 
1834 	while (busdma_seg_cnt) {
1835 		/* Break the busdma segment up into pieces*/
1836 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1837 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1838 		len = seg->ds_len;
1839 
1840 		while (len) {
1841 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1842 			seglen = len;
1843 			cum_len_next = cum_len + seglen;
1844 			(req-rdma_count)->rdma_count = rdma_count + 1;
1845 			if (__predict_true(cum_len >= 0)) {
1846 				/* payload */
1847 				chop = (cum_len_next > mss);
1848 				cum_len_next = cum_len_next % mss;
1849 				next_is_first = (cum_len_next == 0);
1850 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1851 				flags_next |= next_is_first *
1852 					MXGEFW_FLAGS_FIRST;
1853 				rdma_count |= -(chop | next_is_first);
1854 				rdma_count += chop & !next_is_first;
1855 			} else if (cum_len_next >= 0) {
1856 				/* header ends */
1857 				rdma_count = -1;
1858 				cum_len_next = 0;
1859 				seglen = -cum_len;
1860 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1861 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1862 					MXGEFW_FLAGS_FIRST |
1863 					(small * MXGEFW_FLAGS_SMALL);
1864 			    }
1865 
1866 			req->addr_high = high_swapped;
1867 			req->addr_low = htobe32(low);
1868 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1869 			req->pad = 0;
1870 			req->rdma_count = 1;
1871 			req->length = htobe16(seglen);
1872 			req->cksum_offset = cksum_offset;
1873 			req->flags = flags | ((cum_len & 1) *
1874 					      MXGEFW_FLAGS_ALIGN_ODD);
1875 			low += seglen;
1876 			len -= seglen;
1877 			cum_len = cum_len_next;
1878 			flags = flags_next;
1879 			req++;
1880 			cnt++;
1881 			rdma_count++;
1882 			if (__predict_false(cksum_offset > seglen))
1883 				cksum_offset -= seglen;
1884 			else
1885 				cksum_offset = 0;
1886 			if (__predict_false(cnt > tx->max_desc))
1887 				goto drop;
1888 		}
1889 		busdma_seg_cnt--;
1890 		seg++;
1891 	}
1892 	(req-rdma_count)->rdma_count = rdma_count;
1893 
1894 	do {
1895 		req--;
1896 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1897 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1898 
1899 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1900 	mxge_submit_req(tx, tx->req_list, cnt);
1901 #ifdef IFNET_BUF_RING
1902 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1903 		/* tell the NIC to start polling this slice */
1904 		*tx->send_go = 1;
1905 		tx->queue_active = 1;
1906 		tx->activate++;
1907 		wmb();
1908 	}
1909 #endif
1910 	return;
1911 
1912 drop:
1913 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1914 	m_freem(m);
1915 	ss->oerrors++;
1916 	if (!once) {
1917 		printf("tx->max_desc exceeded via TSO!\n");
1918 		printf("mss = %d, %ld, %d!\n", mss,
1919 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1920 		once = 1;
1921 	}
1922 	return;
1923 
1924 }
1925 
1926 #endif /* IFCAP_TSO4 */
1927 
1928 #ifdef MXGE_NEW_VLAN_API
1929 /*
1930  * We reproduce the software vlan tag insertion from
1931  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1932  * vlan tag insertion. We need to advertise this in order to have the
1933  * vlan interface respect our csum offload flags.
1934  */
/*
 * Insert the mbuf's out-of-band vlan tag into the frame as an in-band
 * 802.1Q header and clear M_VLANTAG.  Returns the (possibly
 * reallocated) mbuf, or NULL if M_PREPEND/m_pullup failed (in which
 * case the chain has been freed).
 */
static struct mbuf *
mxge_vlan_tag_insert(struct mbuf *m)
{
	struct ether_vlan_header *evl;

	/* make room for the 4-byte 802.1Q shim */
	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
	if (__predict_false(m == NULL))
		return NULL;
	if (m->m_len < sizeof(*evl)) {
		m = m_pullup(m, sizeof(*evl));
		if (__predict_false(m == NULL))
			return NULL;
	}
	/*
	 * Transform the Ethernet header into an Ethernet header
	 * with 802.1Q encapsulation.
	 */
	/* bcopy handles the overlapping move of dst/src MACs down
	   over the freshly prepended space */
	evl = mtod(m, struct ether_vlan_header *);
	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
	m->m_flags &= ~M_VLANTAG;
	return m;
}
1960 #endif /* MXGE_NEW_VLAN_API */
1961 
1962 static void
1963 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1964 {
1965 	mxge_softc_t *sc;
1966 	mcp_kreq_ether_send_t *req;
1967 	bus_dma_segment_t *seg;
1968 	struct mbuf *m_tmp;
1969 	struct ifnet *ifp;
1970 	mxge_tx_ring_t *tx;
1971 	struct ip *ip;
1972 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1973 	uint16_t pseudo_hdr_offset;
1974         uint8_t flags, cksum_offset;
1975 
1976 
1977 	sc = ss->sc;
1978 	ifp = sc->ifp;
1979 	tx = &ss->tx;
1980 
1981 	ip_off = sizeof (struct ether_header);
1982 #ifdef MXGE_NEW_VLAN_API
1983 	if (m->m_flags & M_VLANTAG) {
1984 		m = mxge_vlan_tag_insert(m);
1985 		if (__predict_false(m == NULL))
1986 			goto drop;
1987 		ip_off += ETHER_VLAN_ENCAP_LEN;
1988 	}
1989 #endif
1990 	/* (try to) map the frame for DMA */
1991 	idx = tx->req & tx->mask;
1992 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1993 				      m, tx->seg_list, &cnt,
1994 				      BUS_DMA_NOWAIT);
1995 	if (__predict_false(err == EFBIG)) {
1996 		/* Too many segments in the chain.  Try
1997 		   to defrag */
1998 		m_tmp = m_defrag(m, M_NOWAIT);
1999 		if (m_tmp == NULL) {
2000 			goto drop;
2001 		}
2002 		ss->tx.defrag++;
2003 		m = m_tmp;
2004 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2005 					      tx->info[idx].map,
2006 					      m, tx->seg_list, &cnt,
2007 					      BUS_DMA_NOWAIT);
2008 	}
2009 	if (__predict_false(err != 0)) {
2010 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2011 			      " packet len = %d\n", err, m->m_pkthdr.len);
2012 		goto drop;
2013 	}
2014 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2015 			BUS_DMASYNC_PREWRITE);
2016 	tx->info[idx].m = m;
2017 
2018 #if IFCAP_TSO4
2019 	/* TSO is different enough, we handle it in another routine */
2020 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2021 		mxge_encap_tso(ss, m, cnt, ip_off);
2022 		return;
2023 	}
2024 #endif
2025 
2026 	req = tx->req_list;
2027 	cksum_offset = 0;
2028 	pseudo_hdr_offset = 0;
2029 	flags = MXGEFW_FLAGS_NO_TSO;
2030 
2031 	/* checksum offloading? */
2032 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2033 		/* ensure ip header is in first mbuf, copy
2034 		   it to a scratch buffer if not */
2035 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2036 			m_copydata(m, 0, ip_off + sizeof (*ip),
2037 				   ss->scratch);
2038 			ip = (struct ip *)(ss->scratch + ip_off);
2039 		} else {
2040 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2041 		}
2042 		cksum_offset = ip_off + (ip->ip_hl << 2);
2043 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2044 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2045 		req->cksum_offset = cksum_offset;
2046 		flags |= MXGEFW_FLAGS_CKSUM;
2047 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2048 	} else {
2049 		odd_flag = 0;
2050 	}
2051 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2052 		flags |= MXGEFW_FLAGS_SMALL;
2053 
2054 	/* convert segments into a request list */
2055 	cum_len = 0;
2056 	seg = tx->seg_list;
2057 	req->flags = MXGEFW_FLAGS_FIRST;
2058 	for (i = 0; i < cnt; i++) {
2059 		req->addr_low =
2060 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2061 		req->addr_high =
2062 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2063 		req->length = htobe16(seg->ds_len);
2064 		req->cksum_offset = cksum_offset;
2065 		if (cksum_offset > seg->ds_len)
2066 			cksum_offset -= seg->ds_len;
2067 		else
2068 			cksum_offset = 0;
2069 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2070 		req->pad = 0; /* complete solid 16-byte block */
2071 		req->rdma_count = 1;
2072 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2073 		cum_len += seg->ds_len;
2074 		seg++;
2075 		req++;
2076 		req->flags = 0;
2077 	}
2078 	req--;
2079 	/* pad runts to 60 bytes */
2080 	if (cum_len < 60) {
2081 		req++;
2082 		req->addr_low =
2083 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2084 		req->addr_high =
2085 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2086 		req->length = htobe16(60 - cum_len);
2087 		req->cksum_offset = 0;
2088 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2089 		req->pad = 0; /* complete solid 16-byte block */
2090 		req->rdma_count = 1;
2091 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2092 		cnt++;
2093 	}
2094 
2095 	tx->req_list[0].rdma_count = cnt;
2096 #if 0
2097 	/* print what the firmware will see */
2098 	for (i = 0; i < cnt; i++) {
2099 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2100 		    "cso:%d, flags:0x%x, rdma:%d\n",
2101 		    i, (int)ntohl(tx->req_list[i].addr_high),
2102 		    (int)ntohl(tx->req_list[i].addr_low),
2103 		    (int)ntohs(tx->req_list[i].length),
2104 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2105 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2106 		    tx->req_list[i].rdma_count);
2107 	}
2108 	printf("--------------\n");
2109 #endif
2110 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2111 	mxge_submit_req(tx, tx->req_list, cnt);
2112 #ifdef IFNET_BUF_RING
2113 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2114 		/* tell the NIC to start polling this slice */
2115 		*tx->send_go = 1;
2116 		tx->queue_active = 1;
2117 		tx->activate++;
2118 		wmb();
2119 	}
2120 #endif
2121 	return;
2122 
2123 drop:
2124 	m_freem(m);
2125 	ss->oerrors++;
2126 	return;
2127 }
2128 
2129 #ifdef IFNET_BUF_RING
2130 static void
2131 mxge_qflush(struct ifnet *ifp)
2132 {
2133 	mxge_softc_t *sc = ifp->if_softc;
2134 	mxge_tx_ring_t *tx;
2135 	struct mbuf *m;
2136 	int slice;
2137 
2138 	for (slice = 0; slice < sc->num_slices; slice++) {
2139 		tx = &sc->ss[slice].tx;
2140 		mtx_lock(&tx->mtx);
2141 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2142 			m_freem(m);
2143 		mtx_unlock(&tx->mtx);
2144 	}
2145 	if_qflush(ifp);
2146 }
2147 
2148 static inline void
2149 mxge_start_locked(struct mxge_slice_state *ss)
2150 {
2151 	mxge_softc_t *sc;
2152 	struct mbuf *m;
2153 	struct ifnet *ifp;
2154 	mxge_tx_ring_t *tx;
2155 
2156 	sc = ss->sc;
2157 	ifp = sc->ifp;
2158 	tx = &ss->tx;
2159 
2160 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2161 		m = drbr_dequeue(ifp, tx->br);
2162 		if (m == NULL) {
2163 			return;
2164 		}
2165 		/* let BPF see it */
2166 		BPF_MTAP(ifp, m);
2167 
2168 		/* give it to the nic */
2169 		mxge_encap(ss, m);
2170 	}
2171 	/* ran out of transmit slots */
2172 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2173 	    && (!drbr_empty(ifp, tx->br))) {
2174 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2175 		tx->stall++;
2176 	}
2177 }
2178 
2179 static int
2180 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2181 {
2182 	mxge_softc_t *sc;
2183 	struct ifnet *ifp;
2184 	mxge_tx_ring_t *tx;
2185 	int err;
2186 
2187 	sc = ss->sc;
2188 	ifp = sc->ifp;
2189 	tx = &ss->tx;
2190 
2191 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2192 	    IFF_DRV_RUNNING) {
2193 		err = drbr_enqueue(ifp, tx->br, m);
2194 		return (err);
2195 	}
2196 
2197 	if (drbr_empty(ifp, tx->br) &&
2198 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2199 		/* let BPF see it */
2200 		BPF_MTAP(ifp, m);
2201 		/* give it to the nic */
2202 		mxge_encap(ss, m);
2203 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2204 		return (err);
2205 	}
2206 	if (!drbr_empty(ifp, tx->br))
2207 		mxge_start_locked(ss);
2208 	return (0);
2209 }
2210 
2211 static int
2212 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2213 {
2214 	mxge_softc_t *sc = ifp->if_softc;
2215 	struct mxge_slice_state *ss;
2216 	mxge_tx_ring_t *tx;
2217 	int err = 0;
2218 	int slice;
2219 
2220 	slice = m->m_pkthdr.flowid;
2221 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2222 
2223 	ss = &sc->ss[slice];
2224 	tx = &ss->tx;
2225 
2226 	if (mtx_trylock(&tx->mtx)) {
2227 		err = mxge_transmit_locked(ss, m);
2228 		mtx_unlock(&tx->mtx);
2229 	} else {
2230 		err = drbr_enqueue(ifp, tx->br, m);
2231 	}
2232 
2233 	return (err);
2234 }
2235 
2236 #else
2237 
2238 static inline void
2239 mxge_start_locked(struct mxge_slice_state *ss)
2240 {
2241 	mxge_softc_t *sc;
2242 	struct mbuf *m;
2243 	struct ifnet *ifp;
2244 	mxge_tx_ring_t *tx;
2245 
2246 	sc = ss->sc;
2247 	ifp = sc->ifp;
2248 	tx = &ss->tx;
2249 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2250 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2251 		if (m == NULL) {
2252 			return;
2253 		}
2254 		/* let BPF see it */
2255 		BPF_MTAP(ifp, m);
2256 
2257 		/* give it to the nic */
2258 		mxge_encap(ss, m);
2259 	}
2260 	/* ran out of transmit slots */
2261 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2262 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2263 		tx->stall++;
2264 	}
2265 }
2266 #endif
2267 static void
2268 mxge_start(struct ifnet *ifp)
2269 {
2270 	mxge_softc_t *sc = ifp->if_softc;
2271 	struct mxge_slice_state *ss;
2272 
2273 	/* only use the first slice for now */
2274 	ss = &sc->ss[0];
2275 	mtx_lock(&ss->tx.mtx);
2276 	mxge_start_locked(ss);
2277 	mtx_unlock(&ss->tx.mtx);
2278 }
2279 
2280 /*
2281  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2282  * at most 32 bytes at a time, so as to avoid involving the software
2283  * pio handler in the nic.   We re-write the first segment's low
2284  * DMA address to mark it valid only after we write the entire chunk
2285  * in a burst
2286  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* stash the real low address and poison it so the NIC treats
	   the chunk as invalid until it has been written completely */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* copy in two 32-byte bursts; the write barriers keep the
	   bursts ordered ahead of the validating write below */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the host-side copy, then re-write the first
	   segment's low address to mark the whole chunk valid */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2303 
/*
 * Allocate and DMA-map a replacement small-receive mbuf for ring
 * slot idx.  Returns 0 on success or an errno.  On failure the
 * slot's shadow entry is left untouched, so the NIC re-posts the
 * old buffer (the caller drops the frame instead of passing it up).
 */
static int
mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_small;
	int cnt, err;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* submit every 8th slot as a burst, even on failure, so the
	   NIC's copy of the ring keeps advancing */
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
	return err;
}
2336 
/*
 * Allocate and DMA-map a replacement big-receive cluster for ring
 * slot idx (plus, with MXGE_VIRT_JUMBOS, the following slots for
 * each additional 4KB segment).  Returns 0 on success or an errno;
 * on failure the old shadow entries remain, so the NIC re-posts the
 * old buffers.
 */
static int
mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg[3];
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_big;
	int cnt, err, i;

	if (rx->cl_size == MCLBYTES)
		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	else
		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = rx->cl_size;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));

#if MXGE_VIRT_JUMBOS
	/* a jumbo cluster may map to up to 3 discontiguous 4KB
	   segments; each one consumes its own ring slot */
	for (i = 1; i < cnt; i++) {
		rx->shadow[idx + i].addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
		rx->shadow[idx + i].addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
       }
#endif

done:
	/* walk the nbufs slots this buffer covers, bursting every
	   completed group of 8 to the NIC (even on failure) */
       for (i = 0; i < rx->nbufs; i++) {
		if ((idx & 7) == 7) {
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		}
		idx++;
	}
	return err;
}
2386 
2387 /*
2388  *  Myri10GE hardware checksums are not valid if the sender
2389  *  padded the frame with non-zero padding.  This is because
2390  *  the firmware just does a simple 16-bit 1s complement
2391  *  checksum across the entire frame, excluding the first 14
2392  *  bytes.  It is best to simply to check the checksum and
2393  *  tell the stack about it only if the checksum is good
2394  */
2395 
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
	struct ip *ip;
	uint16_t c;

	eh = mtod(m, struct ether_header *);

	/* only deal with IPv4 TCP & UDP for now */
	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
		return 1;
	ip = (struct ip *)(eh + 1);
	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
			    ip->ip_p != IPPROTO_UDP))
		return 1;

	/* fold the pseudo-header into the hardware's raw 1s
	   complement sum; callers treat a return of 0 as a valid
	   TCP/UDP checksum and any non-zero value as invalid */
	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
			    - (ip->ip_hl << 2) + ip->ip_p));
	c ^= 0xffff;
	return (c);
}
2419 
/*
 * Strip an 802.1q header from the front of the frame, record the
 * VLAN tag in the mbuf packet header, and repair the firmware's
 * partial checksum, which included the 4 encapsulation bytes being
 * removed.
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* 1s-complement subtraction: add the complement, propagate
	   the carry, then fold back down to 16 bits (twice, since the
	   first fold can itself carry) */
	(*csum) += ~partial;
	(*csum) +=  ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
#ifdef MXGE_NEW_VLAN_API
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
#else
	{
		struct m_tag *mtag;
		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
				   M_NOWAIT);
		if (mtag == NULL)
			return;
		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
		m_tag_prepend(m, mtag);
	}

#endif
	m->m_flags |= M_VLANTAG;

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf.  The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2475 
2476 
/*
 * Handle one received frame that landed in a big-receive cluster:
 * replace the buffer in the ring, swap DMA maps, strip any VLAN
 * tag, validate the hardware checksum (attempting LRO first), and
 * pass the frame up the stack.  If a replacement buffer cannot be
 * allocated the frame is dropped and the old buffer is re-posted.
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_big;
	idx = rx->cnt & rx->mask;
	/* a jumbo frame may consume nbufs ring slots */
	rx->cnt += rx->nbufs;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		/* hand off to LRO; a 0 return means LRO consumed it */
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2541 
/*
 * Handle one received frame that fit in a small (MHLEN) mbuf.
 * Mirrors mxge_rx_done_big() except that exactly one ring slot is
 * consumed; see that function for the buffer-replacement and
 * checksum logic.
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct ether_header *eh;
	struct mbuf *m;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		/* hand off to LRO; a 0 return means LRO consumed it */
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2606 
2607 static inline void
2608 mxge_clean_rx_done(struct mxge_slice_state *ss)
2609 {
2610 	mxge_rx_done_t *rx_done = &ss->rx_done;
2611 	struct lro_entry *lro;
2612 	int limit = 0;
2613 	uint16_t length;
2614 	uint16_t checksum;
2615 
2616 
2617 	while (rx_done->entry[rx_done->idx].length != 0) {
2618 		length = ntohs(rx_done->entry[rx_done->idx].length);
2619 		rx_done->entry[rx_done->idx].length = 0;
2620 		checksum = rx_done->entry[rx_done->idx].checksum;
2621 		if (length <= (MHLEN - MXGEFW_PAD))
2622 			mxge_rx_done_small(ss, length, checksum);
2623 		else
2624 			mxge_rx_done_big(ss, length, checksum);
2625 		rx_done->cnt++;
2626 		rx_done->idx = rx_done->cnt & rx_done->mask;
2627 
2628 		/* limit potential for livelock */
2629 		if (__predict_false(++limit > rx_done->mask / 2))
2630 			break;
2631 	}
2632 	while (!SLIST_EMPTY(&ss->lro_active)) {
2633 		lro = SLIST_FIRST(&ss->lro_active);
2634 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2635 		mxge_lro_flush(ss, lro);
2636 	}
2637 }
2638 
2639 
2640 static inline void
2641 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2642 {
2643 	struct ifnet *ifp;
2644 	mxge_tx_ring_t *tx;
2645 	struct mbuf *m;
2646 	bus_dmamap_t map;
2647 	int idx;
2648 	int *flags;
2649 
2650 	tx = &ss->tx;
2651 	ifp = ss->sc->ifp;
2652 	while (tx->pkt_done != mcp_idx) {
2653 		idx = tx->done & tx->mask;
2654 		tx->done++;
2655 		m = tx->info[idx].m;
2656 		/* mbuf and DMA map only attached to the first
2657 		   segment per-mbuf */
2658 		if (m != NULL) {
2659 #ifdef IFNET_BUF_RING
2660 			ss->obytes += m->m_pkthdr.len;
2661 			if (m->m_flags & M_MCAST)
2662 				ss->omcasts++;
2663 #endif
2664 			ss->opackets++;
2665 			tx->info[idx].m = NULL;
2666 			map = tx->info[idx].map;
2667 			bus_dmamap_unload(tx->dmat, map);
2668 			m_freem(m);
2669 		}
2670 		if (tx->info[idx].flag) {
2671 			tx->info[idx].flag = 0;
2672 			tx->pkt_done++;
2673 		}
2674 	}
2675 
2676 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2677            its OK to send packets */
2678 #ifdef IFNET_BUF_RING
2679 	flags = &ss->if_drv_flags;
2680 #else
2681 	flags = &ifp->if_drv_flags;
2682 #endif
2683 	mtx_lock(&ss->tx.mtx);
2684 	if ((*flags) & IFF_DRV_OACTIVE &&
2685 	    tx->req - tx->done < (tx->mask + 1)/4) {
2686 		*(flags) &= ~IFF_DRV_OACTIVE;
2687 		ss->tx.wake++;
2688 		mxge_start_locked(ss);
2689 	}
2690 #ifdef IFNET_BUF_RING
2691 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2692 		/* let the NIC stop polling this queue, since there
2693 		 * are no more transmits pending */
2694 		if (tx->req == tx->done) {
2695 			*tx->send_stop = 1;
2696 			tx->queue_active = 0;
2697 			tx->deactivate++;
2698 			wmb();
2699 		}
2700 	}
2701 #endif
2702 	mtx_unlock(&ss->tx.mtx);
2703 
2704 }
2705 
/*
 * Map bits of the XFP module's 10GbE compliance byte (read via the
 * firmware's I2C interface in mxge_media_probe()) to ifmedia types.
 * Entry 0 (0x7f) is compared as an exact byte value, not a bit; a
 * zero flag means FreeBSD has no corresponding media type.
 */
static struct mxge_media_type mxge_xfp_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
/* SFP+ compliance bits (byte 3 of the module, per mxge_media_probe())
   mapped to ifmedia types */
static struct mxge_media_type mxge_sfp_media_types[] =
{
	{0,		(1 << 7),	"Reserved"},
	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
};
2725 
2726 static void
2727 mxge_set_media(mxge_softc_t *sc, int type)
2728 {
2729 	sc->media_flags |= type;
2730 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2731 	ifmedia_set(&sc->media, sc->media_flags);
2732 }
2733 
2734 
2735 /*
2736  * Determine the media type for a NIC.  Some XFPs will identify
2737  * themselves only when their link is up, so this is initiated via a
2738  * link up interrupt.  However, this can potentially take up to
2739  * several milliseconds, so it is run via the watchdog routine, rather
2740  * than in the interrupt handler itself.   This need only be done
2741  * once, not each time the link is up.
2742  */
2743 static void
2744 mxge_media_probe(mxge_softc_t *sc)
2745 {
2746 	mxge_cmd_t cmd;
2747 	char *cage_type;
2748 	char *ptr;
2749 	struct mxge_media_type *mxge_media_types = NULL;
2750 	int i, err, ms, mxge_media_type_entries;
2751 	uint32_t byte;
2752 
2753 	sc->need_media_probe = 0;
2754 
2755 	/* if we've already set a media type, we're done */
2756 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2757 		return;
2758 
2759 	/*
2760 	 * parse the product code to deterimine the interface type
2761 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2762 	 * after the 3rd dash in the driver's cached copy of the
2763 	 * EEPROM's product code string.
2764 	 */
2765 	ptr = sc->product_code_string;
2766 	if (ptr == NULL) {
2767 		device_printf(sc->dev, "Missing product code\n");
2768 	}
2769 
2770 	for (i = 0; i < 3; i++, ptr++) {
2771 		ptr = index(ptr, '-');
2772 		if (ptr == NULL) {
2773 			device_printf(sc->dev,
2774 				      "only %d dashes in PC?!?\n", i);
2775 			return;
2776 		}
2777 	}
2778 	if (*ptr == 'C') {
2779 		/* -C is CX4 */
2780 		mxge_set_media(sc, IFM_10G_CX4);
2781 		return;
2782 	}
2783 	else if (*ptr == 'Q') {
2784 		/* -Q is Quad Ribbon Fiber */
2785 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2786 		/* FreeBSD has no media type for Quad ribbon fiber */
2787 		return;
2788 	}
2789 
2790 	if (*ptr == 'R') {
2791 		/* -R is XFP */
2792 		mxge_media_types = mxge_xfp_media_types;
2793 		mxge_media_type_entries =
2794 			sizeof (mxge_xfp_media_types) /
2795 			sizeof (mxge_xfp_media_types[0]);
2796 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2797 		cage_type = "XFP";
2798 	}
2799 
2800 	if (*ptr == 'S' || *(ptr +1) == 'S') {
2801 		/* -S or -2S is SFP+ */
2802 		mxge_media_types = mxge_sfp_media_types;
2803 		mxge_media_type_entries =
2804 			sizeof (mxge_sfp_media_types) /
2805 			sizeof (mxge_sfp_media_types[0]);
2806 		cage_type = "SFP+";
2807 		byte = 3;
2808 	}
2809 
2810 	if (mxge_media_types == NULL) {
2811 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2812 		return;
2813 	}
2814 
2815 	/*
2816 	 * At this point we know the NIC has an XFP cage, so now we
2817 	 * try to determine what is in the cage by using the
2818 	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2819 	 * register.  We read just one byte, which may take over
2820 	 * a millisecond
2821 	 */
2822 
2823 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2824 	cmd.data1 = byte;
2825 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2826 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2827 		device_printf(sc->dev, "failed to read XFP\n");
2828 	}
2829 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2830 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2831 	}
2832 	if (err != MXGEFW_CMD_OK) {
2833 		return;
2834 	}
2835 
2836 	/* now we wait for the data to be cached */
2837 	cmd.data0 = byte;
2838 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2839 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2840 		DELAY(1000);
2841 		cmd.data0 = byte;
2842 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2843 	}
2844 	if (err != MXGEFW_CMD_OK) {
2845 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2846 			      cage_type, err, ms);
2847 		return;
2848 	}
2849 
2850 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2851 		if (mxge_verbose)
2852 			device_printf(sc->dev, "%s:%s\n", cage_type,
2853 				      mxge_media_types[0].name);
2854 		mxge_set_media(sc, IFM_10G_CX4);
2855 		return;
2856 	}
2857 	for (i = 1; i < mxge_media_type_entries; i++) {
2858 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2859 			if (mxge_verbose)
2860 				device_printf(sc->dev, "%s:%s\n",
2861 					      cage_type,
2862 					      mxge_media_types[i].name);
2863 
2864 			mxge_set_media(sc, mxge_media_types[i].flag);
2865 			return;
2866 		}
2867 	}
2868 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2869 		      cmd.data0);
2870 
2871 	return;
2872 }
2873 
/*
 * Per-slice interrupt handler.  Reaps transmit and receive
 * completions, spinning until the firmware confirms a legacy irq has
 * been deasserted; then, on the first slice only, folds the firmware
 * stats block into link state, and finally returns the irq claim
 * tokens to the NIC.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	/* snapshot the valid bits; stats->valid may be cleared below */
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* media may identify itself only at link-up */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
}
2962 
/*
 * if_init entry point.  Intentionally a no-op here; presumably the
 * interface is brought up elsewhere (e.g. via the ioctl path) --
 * TODO(review): confirm against the rest of the driver.
 */
static void
mxge_init(void *arg)
{
}
2967 
2968 
2969 
2970 static void
2971 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2972 {
2973 	struct lro_entry *lro_entry;
2974 	int i;
2975 
2976 	while (!SLIST_EMPTY(&ss->lro_free)) {
2977 		lro_entry = SLIST_FIRST(&ss->lro_free);
2978 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2979 		free(lro_entry, M_DEVBUF);
2980 	}
2981 
2982 	for (i = 0; i <= ss->rx_big.mask; i++) {
2983 		if (ss->rx_big.info[i].m == NULL)
2984 			continue;
2985 		bus_dmamap_unload(ss->rx_big.dmat,
2986 				  ss->rx_big.info[i].map);
2987 		m_freem(ss->rx_big.info[i].m);
2988 		ss->rx_big.info[i].m = NULL;
2989 	}
2990 
2991 	for (i = 0; i <= ss->rx_small.mask; i++) {
2992 		if (ss->rx_small.info[i].m == NULL)
2993 			continue;
2994 		bus_dmamap_unload(ss->rx_small.dmat,
2995 				  ss->rx_small.info[i].map);
2996 		m_freem(ss->rx_small.info[i].m);
2997 		ss->rx_small.info[i].m = NULL;
2998 	}
2999 
3000 	/* transmit ring used only on the first slice */
3001 	if (ss->tx.info == NULL)
3002 		return;
3003 
3004 	for (i = 0; i <= ss->tx.mask; i++) {
3005 		ss->tx.info[i].flag = 0;
3006 		if (ss->tx.info[i].m == NULL)
3007 			continue;
3008 		bus_dmamap_unload(ss->tx.dmat,
3009 				  ss->tx.info[i].map);
3010 		m_freem(ss->tx.info[i].m);
3011 		ss->tx.info[i].m = NULL;
3012 	}
3013 }
3014 
3015 static void
3016 mxge_free_mbufs(mxge_softc_t *sc)
3017 {
3018 	int slice;
3019 
3020 	for (slice = 0; slice < sc->num_slices; slice++)
3021 		mxge_free_slice_mbufs(&sc->ss[slice]);
3022 }
3023 
/*
 * Tear down all per-slice ring resources: the receive-completion DMA
 * block, the tx request/segment scratch buffers, the rx shadow
 * rings, and the info arrays together with their per-entry DMA maps
 * and tags.  Safe to call on a partially-constructed slice: every
 * free is NULL-guarded and each pointer is cleared after release.
 */
static void
mxge_free_slice_rings(struct mxge_slice_state *ss)
{
	int i;


	if (ss->rx_done.entry != NULL)
		mxge_dma_free(&ss->rx_done.dma);
	ss->rx_done.entry = NULL;

	if (ss->tx.req_bytes != NULL)
		free(ss->tx.req_bytes, M_DEVBUF);
	ss->tx.req_bytes = NULL;

	if (ss->tx.seg_list != NULL)
		free(ss->tx.seg_list, M_DEVBUF);
	ss->tx.seg_list = NULL;

	if (ss->rx_small.shadow != NULL)
		free(ss->rx_small.shadow, M_DEVBUF);
	ss->rx_small.shadow = NULL;

	if (ss->rx_big.shadow != NULL)
		free(ss->rx_big.shadow, M_DEVBUF);
	ss->rx_big.shadow = NULL;

	/* destroy per-entry maps before their tag, then the tag itself */
	if (ss->tx.info != NULL) {
		if (ss->tx.dmat != NULL) {
			for (i = 0; i <= ss->tx.mask; i++) {
				bus_dmamap_destroy(ss->tx.dmat,
						   ss->tx.info[i].map);
			}
			bus_dma_tag_destroy(ss->tx.dmat);
		}
		free(ss->tx.info, M_DEVBUF);
	}
	ss->tx.info = NULL;

	if (ss->rx_small.info != NULL) {
		if (ss->rx_small.dmat != NULL) {
			for (i = 0; i <= ss->rx_small.mask; i++) {
				bus_dmamap_destroy(ss->rx_small.dmat,
						   ss->rx_small.info[i].map);
			}
			/* the spare map used for buffer replacement */
			bus_dmamap_destroy(ss->rx_small.dmat,
					   ss->rx_small.extra_map);
			bus_dma_tag_destroy(ss->rx_small.dmat);
		}
		free(ss->rx_small.info, M_DEVBUF);
	}
	ss->rx_small.info = NULL;

	if (ss->rx_big.info != NULL) {
		if (ss->rx_big.dmat != NULL) {
			for (i = 0; i <= ss->rx_big.mask; i++) {
				bus_dmamap_destroy(ss->rx_big.dmat,
						   ss->rx_big.info[i].map);
			}
			/* the spare map used for buffer replacement */
			bus_dmamap_destroy(ss->rx_big.dmat,
					   ss->rx_big.extra_map);
			bus_dma_tag_destroy(ss->rx_big.dmat);
		}
		free(ss->rx_big.info, M_DEVBUF);
	}
	ss->rx_big.info = NULL;
}
3090 
3091 static void
3092 mxge_free_rings(mxge_softc_t *sc)
3093 {
3094 	int slice;
3095 
3096 	for (slice = 0; slice < sc->num_slices; slice++)
3097 		mxge_free_slice_rings(&sc->ss[slice]);
3098 }
3099 
3100 static int
3101 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3102 		       int tx_ring_entries)
3103 {
3104 	mxge_softc_t *sc = ss->sc;
3105 	size_t bytes;
3106 	int err, i;
3107 
3108 	err = ENOMEM;
3109 
3110 	/* allocate per-slice receive resources */
3111 
3112 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3113 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3114 
3115 	/* allocate the rx shadow rings */
3116 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3117 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3118 	if (ss->rx_small.shadow == NULL)
3119 		return err;;
3120 
3121 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3122 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3123 	if (ss->rx_big.shadow == NULL)
3124 		return err;;
3125 
3126 	/* allocate the rx host info rings */
3127 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3128 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3129 	if (ss->rx_small.info == NULL)
3130 		return err;;
3131 
3132 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3133 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3134 	if (ss->rx_big.info == NULL)
3135 		return err;;
3136 
3137 	/* allocate the rx busdma resources */
3138 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3139 				 1,			/* alignment */
3140 				 4096,			/* boundary */
3141 				 BUS_SPACE_MAXADDR,	/* low */
3142 				 BUS_SPACE_MAXADDR,	/* high */
3143 				 NULL, NULL,		/* filter */
3144 				 MHLEN,			/* maxsize */
3145 				 1,			/* num segs */
3146 				 MHLEN,			/* maxsegsize */
3147 				 BUS_DMA_ALLOCNOW,	/* flags */
3148 				 NULL, NULL,		/* lock */
3149 				 &ss->rx_small.dmat);	/* tag */
3150 	if (err != 0) {
3151 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3152 			      err);
3153 		return err;;
3154 	}
3155 
3156 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3157 				 1,			/* alignment */
3158 #if MXGE_VIRT_JUMBOS
3159 				 4096,			/* boundary */
3160 #else
3161 				 0,			/* boundary */
3162 #endif
3163 				 BUS_SPACE_MAXADDR,	/* low */
3164 				 BUS_SPACE_MAXADDR,	/* high */
3165 				 NULL, NULL,		/* filter */
3166 				 3*4096,		/* maxsize */
3167 #if MXGE_VIRT_JUMBOS
3168 				 3,			/* num segs */
3169 				 4096,			/* maxsegsize*/
3170 #else
3171 				 1,			/* num segs */
3172 				 MJUM9BYTES,		/* maxsegsize*/
3173 #endif
3174 				 BUS_DMA_ALLOCNOW,	/* flags */
3175 				 NULL, NULL,		/* lock */
3176 				 &ss->rx_big.dmat);	/* tag */
3177 	if (err != 0) {
3178 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3179 			      err);
3180 		return err;;
3181 	}
3182 	for (i = 0; i <= ss->rx_small.mask; i++) {
3183 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3184 					&ss->rx_small.info[i].map);
3185 		if (err != 0) {
3186 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3187 				      err);
3188 			return err;;
3189 		}
3190 	}
3191 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3192 				&ss->rx_small.extra_map);
3193 	if (err != 0) {
3194 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3195 			      err);
3196 		return err;;
3197 	}
3198 
3199 	for (i = 0; i <= ss->rx_big.mask; i++) {
3200 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3201 					&ss->rx_big.info[i].map);
3202 		if (err != 0) {
3203 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3204 				      err);
3205 			return err;;
3206 		}
3207 	}
3208 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3209 				&ss->rx_big.extra_map);
3210 	if (err != 0) {
3211 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3212 			      err);
3213 		return err;;
3214 	}
3215 
3216 	/* now allocate TX resouces */
3217 
3218 #ifndef IFNET_BUF_RING
3219 	/* only use a single TX ring for now */
3220 	if (ss != ss->sc->ss)
3221 		return 0;
3222 #endif
3223 
3224 	ss->tx.mask = tx_ring_entries - 1;
3225 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3226 
3227 
3228 	/* allocate the tx request copy block */
3229 	bytes = 8 +
3230 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3231 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3232 	if (ss->tx.req_bytes == NULL)
3233 		return err;;
3234 	/* ensure req_list entries are aligned to 8 bytes */
3235 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3236 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3237 
3238 	/* allocate the tx busdma segment list */
3239 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3240 	ss->tx.seg_list = (bus_dma_segment_t *)
3241 		malloc(bytes, M_DEVBUF, M_WAITOK);
3242 	if (ss->tx.seg_list == NULL)
3243 		return err;;
3244 
3245 	/* allocate the tx host info ring */
3246 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3247 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3248 	if (ss->tx.info == NULL)
3249 		return err;;
3250 
3251 	/* allocate the tx busdma resources */
3252 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3253 				 1,			/* alignment */
3254 				 sc->tx_boundary,	/* boundary */
3255 				 BUS_SPACE_MAXADDR,	/* low */
3256 				 BUS_SPACE_MAXADDR,	/* high */
3257 				 NULL, NULL,		/* filter */
3258 				 65536 + 256,		/* maxsize */
3259 				 ss->tx.max_desc - 2,	/* num segs */
3260 				 sc->tx_boundary,	/* maxsegsz */
3261 				 BUS_DMA_ALLOCNOW,	/* flags */
3262 				 NULL, NULL,		/* lock */
3263 				 &ss->tx.dmat);		/* tag */
3264 
3265 	if (err != 0) {
3266 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3267 			      err);
3268 		return err;;
3269 	}
3270 
3271 	/* now use these tags to setup dmamaps for each slot
3272 	   in the ring */
3273 	for (i = 0; i <= ss->tx.mask; i++) {
3274 		err = bus_dmamap_create(ss->tx.dmat, 0,
3275 					&ss->tx.info[i].map);
3276 		if (err != 0) {
3277 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3278 				      err);
3279 			return err;;
3280 		}
3281 	}
3282 	return 0;
3283 
3284 }
3285 
3286 static int
3287 mxge_alloc_rings(mxge_softc_t *sc)
3288 {
3289 	mxge_cmd_t cmd;
3290 	int tx_ring_size;
3291 	int tx_ring_entries, rx_ring_entries;
3292 	int err, slice;
3293 
3294 	/* get ring sizes */
3295 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3296 	tx_ring_size = cmd.data0;
3297 	if (err != 0) {
3298 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3299 		goto abort;
3300 	}
3301 
3302 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3303 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3304 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3305 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3306 	IFQ_SET_READY(&sc->ifp->if_snd);
3307 
3308 	for (slice = 0; slice < sc->num_slices; slice++) {
3309 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3310 					     rx_ring_entries,
3311 					     tx_ring_entries);
3312 		if (err != 0)
3313 			goto abort;
3314 	}
3315 	return 0;
3316 
3317 abort:
3318 	mxge_free_rings(sc);
3319 	return err;
3320 
3321 }
3322 
3323 
3324 static void
3325 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3326 {
3327 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3328 
3329 	if (bufsize < MCLBYTES) {
3330 		/* easy, everything fits in a single buffer */
3331 		*big_buf_size = MCLBYTES;
3332 		*cl_size = MCLBYTES;
3333 		*nbufs = 1;
3334 		return;
3335 	}
3336 
3337 	if (bufsize < MJUMPAGESIZE) {
3338 		/* still easy, everything still fits in a single buffer */
3339 		*big_buf_size = MJUMPAGESIZE;
3340 		*cl_size = MJUMPAGESIZE;
3341 		*nbufs = 1;
3342 		return;
3343 	}
3344 #if MXGE_VIRT_JUMBOS
3345 	/* now we need to use virtually contiguous buffers */
3346 	*cl_size = MJUM9BYTES;
3347 	*big_buf_size = 4096;
3348 	*nbufs = mtu / 4096 + 1;
3349 	/* needs to be a power of two, so round up */
3350 	if (*nbufs == 3)
3351 		*nbufs = 4;
3352 #else
3353 	*cl_size = MJUM9BYTES;
3354 	*big_buf_size = MJUM9BYTES;
3355 	*nbufs = 1;
3356 #endif
3357 }
3358 
/*
 * Bring one slice online: seed its LRO free list, fetch the lanai
 * (NIC SRAM) pointers for its send/receive rings from the firmware,
 * and stock both receive rings with buffers.
 *
 * Returns 0 on success, EIO if the firmware queries fail, or ENOMEM
 * if receive buffers cannot be allocated.
 */
static int
mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
{
	mxge_softc_t *sc;
	mxge_cmd_t cmd;
	bus_dmamap_t map;
	struct lro_entry *lro_entry;
	int err, i, slice;


	sc = ss->sc;
	slice = ss - sc->ss;

	SLIST_INIT(&ss->lro_free);
	SLIST_INIT(&ss->lro_active);

	/* pre-allocate the LRO tracking entries; on allocation failure
	   just shrink lro_cnt to what we actually got */
	for (i = 0; i < sc->lro_cnt; i++) {
		lro_entry = (struct lro_entry *)
			malloc(sizeof (*lro_entry), M_DEVBUF,
			       M_NOWAIT | M_ZERO);
		if (lro_entry == NULL) {
			sc->lro_cnt = i;
			break;
		}
		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
	}
	/* get the lanai pointers to the send and receive rings */

	err = 0;
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	if (slice == 0) {
#endif
		cmd.data0 = slice;
		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
		ss->tx.lanai =
			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
		ss->tx.send_go = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING
	}
#endif
	/* errors are ORed together and checked once after all queries */
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc,
			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	ss->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		map = ss->rx_small.info[i].map;
		err = mxge_get_buf_small(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, ss->rx_small.mask + 1);
			return ENOMEM;
		}
	}
	/* poison the big-ring shadow addresses before filling it */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	/* one cluster spans nbufs ring slots, so step by nbufs */
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}
3446 
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table (multi-slice only), tell the firmware the buffer sizes and
 * stats DMA locations, open every slice, and finally start the
 * firmware.  On failure all mbufs are released and an errno returned.
 *
 * Called with the driver mutex held (see mxge_ioctl/mxge_change_mtu).
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, big_bytes, nbufs, slice, cl_size, i;
	bus_addr_t bus;
	volatile uint8_t *itable;
	struct mxge_slice_state *ss;

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	if (sc->num_slices > 1) {
		/* setup the indirection table */
		cmd.data0 = sc->num_slices;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
				    &cmd);

		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
				     &cmd);
		if (err != 0) {
			device_printf(sc->dev,
				      "failed to setup rss tables\n");
			return err;
		}

		/* just enable an identity mapping */
		itable = sc->sram + cmd.data0;
		for (i = 0; i < sc->num_slices; i++)
			itable[i] = (uint8_t)i;

		/* enable RSS hashing with the configured hash type */
		cmd.data0 = 1;
		cmd.data1 = mxge_rss_hash_type;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "failed to enable slices\n");
			return err;
		}
	}


	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);

	cmd.data0 = nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set alway-use-n to %d\n",
			      nbufs);
		return EIO;
	}
	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	for (slice = 0;
#ifdef IFNET_BUF_RING
	     slice < sc->num_slices;
#else
	     slice < 1;
#endif
	     slice++) {
		ss = &sc->ss[slice];
		cmd.data0 =
			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data1 =
			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data2 = sizeof(struct mcp_irq_data);
		/* the slice number rides in the upper half of data2 */
		cmd.data2 |= (slice << 16);
		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
	}

	if (err != 0) {
		/* old firmware: fall back to the obsolete stats DMA
		   command, which only carries the send_done_count */
		bus = sc->ss->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
		if (err != 0) {
			device_printf(sc->dev, "couldn't open slice %d\n",
				      slice);
			goto abort;
		}
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
#ifdef IFNET_BUF_RING
	/* mark each slice as running so the per-slice tx paths start */
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags |= IFF_DRV_RUNNING;
		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
	}
#endif
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
	/* start the periodic watchdog/stats callout */
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
3594 
/*
 * Bring the interface down: stop the callout, clear the running
 * flags, and ask the firmware to bring the link down.  The firmware
 * acknowledges via a "down" interrupt that increments sc->down_cnt;
 * we wait briefly for that before releasing the receive mbufs.
 *
 * Always returns 0.  Called with the driver mutex held.
 */
static int
mxge_close(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;
#ifdef IFNET_BUF_RING
	struct mxge_slice_state *ss;
	int slice;
#endif

	callout_stop(&sc->co_hdl);
#ifdef IFNET_BUF_RING
	/* stop the per-slice transmit paths first */
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
	}
#endif
	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	/* snapshot down_cnt so we can tell if the down irq arrived */
	old_down_cnt = sc->down_cnt;
	wmb();
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring down link\n");
	}
	if (old_down_cnt == sc->down_cnt) {
		/* wait for down irq */
		DELAY(10 * sc->intr_coal_delay);
	}
	wmb();
	if (old_down_cnt == sc->down_cnt) {
		device_printf(sc->dev, "never got down irq\n");
	}

	mxge_free_mbufs(sc);

	return 0;
}
3632 
/*
 * Program the PCI configuration space bits the driver depends on:
 * record the negotiated PCIe link width, raise the PCIe max read
 * request size to 4KB, and enable bus mastering and memory space
 * decoding.  Also re-run after a NIC reboot (see mxge_watchdog_reset)
 * since a reboot clears config space.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* reg + 0x12 is the PCIe Link Status register; bits
		   4-9 hold the negotiated link width */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		/* reg + 0x8 is Device Control; encode 5 (4096 bytes)
		   into the max-read-request field (bits 12-14) */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
3656 
3657 static uint32_t
3658 mxge_read_reboot(mxge_softc_t *sc)
3659 {
3660 	device_t dev = sc->dev;
3661 	uint32_t vs;
3662 
3663 	/* find the vendor specific offset */
3664 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3665 		device_printf(sc->dev,
3666 			      "could not find vendor specific offset\n");
3667 		return (uint32_t)-1;
3668 	}
3669 	/* enable read32 mode */
3670 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3671 	/* tell NIC which register to read */
3672 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3673 	return (pci_read_config(dev, vs + 0x14, 4));
3674 }
3675 
/*
 * Attempt to recover from a transmit watchdog timeout on the given
 * slice.  If the NIC rebooted (detectable because a reboot wipes PCI
 * config space, clearing the busmaster bit), restore config space and
 * reopen the interface.  Otherwise just dump the slice's tx ring
 * state for debugging and leave the NIC alone.
 *
 * Returns 0 if the interface was successfully reopened, an errno
 * otherwise (ENXIO when no reset was attempted or the NIC vanished).
 */
static int
mxge_watchdog_reset(mxge_softc_t *sc, int slice)
{
	struct pci_devinfo *dinfo;
	mxge_tx_ring_t *tx;
	int err;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			return (err);
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reload firmware state by closing and reopening */
		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
			mxge_close(sc);
			err = mxge_open(sc);
		}
	} else {
		/* busmaster still set: the NIC did not reboot, so the
		   tx stall has some other cause; log state only */
		tx = &sc->ss[slice].tx;
		device_printf(sc->dev,
			      "NIC did not reboot, slice %d ring state:\n",
			      slice);
		device_printf(sc->dev,
			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
			      tx->req, tx->done, tx->queue_active);
		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
			      tx->activate, tx->deactivate);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      tx->pkt_done,
			      be32toh(sc->ss->fw_stats->send_done_count));
		device_printf(sc->dev, "not resetting\n");
	}
	return (err);
}
3743 
/*
 * Periodic transmit watchdog, run from mxge_tick().  For each slice
 * with a tx ring, detect a stalled transmitter: requests outstanding,
 * yet no completions since the previous watchdog pass.  A stall while
 * receiving pause frames is reported but not reset (the link partner
 * is throttling us); otherwise mxge_watchdog_reset() is invoked.
 *
 * Returns 0, or the errno from a failed watchdog reset.
 */
static int
mxge_watchdog(mxge_softc_t *sc)
{
	mxge_tx_ring_t *tx;
	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
	int i, err = 0;

	/* see if we have outstanding transmits, which
	   have been pending for more than mxge_ticks */
	for (i = 0;
#ifdef IFNET_BUF_RING
	     (i < sc->num_slices) && (err == 0);
#else
	     (i < 1) && (err == 0);
#endif
	     i++) {
		tx = &sc->ss[i].tx;
		/* stalled: ring non-empty, no progress since last pass,
		   and the previous pass also saw outstanding requests */
		if (tx->req != tx->done &&
		    tx->watchdog_req != tx->watchdog_done &&
		    tx->done == tx->watchdog_done) {
			/* check for pause blocking before resetting */
			if (tx->watchdog_rx_pause == rx_pause)
				err = mxge_watchdog_reset(sc, i);
			else
				device_printf(sc->dev, "Flow control blocking "
					      "xmits, check link partner\n");
		}

		/* record this pass's state for the next comparison */
		tx->watchdog_req = tx->req;
		tx->watchdog_done = tx->done;
		tx->watchdog_rx_pause = rx_pause;
	}

	if (sc->need_media_probe)
		mxge_media_probe(sc);
	return (err);
}
3781 
3782 static void
3783 mxge_update_stats(mxge_softc_t *sc)
3784 {
3785 	struct mxge_slice_state *ss;
3786 	u_long ipackets = 0;
3787 	u_long opackets = 0;
3788 #ifdef IFNET_BUF_RING
3789 	u_long obytes = 0;
3790 	u_long omcasts = 0;
3791 	u_long odrops = 0;
3792 #endif
3793 	u_long oerrors = 0;
3794 	int slice;
3795 
3796 	for (slice = 0; slice < sc->num_slices; slice++) {
3797 		ss = &sc->ss[slice];
3798 		ipackets += ss->ipackets;
3799 		opackets += ss->opackets;
3800 #ifdef IFNET_BUF_RING
3801 		obytes += ss->obytes;
3802 		omcasts += ss->omcasts;
3803 		odrops += ss->tx.br->br_drops;
3804 #endif
3805 		oerrors += ss->oerrors;
3806 	}
3807 	sc->ifp->if_ipackets = ipackets;
3808 	sc->ifp->if_opackets = opackets;
3809 #ifdef IFNET_BUF_RING
3810 	sc->ifp->if_obytes = obytes;
3811 	sc->ifp->if_omcasts = omcasts;
3812 	sc->ifp->if_snd.ifq_drops = odrops;
3813 #endif
3814 	sc->ifp->if_oerrors = oerrors;
3815 }
3816 
3817 static void
3818 mxge_tick(void *arg)
3819 {
3820 	mxge_softc_t *sc = arg;
3821 	int err = 0;
3822 
3823 	/* aggregate stats from different slices */
3824 	mxge_update_stats(sc);
3825 	if (!sc->watchdog_countdown) {
3826 		err = mxge_watchdog(sc);
3827 		sc->watchdog_countdown = 4;
3828 	}
3829 	sc->watchdog_countdown--;
3830 	if (err == 0)
3831 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3832 
3833 }
3834 
3835 static int
3836 mxge_media_change(struct ifnet *ifp)
3837 {
3838 	return EINVAL;
3839 }
3840 
3841 static int
3842 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3843 {
3844 	struct ifnet *ifp = sc->ifp;
3845 	int real_mtu, old_mtu;
3846 	int err = 0;
3847 
3848 
3849 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3850 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3851 		return EINVAL;
3852 	mtx_lock(&sc->driver_mtx);
3853 	old_mtu = ifp->if_mtu;
3854 	ifp->if_mtu = mtu;
3855 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3856 		mxge_close(sc);
3857 		err = mxge_open(sc);
3858 		if (err != 0) {
3859 			ifp->if_mtu = old_mtu;
3860 			mxge_close(sc);
3861 			(void) mxge_open(sc);
3862 		}
3863 	}
3864 	mtx_unlock(&sc->driver_mtx);
3865 	return err;
3866 }
3867 
3868 static void
3869 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3870 {
3871 	mxge_softc_t *sc = ifp->if_softc;
3872 
3873 
3874 	if (sc == NULL)
3875 		return;
3876 	ifmr->ifm_status = IFM_AVALID;
3877 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3878 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3879 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3880 }
3881 
/*
 * ifnet ioctl handler.  Address and MTU requests are delegated to
 * ether_ioctl()/mxge_change_mtu(); flag, multicast and capability
 * changes are applied under the driver mutex.
 */
static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				err = mxge_open(sc);
			} else {
				/* take care of promisc and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				mxge_close(sc);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		/* NOTE(review): the else-if means a single request that
		   toggles both TXCSUM and RXCSUM only applies TXCSUM;
		   presumably callers toggle one capability at a time */
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				/* disabling tx csum also disables TSO,
				   which depends on it */
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		} else if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask & IFCAP_TSO4) {
			if (IFCAP_TSO4 & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TSO4;
				ifp->if_hwassist &= ~CSUM_TSO;
			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable |= IFCAP_TSO4;
				ifp->if_hwassist |= CSUM_TSO;
			} else {
				/* TSO requires tx checksum offload */
				printf("mxge requires tx checksum offload"
				       " be enabled to use TSO\n");
				err = EINVAL;
			}
		}
		if (mask & IFCAP_LRO) {
			if (IFCAP_LRO & ifp->if_capenable)
				err = mxge_change_lro_locked(sc, 0);
			else
				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
		}
		if (mask & IFCAP_VLAN_HWTAGGING)
			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
		mtx_unlock(&sc->driver_mtx);
		VLAN_CAPABILITIES(ifp);

		break;

	case SIOCGIFMEDIA:
		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
				    &sc->media, command);
                break;

	default:
		err = ENOTTY;
        }
	return err;
}
3984 
/*
 * Fetch the driver's loader tunables into their globals (and the
 * per-device lro_cnt), then sanity-clamp the values: intr_coal_delay
 * to a usable range, ticks to a nonzero period, and the RSS hash type
 * to one the firmware accepts.
 */
static void
mxge_fetch_tunables(mxge_softc_t *sc)
{

	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
			  &mxge_flow_control);
	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
			  &mxge_intr_coal_delay);
	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
			  &mxge_nvidia_ecrc_enable);
	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
			  &mxge_force_firmware);
	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
			  &mxge_deassert_wait);
	TUNABLE_INT_FETCH("hw.mxge.verbose",
			  &mxge_verbose);
	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
	/* a per-device lro_cnt setting also becomes the global default */
	if (sc->lro_cnt != 0)
		mxge_lro_cnt = sc->lro_cnt;

	if (bootverbose)
		mxge_verbose = 1;
	/* clamp interrupt coalescing delay to [0, 10ms] */
	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
		mxge_intr_coal_delay = 30;
	/* default tick period: half a second */
	if (mxge_ticks == 0)
		mxge_ticks = hz / 2;
	sc->pause = mxge_flow_control;
	/* fall back to the src-port hash if the type is out of range */
	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_SRC_PORT) {
		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
	}
}
4021 
4022 
/*
 * Free all per-slice state allocated by mxge_alloc_slices(): the
 * firmware stats DMA block (and its tx mutex / buf_ring), the rx
 * completion queue DMA, and finally the slice array itself.  Safe to
 * call when allocation only partially completed (used as the abort
 * path of mxge_alloc_slices).
 */
static void
mxge_free_slices(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int i;


	if (sc->ss == NULL)
		return;

	for (i = 0; i < sc->num_slices; i++) {
		ss = &sc->ss[i];
		/* fw_stats doubles as the "tx state was set up" marker */
		if (ss->fw_stats != NULL) {
			mxge_dma_free(&ss->fw_stats_dma);
			ss->fw_stats = NULL;
#ifdef IFNET_BUF_RING
			if (ss->tx.br != NULL) {
				drbr_free(ss->tx.br, M_DEVBUF);
				ss->tx.br = NULL;
			}
#endif
			mtx_destroy(&ss->tx.mtx);
		}
		if (ss->rx_done.entry != NULL) {
			mxge_dma_free(&ss->rx_done.dma);
			ss->rx_done.entry = NULL;
		}
	}
	free(sc->ss, M_DEVBUF);
	sc->ss = NULL;
}
4054 
/*
 * Allocate the slice array and per-slice DMA state: an rx completion
 * queue for every slice, plus firmware stats, a tx mutex and (with
 * IFNET_BUF_RING) a buf_ring for each transmitting slice.  On failure
 * everything is unwound via mxge_free_slices().
 *
 * Also caches the firmware's rx ring size in sc->rx_ring_size for
 * later use by mxge_alloc_rings().
 */
static int
mxge_alloc_slices(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct mxge_slice_state *ss;
	size_t bytes;
	int err, i, max_intr_slots;

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		return err;
	}
	sc->rx_ring_size = cmd.data0;
	/* the completion queue needs a slot per rx entry on both rings */
	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));

	bytes = sizeof (*sc->ss) * sc->num_slices;
	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (sc->ss == NULL)
		return (ENOMEM);
	for (i = 0; i < sc->num_slices; i++) {
		ss = &sc->ss[i];

		ss->sc = sc;

		/* allocate per-slice rx interrupt queues */

		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
		if (err != 0)
			goto abort;
		ss->rx_done.entry = ss->rx_done.dma.addr;
		bzero(ss->rx_done.entry, bytes);

		/*
		 * allocate the per-slice firmware stats; stats
		 * (including tx) are used only on the first
		 * slice for now
		 */
#ifndef IFNET_BUF_RING
		if (i > 0)
			continue;
#endif

		bytes = sizeof (*ss->fw_stats);
		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
				     sizeof (*ss->fw_stats), 64);
		if (err != 0)
			goto abort;
		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
#ifdef IFNET_BUF_RING
		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
					   &ss->tx.mtx);
#endif
	}

	return (0);

abort:
	mxge_free_slices(sc);
	return (ENOMEM);
}
4120 
/*
 * Determine how many slices (RSS queues) to use.  Defaults to one;
 * multiple slices require the tunable to allow it, an SMP system,
 * at least two MSI-X vectors, and the slice-aware (RSS) firmware.
 * The final count is capped by MSI-X vectors, CPUs (or the tunable),
 * and rounded down to a power of two.
 *
 * On any firmware negotiation failure the original firmware image is
 * reloaded and the NIC stays single-slice.
 */
static void
mxge_slice_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *old_fw;
	int msix_cnt, status, max_intr_slots;

	sc->num_slices = 1;
	/*
	 *  don't enable multiple slices if they are not enabled,
	 *  or if this is not an SMP system
	 */

	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
		return;

	/* see how many MSI-X interrupts are available */
	msix_cnt = pci_msix_count(sc->dev);
	if (msix_cnt < 2)
		return;

	/* now load the slice aware firmware see what it supports */
	old_fw = sc->fw_name;
	if (old_fw == mxge_fw_aligned)
		sc->fw_name = mxge_fw_rss_aligned;
	else
		sc->fw_name = mxge_fw_rss_unaligned;
	status = mxge_load_firmware(sc, 0);
	if (status != 0) {
		device_printf(sc->dev, "Falling back to a single slice\n");
		return;
	}

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		goto abort_with_fw;
	}

	/* get rx ring size */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		goto abort_with_fw;
	}
	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));

	/* tell it the size of the interrupt queues */
	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
		goto abort_with_fw;
	}

	/* ask the maximum number of slices it supports */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
	if (status != 0) {
		device_printf(sc->dev,
			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
		goto abort_with_fw;
	}
	sc->num_slices = cmd.data0;
	if (sc->num_slices > msix_cnt)
		sc->num_slices = msix_cnt;

	if (mxge_max_slices == -1) {
		/* cap to number of CPUs in system */
		if (sc->num_slices > mp_ncpus)
			sc->num_slices = mp_ncpus;
	} else {
		if (sc->num_slices > mxge_max_slices)
			sc->num_slices = mxge_max_slices;
	}
	/* make sure it is a power of two */
	while (sc->num_slices & (sc->num_slices - 1))
		sc->num_slices--;

	if (mxge_verbose)
		device_printf(sc->dev, "using %d slices\n",
			      sc->num_slices);

	return;

abort_with_fw:
	/* restore the original firmware and stay single-slice */
	sc->fw_name = old_fw;
	(void) mxge_load_firmware(sc, 0);
}
4212 
4213 static int
4214 mxge_add_msix_irqs(mxge_softc_t *sc)
4215 {
4216 	size_t bytes;
4217 	int count, err, i, rid;
4218 
4219 	rid = PCIR_BAR(2);
4220 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4221 						    &rid, RF_ACTIVE);
4222 
4223 	if (sc->msix_table_res == NULL) {
4224 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4225 		return ENXIO;
4226 	}
4227 
4228 	count = sc->num_slices;
4229 	err = pci_alloc_msix(sc->dev, &count);
4230 	if (err != 0) {
4231 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4232 			      "err = %d \n", sc->num_slices, err);
4233 		goto abort_with_msix_table;
4234 	}
4235 	if (count < sc->num_slices) {
4236 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4237 			      count, sc->num_slices);
4238 		device_printf(sc->dev,
4239 			      "Try setting hw.mxge.max_slices to %d\n",
4240 			      count);
4241 		err = ENOSPC;
4242 		goto abort_with_msix;
4243 	}
4244 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4245 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4246 	if (sc->msix_irq_res == NULL) {
4247 		err = ENOMEM;
4248 		goto abort_with_msix;
4249 	}
4250 
4251 	for (i = 0; i < sc->num_slices; i++) {
4252 		rid = i + 1;
4253 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4254 							  SYS_RES_IRQ,
4255 							  &rid, RF_ACTIVE);
4256 		if (sc->msix_irq_res[i] == NULL) {
4257 			device_printf(sc->dev, "couldn't allocate IRQ res"
4258 				      " for message %d\n", i);
4259 			err = ENXIO;
4260 			goto abort_with_res;
4261 		}
4262 	}
4263 
4264 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4265 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4266 
4267 	for (i = 0; i < sc->num_slices; i++) {
4268 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4269 				     INTR_TYPE_NET | INTR_MPSAFE,
4270 #if __FreeBSD_version > 700030
4271 				     NULL,
4272 #endif
4273 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4274 		if (err != 0) {
4275 			device_printf(sc->dev, "couldn't setup intr for "
4276 				      "message %d\n", i);
4277 			goto abort_with_intr;
4278 		}
4279 	}
4280 
4281 	if (mxge_verbose) {
4282 		device_printf(sc->dev, "using %d msix IRQs:",
4283 			      sc->num_slices);
4284 		for (i = 0; i < sc->num_slices; i++)
4285 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4286 		printf("\n");
4287 	}
4288 	return (0);
4289 
4290 abort_with_intr:
4291 	for (i = 0; i < sc->num_slices; i++) {
4292 		if (sc->msix_ih[i] != NULL) {
4293 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4294 					  sc->msix_ih[i]);
4295 			sc->msix_ih[i] = NULL;
4296 		}
4297 	}
4298 	free(sc->msix_ih, M_DEVBUF);
4299 
4300 
4301 abort_with_res:
4302 	for (i = 0; i < sc->num_slices; i++) {
4303 		rid = i + 1;
4304 		if (sc->msix_irq_res[i] != NULL)
4305 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4306 					     sc->msix_irq_res[i]);
4307 		sc->msix_irq_res[i] = NULL;
4308 	}
4309 	free(sc->msix_irq_res, M_DEVBUF);
4310 
4311 
4312 abort_with_msix:
4313 	pci_release_msi(sc->dev);
4314 
4315 abort_with_msix_table:
4316 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4317 			     sc->msix_table_res);
4318 
4319 	return err;
4320 }
4321 
4322 static int
4323 mxge_add_single_irq(mxge_softc_t *sc)
4324 {
4325 	int count, err, rid;
4326 
4327 	count = pci_msi_count(sc->dev);
4328 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4329 		rid = 1;
4330 	} else {
4331 		rid = 0;
4332 		sc->legacy_irq = 1;
4333 	}
4334 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4335 					 1, RF_SHAREABLE | RF_ACTIVE);
4336 	if (sc->irq_res == NULL) {
4337 		device_printf(sc->dev, "could not alloc interrupt\n");
4338 		return ENXIO;
4339 	}
4340 	if (mxge_verbose)
4341 		device_printf(sc->dev, "using %s irq %ld\n",
4342 			      sc->legacy_irq ? "INTx" : "MSI",
4343 			      rman_get_start(sc->irq_res));
4344 	err = bus_setup_intr(sc->dev, sc->irq_res,
4345 			     INTR_TYPE_NET | INTR_MPSAFE,
4346 #if __FreeBSD_version > 700030
4347 			     NULL,
4348 #endif
4349 			     mxge_intr, &sc->ss[0], &sc->ih);
4350 	if (err != 0) {
4351 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4352 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4353 		if (!sc->legacy_irq)
4354 			pci_release_msi(sc->dev);
4355 	}
4356 	return err;
4357 }
4358 
4359 static void
4360 mxge_rem_msix_irqs(mxge_softc_t *sc)
4361 {
4362 	int i, rid;
4363 
4364 	for (i = 0; i < sc->num_slices; i++) {
4365 		if (sc->msix_ih[i] != NULL) {
4366 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4367 					  sc->msix_ih[i]);
4368 			sc->msix_ih[i] = NULL;
4369 		}
4370 	}
4371 	free(sc->msix_ih, M_DEVBUF);
4372 
4373 	for (i = 0; i < sc->num_slices; i++) {
4374 		rid = i + 1;
4375 		if (sc->msix_irq_res[i] != NULL)
4376 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4377 					     sc->msix_irq_res[i]);
4378 		sc->msix_irq_res[i] = NULL;
4379 	}
4380 	free(sc->msix_irq_res, M_DEVBUF);
4381 
4382 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4383 			     sc->msix_table_res);
4384 
4385 	pci_release_msi(sc->dev);
4386 	return;
4387 }
4388 
4389 static void
4390 mxge_rem_single_irq(mxge_softc_t *sc)
4391 {
4392 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4393 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4394 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4395 	if (!sc->legacy_irq)
4396 		pci_release_msi(sc->dev);
4397 }
4398 
4399 static void
4400 mxge_rem_irq(mxge_softc_t *sc)
4401 {
4402 	if (sc->num_slices > 1)
4403 		mxge_rem_msix_irqs(sc);
4404 	else
4405 		mxge_rem_single_irq(sc);
4406 }
4407 
4408 static int
4409 mxge_add_irq(mxge_softc_t *sc)
4410 {
4411 	int err;
4412 
4413 	if (sc->num_slices > 1)
4414 		err = mxge_add_msix_irqs(sc);
4415 	else
4416 		err = mxge_add_single_irq(sc);
4417 
4418 	if (0 && err == 0 && sc->num_slices > 1) {
4419 		mxge_rem_msix_irqs(sc);
4420 		err = mxge_add_msix_irqs(sc);
4421 	}
4422 	return err;
4423 }
4424 
4425 
4426 static int
4427 mxge_attach(device_t dev)
4428 {
4429 	mxge_softc_t *sc = device_get_softc(dev);
4430 	struct ifnet *ifp;
4431 	int err, rid;
4432 
4433 	sc->dev = dev;
4434 	mxge_fetch_tunables(sc);
4435 
4436 	err = bus_dma_tag_create(NULL,			/* parent */
4437 				 1,			/* alignment */
4438 				 0,			/* boundary */
4439 				 BUS_SPACE_MAXADDR,	/* low */
4440 				 BUS_SPACE_MAXADDR,	/* high */
4441 				 NULL, NULL,		/* filter */
4442 				 65536 + 256,		/* maxsize */
4443 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4444 				 65536,			/* maxsegsize */
4445 				 0,			/* flags */
4446 				 NULL, NULL,		/* lock */
4447 				 &sc->parent_dmat);	/* tag */
4448 
4449 	if (err != 0) {
4450 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4451 			      err);
4452 		goto abort_with_nothing;
4453 	}
4454 
4455 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4456 	if (ifp == NULL) {
4457 		device_printf(dev, "can not if_alloc()\n");
4458 		err = ENOSPC;
4459 		goto abort_with_parent_dmat;
4460 	}
4461 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4462 
4463 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4464 		 device_get_nameunit(dev));
4465 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4466 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4467 		 "%s:drv", device_get_nameunit(dev));
4468 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4469 		 MTX_NETWORK_LOCK, MTX_DEF);
4470 
4471 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4472 
4473 	mxge_setup_cfg_space(sc);
4474 
4475 	/* Map the board into the kernel */
4476 	rid = PCIR_BARS;
4477 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4478 					 ~0, 1, RF_ACTIVE);
4479 	if (sc->mem_res == NULL) {
4480 		device_printf(dev, "could not map memory\n");
4481 		err = ENXIO;
4482 		goto abort_with_lock;
4483 	}
4484 	sc->sram = rman_get_virtual(sc->mem_res);
4485 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4486 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4487 		device_printf(dev, "impossible memory region size %ld\n",
4488 			      rman_get_size(sc->mem_res));
4489 		err = ENXIO;
4490 		goto abort_with_mem_res;
4491 	}
4492 
4493 	/* make NULL terminated copy of the EEPROM strings section of
4494 	   lanai SRAM */
4495 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4496 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4497 				rman_get_bushandle(sc->mem_res),
4498 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4499 				sc->eeprom_strings,
4500 				MXGE_EEPROM_STRINGS_SIZE - 2);
4501 	err = mxge_parse_strings(sc);
4502 	if (err != 0)
4503 		goto abort_with_mem_res;
4504 
4505 	/* Enable write combining for efficient use of PCIe bus */
4506 	mxge_enable_wc(sc);
4507 
4508 	/* Allocate the out of band dma memory */
4509 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4510 			     sizeof (mxge_cmd_t), 64);
4511 	if (err != 0)
4512 		goto abort_with_mem_res;
4513 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4514 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4515 	if (err != 0)
4516 		goto abort_with_cmd_dma;
4517 
4518 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4519 	if (err != 0)
4520 		goto abort_with_zeropad_dma;
4521 
4522 	/* select & load the firmware */
4523 	err = mxge_select_firmware(sc);
4524 	if (err != 0)
4525 		goto abort_with_dmabench;
4526 	sc->intr_coal_delay = mxge_intr_coal_delay;
4527 
4528 	mxge_slice_probe(sc);
4529 	err = mxge_alloc_slices(sc);
4530 	if (err != 0)
4531 		goto abort_with_dmabench;
4532 
4533 	err = mxge_reset(sc, 0);
4534 	if (err != 0)
4535 		goto abort_with_slices;
4536 
4537 	err = mxge_alloc_rings(sc);
4538 	if (err != 0) {
4539 		device_printf(sc->dev, "failed to allocate rings\n");
4540 		goto abort_with_dmabench;
4541 	}
4542 
4543 	err = mxge_add_irq(sc);
4544 	if (err != 0) {
4545 		device_printf(sc->dev, "failed to add irq\n");
4546 		goto abort_with_rings;
4547 	}
4548 
4549 	ifp->if_baudrate = IF_Gbps(10UL);
4550 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4551 		IFCAP_VLAN_MTU | IFCAP_LRO;
4552 
4553 #ifdef MXGE_NEW_VLAN_API
4554 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4555 #endif
4556 
4557 	sc->max_mtu = mxge_max_mtu(sc);
4558 	if (sc->max_mtu >= 9000)
4559 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4560 	else
4561 		device_printf(dev, "MTU limited to %d.  Install "
4562 			      "latest firmware for 9000 byte jumbo support\n",
4563 			      sc->max_mtu - ETHER_HDR_LEN);
4564 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4565 	ifp->if_capenable = ifp->if_capabilities;
4566 	if (sc->lro_cnt == 0)
4567 		ifp->if_capenable &= ~IFCAP_LRO;
4568 	sc->csum_flag = 1;
4569         ifp->if_init = mxge_init;
4570         ifp->if_softc = sc;
4571         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4572         ifp->if_ioctl = mxge_ioctl;
4573         ifp->if_start = mxge_start;
4574 	/* Initialise the ifmedia structure */
4575 	ifmedia_init(&sc->media, 0, mxge_media_change,
4576 		     mxge_media_status);
4577 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4578 	mxge_media_probe(sc);
4579 	ether_ifattach(ifp, sc->mac_addr);
4580 	/* ether_ifattach sets mtu to 1500 */
4581 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
4582 		ifp->if_mtu = 9000;
4583 
4584 	mxge_add_sysctls(sc);
4585 #ifdef IFNET_BUF_RING
4586 	ifp->if_transmit = mxge_transmit;
4587 	ifp->if_qflush = mxge_qflush;
4588 #endif
4589 	return 0;
4590 
4591 abort_with_rings:
4592 	mxge_free_rings(sc);
4593 abort_with_slices:
4594 	mxge_free_slices(sc);
4595 abort_with_dmabench:
4596 	mxge_dma_free(&sc->dmabench_dma);
4597 abort_with_zeropad_dma:
4598 	mxge_dma_free(&sc->zeropad_dma);
4599 abort_with_cmd_dma:
4600 	mxge_dma_free(&sc->cmd_dma);
4601 abort_with_mem_res:
4602 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4603 abort_with_lock:
4604 	pci_disable_busmaster(dev);
4605 	mtx_destroy(&sc->cmd_mtx);
4606 	mtx_destroy(&sc->driver_mtx);
4607 	if_free(ifp);
4608 abort_with_parent_dmat:
4609 	bus_dma_tag_destroy(sc->parent_dmat);
4610 
4611 abort_with_nothing:
4612 	return err;
4613 }
4614 
/*
 * Device detach: refuse if VLAN interfaces are still attached, then
 * tear down everything mxge_attach() built, in reverse order.
 * Returns 0 on success or EBUSY if vlans are still active.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	if (mxge_vlans_active(sc)) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	/* Stop the interface under the driver lock if it is running. */
	mtx_lock(&sc->driver_mtx);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	/* co_hdl uses driver_mtx; drain only after the lock is dropped. */
	callout_drain(&sc->co_hdl);
	ifmedia_removeall(&sc->media);
	/* Tell the firmware to stop its dummy RDMA activity. */
	mxge_dummy_rdma(sc, 0);
	mxge_rem_sysctls(sc);
	/* Interrupts must be gone before the rings/slices they reference. */
	mxge_rem_irq(sc);
	mxge_free_rings(sc);
	mxge_free_slices(sc);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
4648 
4649 static int
4650 mxge_shutdown(device_t dev)
4651 {
4652 	return 0;
4653 }
4654 
4655 /*
4656   This file uses Myri10GE driver indentation.
4657 
4658   Local Variables:
4659   c-file-style:"linux"
4660   tab-width:8
4661   End:
4662 */
4663